In [4]:
import pandas as pd
import numpy as np
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures

In [5]:
# Read the three CSV inputs from the working directory.
# NOTE(review): `df` is not referenced by any later cell visible here — confirm it is still needed.
df = pd.read_csv(filepath_or_buffer="Data Sheet 4.csv")
train_set = pd.read_csv(filepath_or_buffer="Data Sheet 2.csv")
test_set = pd.read_csv(filepath_or_buffer="Data Sheet 3.csv")

In [6]:
# Stack both sheets row-wise into one frame with a fresh 0..n-1 index,
# so the train/test boundary can be redrawn by year in the next cell.
df1 = pd.concat(objs=[train_set, test_set], axis=0, ignore_index=True)

In [7]:
# Time-based split: rows before 2016 are used for training, 2016 onwards for testing.
# Both masks are spelled out explicitly so rows with a missing year fall in neither split.
train_set = df1.loc[lambda frame: frame['year'] < 2016]
test_set = df1.loc[lambda frame: frame['year'] >= 2016]

In [8]:
# Apply the same column renaming to both splits: personal -> labour, corporate -> capital.
train_set = train_set.rename(columns=dict(personal='labour', corporate='capital'))
test_set = test_set.rename(columns=dict(personal='labour', corporate='capital'))


In [9]:
# Remove the columns that must not be used as model inputs from both splits.
# NOTE(review): 'Unnamed: 0' is presumably an index column saved into the CSV — confirm.
train_set = train_set.drop(['year', 'country', 'Unnamed: 0'], axis='columns')
test_set = test_set.drop(['year', 'country', 'Unnamed: 0'], axis='columns')

#### Polynomial Feature Expansion

In [10]:
# Columns kept out of the polynomial expansion; 'gini' is later used as the regression target.
exclude_columns = ['gini']

In [11]:
# Separate each split into the columns to be expanded (*_poly) and the
# columns passed through unchanged (*_untouched, i.e. `exclude_columns`).
train_set_poly = train_set.drop(exclude_columns, axis='columns')
test_set_poly = test_set.drop(exclude_columns, axis='columns')
train_set_untouched = train_set.loc[:, exclude_columns]
test_set_untouched = test_set.loc[:, exclude_columns]

In [12]:
# Degree-2 expansion (squares and pairwise products) without the constant bias column.
poly = PolynomialFeatures(degree=2, include_bias=False)
# Fit on the training split only ...
train_set_poly_transformed = poly.fit_transform(train_set_poly)
# ... and apply the already-fitted transformer to the test split. Re-fitting on
# test data (fit_transform) is the pattern that leaks test information for
# stateful transformers and lets the test layout silently diverge from training.
test_set_poly_transformed = poly.transform(test_set_poly)

In [13]:
# PolynomialFeatures returns plain arrays; wrap them back into DataFrames with
# the generated feature names as column labels (and a fresh 0..n-1 index).
train_set_poly_transformed = pd.DataFrame(
    data=train_set_poly_transformed,
    columns=poly.get_feature_names_out(input_features=train_set_poly.columns),
)
test_set_poly_transformed = pd.DataFrame(
    data=test_set_poly_transformed,
    columns=poly.get_feature_names_out(input_features=test_set_poly.columns),
)

In [14]:
# Re-attach the untouched columns next to the expanded features. The untouched
# frames still carry df1's row labels, so their index is reset to line up
# positionally with the freshly built polynomial frames.
train_set_final = pd.concat(
    objs=[train_set_poly_transformed, train_set_untouched.reset_index(drop=True)],
    axis='columns',
)
test_set_final = pd.concat(
    objs=[test_set_poly_transformed, test_set_untouched.reset_index(drop=True)],
    axis='columns',
)

#### TPOT

In [15]:
X_train = train_set.drop(columns = ['gini'])
y_train = train_set['gini']

In [16]:
X_test = test_set.drop(columns = ['gini'])
y_test = test_set['gini']

In [None]:
# TPOT's pipeline search is stochastic; pin random_state so a
# Restart & Run All yields the same pipeline and the same scores.
tpot = TPOTRegressor(verbosity = 2, config_dict = 'TPOT light', random_state = 42)
# The value returned by fit() is kept as `predictor` for the prediction cell below.
predictor = tpot.fit(X_train, y_train)

In [18]:
# Predict the target for the test-split features with the fitted TPOT estimator.
y_pred = predictor.predict(X_test)

In [None]:
# Root mean squared error of the predictions against the test-split targets.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

print(f"Root Mean Squared Error (RMSE): {rmse}")

In [None]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)