In [18]:
import pandas as pd
import numpy as np
import h2o
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import shap

In [None]:
# Initialise the H2O session; the H2OFrame and H2OAutoML calls later in this notebook need it.
h2o.init()

In [20]:
from h2o.automl import H2OAutoML
from h2o.frame import H2OFrame

In [21]:
# Read the three CSV sheets: sheet 4 goes into `df`, sheets 2 and 3 are the
# train and test splits respectively.
df = pd.read_csv("Data Sheet 4.csv")
train_set, test_set = (
    pd.read_csv(csv_path) for csv_path in ("Data Sheet 2.csv", "Data Sheet 3.csv")
)

In [None]:
# (rows, columns) of the train split, then of the test split, one per line.
print(train_set.shape, test_set.shape, sep="\n")

In [23]:
# Remove the 'year', 'country' and 'Unnamed: 0' columns from both splits.
train_set, test_set = (
    frame.drop(columns=['year', 'country', 'Unnamed: 0'])
    for frame in (train_set, test_set)
)

#### Polynomial feature expansion

In [24]:
# Columns kept out of the polynomial expansion and re-attached unchanged afterwards.
# 'gini' is the column later used as the AutoML response.
exclude_columns = ['gini']

In [25]:
# Split each split into the part that gets expanded (everything except
# `exclude_columns`) and the part that is carried through untouched.
train_set_poly, test_set_poly = (
    frame.drop(columns=exclude_columns) for frame in (train_set, test_set)
)
train_set_untouched, test_set_untouched = (
    train_set[exclude_columns],
    test_set[exclude_columns],
)

In [26]:
# Degree-2 polynomial expansion of the predictors, without the constant (bias) column.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Fit the transformer on the training predictors only, then apply that same
# fitted transformer to the test predictors. Re-fitting on the test split
# (the previous `fit_transform` call) overwrote the fitted state and skipped
# the check that the test columns match what the transformer was fitted on.
train_set_poly_transformed = poly.fit_transform(train_set_poly)
test_set_poly_transformed = poly.transform(test_set_poly)

In [27]:
# Wrap the expanded arrays back into DataFrames, labelling each column with
# the feature name generated by the polynomial transformer.
train_set_poly_transformed = pd.DataFrame(
    data=train_set_poly_transformed,
    columns=poly.get_feature_names_out(train_set_poly.columns),
)
test_set_poly_transformed = pd.DataFrame(
    data=test_set_poly_transformed,
    columns=poly.get_feature_names_out(test_set_poly.columns),
)

In [28]:
# Re-attach the untouched columns next to the expanded features. The untouched
# part gets a fresh 0..n-1 index so rows line up positionally with the newly
# built feature frames.
train_set_final = pd.concat(
    objs=[train_set_poly_transformed, train_set_untouched.reset_index(drop=True)],
    axis="columns",
)
test_set_final = pd.concat(
    objs=[test_set_poly_transformed, test_set_untouched.reset_index(drop=True)],
    axis="columns",
)

In [29]:
# Replace every space in the column names with '*' (e.g. interaction terms
# "a b" become "a*b") on both final frames.
for final_frame in (train_set_final, test_set_final):
    final_frame.columns = [name.replace(' ', '*') for name in final_frame.columns]

#### H2O

In [None]:
# Convert the final pandas training frame into an H2OFrame for AutoML.
train_h2o = H2OFrame(train_set_final)

In [None]:
# Convert the final pandas test frame into an H2OFrame for prediction and scoring.
test_h2o = H2OFrame(test_set_final)

In [32]:
# Response column name, and the predictor list: every training column except the response.
y = "gini"
x = train_h2o.columns
x.remove(y)  # raises ValueError if the response column is missing

In [None]:
# Run AutoML for at most 20 models or 12000 seconds, whichever limit is hit first.
# A fixed seed makes the search repeatable across kernel restarts.
# NOTE(review): H2O documents that the seed only guarantees reproducibility
# under some conditions (e.g. the run must be bounded by max_models rather
# than by the time budget) — confirm against the AutoML docs if exact
# repeatability is required.
aml = H2OAutoML(max_models=20, max_runtime_secs=12000, seed=42)
aml.train(x=x, y=y, training_frame=train_h2o)

In [None]:
# Show the first 15 rows of the AutoML leaderboard.
lb = aml.leaderboard
lb.head(rows=15)

In [None]:
# Display the leader model of the AutoML run.
aml.leader #Best model

In [None]:
# Score the test frame with the AutoML object.
pred = aml.predict(test_h2o)
# Flat NumPy arrays for plotting: the observed response column and the first
# column of the prediction frame.
actual = test_h2o[y].as_data_frame().values.flatten()
pred2 = pred[0].as_data_frame().values.flatten()

In [None]:
# Evaluate the leader model on the held-out test frame.
perf = aml.leader.model_performance(test_data=test_h2o)

# Pull the headline regression metrics out of the performance object ...
r_squared, mse, rmse = perf.r2(), perf.mse(), perf.rmse()

# ... and report them, one per line.
print(f"R-squared: {r_squared}", f"MSE: {mse}", f"RMSE: {rmse}", sep="\n")

In [26]:
# Append the prediction frame to the test frame. The guard keeps the cell
# idempotent: without it, every re-run would bind another copy of the
# prediction column onto `test_h2o`.
if "predict" not in test_h2o.columns:
    test_h2o = test_h2o.cbind(pred)

In [27]:
# Residual = observed response minus the model prediction, stored as a new column.
test_h2o["residual"] = test_h2o[y] - test_h2o['predict']

In [None]:
# Predicted-vs-actual scatterplot with the y = x reference line
# (points on the line are perfect predictions).
plt.figure(figsize=(8, 6))

# Common axis range covering both arrays, so the diagonal spans the whole plot.
# `actual` and `pred2` are NumPy arrays, so use their own reductions rather
# than the element-by-element Python builtins.
min_val = min(actual.min(), pred2.min())
max_val = max(actual.max(), pred2.max())

plt.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', linewidth=2, label="y = x")
plt.scatter(actual, pred2, alpha=0.6)
plt.title("Predicted vs Actual Values")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
# The reference line carries a label; without a legend it is never displayed.
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Bring the residual and prediction columns back into pandas for plotting.
residuals = test_h2o["residual"].as_data_frame()
predicted = test_h2o["predict"].as_data_frame()

# Residuals against predictions, with a dashed zero line as the reference.
plt.scatter(predicted, residuals, alpha=0.5)
plt.axhline(0, color="red", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Residuals vs Predicted Values")
plt.show()

In [None]:
# Residuals against the observed "gini" values; `residuals` comes from the
# preceding residual-plot cell.
feature = test_h2o["gini"].as_data_frame()

plt.scatter(feature, residuals, alpha=0.5)
plt.axhline(0, color="red", linestyle="--")
plt.xlabel("Actual")
plt.ylabel("Residuals")
plt.title("Residuals vs Actual")
plt.show()

In [None]:
# Run H2O's explain() on the AutoML object using the test frame.
# NOTE(review): at this point test_h2o also holds the appended prediction and
# "residual" columns — confirm explain() only uses the model's own predictors.
exa = aml.explain(test_h2o)

In [None]:
# Check the residual distribution on the test set.
# The previous version referenced `test_data` and `sns`, neither of which is
# defined or imported anywhere in this notebook, so the cell failed with a
# NameError. Rebuild the residuals from the `actual` / `pred2` arrays computed
# in the prediction cell and plot with matplotlib, which is already imported.
residuals = actual - pred2
plt.hist(residuals, bins="auto")
plt.title("Residual Distribution")
plt.xlabel("Residuals")
plt.ylabel("Count")
plt.show()