In [2]:
import pandas as pd 
from pathlib import Path
from plotly import express as px
# Folder that holds the raw CSV inputs, relative to the working directory.
data_dir = Path("data")

gdp_file = data_dir / "gdp_per_capita.csv"
oecd_file = data_dir / "oecd_bli_2015.csv"

# The GDP file is tab-separated, latin1-encoded, writes numbers with a comma
# thousands separator, and marks missing values as "n/a".
gdp_df = pd.read_csv(
    gdp_file,
    delimiter="\t",
    thousands=",",
    encoding="latin1",
    na_values="n/a",
)
# The OECD file uses the default comma delimiter; only the thousands separator is set.
oecd_df = pd.read_csv(oecd_file, thousands=",")

In [None]:
import numpy as np 
from sklearn.linear_model import LinearRegression


def prepare_country_stats(oecd_df, gdp_df):
    """Join OECD indicator data with GDP per capita, one row per country.

    Parameters
    ----------
    oecd_df : pandas.DataFrame
        Long-format table with the columns "INEQUALITY", "Country",
        "Indicator" and "Value". Only rows whose "INEQUALITY" is "TOT" are
        used; they are pivoted so that every indicator becomes a column.
    gdp_df : pandas.DataFrame
        Table with the columns "Country" and "2015". The "2015" column is
        exposed in the result as "GDP per capita".

    Returns
    -------
    pandas.DataFrame
        Indexed by country and sorted by ascending "GDP per capita", with the
        two columns "GDP per capita" and "Life satisfaction". The rows at
        positions 0, 1, 6, 8, 33, 34 and 35 of the sorted table are left out.

    Notes
    -----
    The positional selection is hard-wired for a merged table of 36 rows:
    fewer rows make ``.iloc`` fail with an out-of-bounds error, and any rows
    past position 35 are not returned.
    """
    # Positions (in the GDP-sorted table) that are excluded from the result.
    # NOTE(review): the reason these rows are held out is not visible here.
    remove_indices = {0, 1, 6, 8, 33, 34, 35}
    expected_rows = 36

    oecd_totals = oecd_df[oecd_df["INEQUALITY"] == "TOT"]
    oecd_countries = oecd_totals.pivot(index="Country", columns="Indicator", values="Value")

    gdp_per_capita = (
        gdp_df[["Country", "2015"]]
        .rename(columns={"2015": "GDP per capita"})
        .set_index("Country")
    )

    # Index-on-index merge (inner join by default): keeps countries present in both tables.
    full_country_stats = pd.merge(
        left=oecd_countries,
        right=gdp_per_capita,
        left_index=True,
        right_index=True,
    ).sort_values(by="GDP per capita")

    # Build the keep-list in explicit ascending order so the result stays
    # sorted by GDP without relying on the iteration order of a set.
    keep_indices = [pos for pos in range(expected_rows) if pos not in remove_indices]
    return full_country_stats[["GDP per capita", "Life satisfaction"]].iloc[keep_indices]

# Assemble the single-column feature matrix and the target column as 2-D arrays.
country_stats = prepare_country_stats(oecd_df, gdp_df)
X = country_stats[["GDP per capita"]].to_numpy()
y = country_stats[["Life satisfaction"]].to_numpy()

# Fit a linear model: life satisfaction as a function of GDP per capita.
model = LinearRegression().fit(X, y)

# Predict life satisfaction for a GDP per capita of 22,587.
X_new = [[22587]]
print(model.predict(X_new))

[[5.96242338]]


In [4]:
# Show the fitted line's parameters: the intercept first, then the coefficient.
for fitted_param in (model.intercept_, model.coef_):
    print(fitted_param)

[4.8530528]
[[4.91154459e-05]]


In [8]:
from sklearn.neighbors import KNeighborsRegressor
# Same features and target, this time with a k-nearest-neighbors regressor (k=3).
knn_model = KNeighborsRegressor(n_neighbors=3).fit(X, y)

# Predict life satisfaction for a GDP per capita of 20,732.
X_new = [[20732]]
print(knn_model.predict(X_new))

[[5.2]]


In [2]:
import pandas as pd
# Load the saved comparison table, then display it ranked by test R2 (best first).
model_comparison = pd.read_json("model_comparison_results.json")
model_comparison.sort_values("Test R2", ascending=False)

Unnamed: 0,Pipeline,Model,Train R2,Test R2,Test R2 Std,Test RMSE,Test MAE,Best Params
0,pipe2_no_grouper,RandomForest,0.846498,0.762681,0.077233,0.352976,0.177295,"{'model__max_depth': 15, 'model__min_samples_l..."
1,pipe5_ratio_scaled,RandomForest,0.847582,0.760889,0.072014,0.354819,0.175598,"{'model__max_depth': 20, 'model__min_samples_l..."
2,pipe2_optimized,RandomForest,0.84435,0.756838,0.072766,0.357777,0.176294,"{'model__max_depth': 15, 'model__min_samples_l..."
3,pipe1_optimized,RandomForest,0.841507,0.752839,0.080601,0.360155,0.181218,"{'model__max_depth': 15, 'model__min_samples_l..."
4,pipe7_rare_moderate,RandomForest,0.841443,0.752481,0.084134,0.36016,0.181462,"{'model__max_depth': 15, 'model__min_samples_l..."
5,pipe4_target_enc,RandomForest,0.839194,0.752423,0.067725,0.361477,0.178866,"{'model__max_depth': 15, 'model__min_samples_l..."
6,pipe3_with_grouper,RandomForest,0.828412,0.751423,0.071241,0.361938,0.181519,"{'model__max_depth': 20, 'model__min_samples_l..."
7,pipe1_optimized_scaled,RandomForest,0.842851,0.75105,0.072324,0.362202,0.182591,"{'model__max_depth': 15, 'model__min_samples_l..."
8,catboost_pipe_nogroup,CatBoost,0.858797,0.750937,0.072286,0.362168,0.197576,"{'model__depth': 8, 'model__iterations': 500, ..."
9,pipe6_poly,RandomForest,0.856238,0.747454,0.077395,0.364307,0.184053,"{'model__max_depth': 15, 'model__min_samples_l..."


In [5]:
# Bar chart of test R2: one bar per row of the comparison table, coloured by pipeline.
# NOTE(review): x is the table's row index rather than the "Model" column, so the
# "x" -> "Model" label below may not describe what is plotted; confirm intent.
fig = px.bar(
    model_comparison,
    x=model_comparison.index,
    y="Test R2",
    color="Pipeline",
    barmode="group",
    title="Model Comparison by Pipeline",
    labels={"x": "Model", "Test R2": "Test R2 Score"},
)
fig.show()