In [1]:
import os

import pandas as pd
import evalml
import woodwork as ww
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from featuretools.selection import remove_low_information_features, remove_highly_null_features, remove_single_value_features, remove_highly_correlated_features

In [14]:
# Load the merged dataset and discard the 'Unnamed: 0' column that the CSV
# carries, then preview the first rows.
df = pd.read_csv('merged.csv').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,country,isonum,isocode,year,rank_educ,educ,qslnl,secvocp,teryrs,edqualsc,...,envi,fwateruse,airpollution,wasterwater,fishstock,marineprotect,improved_drinkwater,pestreg,terrestprotect,prosperity_score
0,Afghanistan,4,AFG,2007,143,22.4944,-7.178585,0.714436,0.207426,337.589355,...,39.293343,43.011665,0.14,0.0,35.323078,13.537368,42.6,0,0.366882,37.658975
1,Angola,24,AGO,2007,125,35.916668,-7.178585,39.072861,0.099019,280.802673,...,47.982162,0.476892,0.025,0.0,4.5,0.066623,46.4,3,12.399823,43.88078
2,Albania,8,ALB,2007,72,56.318413,-7.178585,7.345947,0.127648,425.420013,...,51.049442,4.873606,0.275,5.136225,37.099178,1.624084,95.9,5,9.8044,54.893284
3,United Arab Emirates,784,ARE,2007,52,59.942955,-7.178585,0.494192,0.488107,480.829529,...,62.051479,100.0,0.1975,61.3089,30.838837,2.571971,99.6,23,5.616998,61.111192
4,Argentina,32,ARG,2007,92,51.995552,-7.178585,6.997118,0.265608,421.549591,...,64.909729,12.938356,0.005,18.0625,12.28,1.103726,97.7,23,5.466403,58.557573


In [15]:
# Short codes used as column names in the merged frame; each one also has a
# matching 'rank_<code>' column.
metrics = ['educ', 'soci', 'heal', 'pers', 'busi', 'econ', 'safe', 'gove', 'envi']
ranks = [f'rank_{code}' for code in metrics]

# Columns excluded from the feature matrix: the metric and rank columns,
# the identifier columns and the target itself.
drop = [*metrics, *ranks, 'country', 'isonum', 'isocode', 'year', 'prosperity_score']

In [16]:
# Target column for the regression.
y = df['prosperity_score']

# Build the feature matrix without rebinding `df`: the loaded frame stays
# intact, so this cell can be re-run (previously a second run raised KeyError
# because 'prosperity_score' and the other dropped columns were already gone).
# The featuretools selection filters are applied in the same order as before.
X = (
    df.drop(columns=drop)
    .pipe(remove_low_information_features)
    .pipe(remove_highly_null_features)
    .pipe(remove_single_value_features)
    .pipe(remove_highly_correlated_features)
)

In [17]:
# Hold out 25% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Search configuration. With objective 'auto' the search log below reports
# "Optimizing for R2".
problem_type = 'regression'
objective = 'auto'

# No batch limit is passed, so the run reports the default max_batches=1.
automl = evalml.automl.AutoMLSearch(
    X_train,
    y_train,
    problem_type=problem_type,
    objective=objective,
)

Using default limit of max_batches=1.

Generating pipelines to search over...


In [18]:
# Run the pipeline search configured above; progress and the cross-validation
# score of each candidate pipeline are logged as the search proceeds.
automl.search()

*****************************
* Beginning pipeline search *
*****************************

Optimizing for R2. 
Greater score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: catboost, xgboost, extra_trees, lightgbm, decision_tree, linear_model, random_forest



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mean Baseline Regression Pipeline        Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean R2: -0.006
High coefficient of variation (cv >= 0.2) within cross validation scores. Mean Baseline Regression Pipeline may not perform as estimated on unseen data.
Batch 1: (2/9) Decision Tree Regressor w/ Imputer       Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean R2: 0.961
Batch 1: (3/9) Extra Trees Regressor w/ Imputer         Elapsed:00:02
	Starting cross validation
	Finished cross validation - mean R2: 0.985
Batch 1: (4/9) XGBoost Regressor w/ Imputer             Elapsed:00:04
	Starting cross validation
	Finished cross validation - mean R2: 0.989
Batch 1: (5/9) CatBoost Regressor w/ Imputer            Elapsed:00:07
	Starting cross validation
	Finished cross validation - mean R2: 0.357
Batch 1: (6/9) Random Forest Regressor w/ Imputer       Elapsed:00:09
	Starting cross validation
	Finished cross validation - mean R2: 0.981


In [19]:
# Display the table of evaluated pipelines with their scores.
automl.rankings

Unnamed: 0,id,pipeline_name,score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,3,XGBoost Regressor w/ Imputer,0.989309,0.989416,17381.109248,False,{'Imputer': {'categorical_impute_strategy': 'm...
1,7,Linear Regressor w/ Imputer + Standard Scaler,0.988505,0.989251,17367.073382,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,2,Extra Trees Regressor w/ Imputer,0.984621,0.985117,17299.224899,False,{'Imputer': {'categorical_impute_strategy': 'm...
3,5,Random Forest Regressor w/ Imputer,0.981077,0.978632,17237.319711,False,{'Imputer': {'categorical_impute_strategy': 'm...
4,8,Elastic Net Regressor w/ Imputer + Standard Sc...,0.978784,0.979574,17197.263879,False,{'Imputer': {'categorical_impute_strategy': 'm...
5,6,LightGBM Regressor w/ Imputer,0.966005,0.964431,16974.041069,False,{'Imputer': {'categorical_impute_strategy': 'm...
6,1,Decision Tree Regressor w/ Imputer,0.961312,0.953418,16892.06033,False,{'Imputer': {'categorical_impute_strategy': 'm...
7,4,CatBoost Regressor w/ Imputer,0.357044,0.345445,6336.78705,False,{'Imputer': {'categorical_impute_strategy': 'm...
8,0,Mean Baseline Regression Pipeline,-0.005725,-0.010274,0.0,True,{'Baseline Regressor': {'strategy': 'mean'}}


In [21]:
# Keep a handle on the pipeline the search selected as best; it is reused
# further down for prediction and for saving.
best_pipeline = automl.best_pipeline
# Feature-importance table of that pipeline; later cells read its 'feature'
# and 'importance' columns.
imp = best_pipeline.feature_importance

In [28]:
# Index the columns of every training CSV under Data/ so that a feature can be
# traced back to the file it came from.
# Sorting makes the file order (and therefore first-match lookups in `col`)
# deterministic; os.listdir returns entries in arbitrary order.
datasets = sorted(os.listdir('Data'))
train_csvs = [file_name for file_name in datasets if "train" in file_name]

# A dedicated loop name is used for the file name so that the `df` DataFrame
# is not overwritten. nrows=0 parses only the header row, which is all that is
# needed to obtain the column labels.
col = {
    file_name: pd.read_csv(
        os.path.join('Data', file_name), index_col=0, nrows=0
    ).columns
    for file_name in train_csvs
}

In [32]:
def find_csv(col_name, columns_by_csv=None):
    """Return the name of the first CSV whose columns contain ``col_name``.

    Parameters
    ----------
    col_name : str
        Column (feature) name to look up.
    columns_by_csv : mapping, optional
        Maps a CSV file name to a collection of its column names. When
        omitted, the notebook-level ``col`` mapping is used, which keeps
        existing ``find_csv(name)`` calls working unchanged.

    Returns
    -------
    The key of the first entry (in mapping order) whose columns contain
    ``col_name``, or ``None`` when no entry does.
    """
    lookup = col if columns_by_csv is None else columns_by_csv
    for csv_name, csv_columns in lookup.items():
        if col_name in csv_columns:
            return csv_name
    # Explicit "not found" result; callers receive None for unknown columns.
    return None

# Tag every feature with the training CSV that contains it.
imp['dataset'] = imp['feature'].apply(find_csv)

In [43]:
# Total feature importance contributed by each source dataset, largest first.
(
    imp.groupby('dataset')['importance']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)

Unnamed: 0_level_0,importance
dataset,Unnamed: 1_level_1
gove_train.csv,0.518028
pers_train.csv,0.156088
econ_train.csv,0.129436
educ_train.csv,0.081465
heal_train.csv,0.065779
safe_train.csv,0.024753
busi_train.csv,0.01517
envi_train.csv,0.007281
soci_train.csv,0.002


In [22]:
# Predict on the held-out rows and convert the result with to_series() so it
# can be passed to the scikit-learn metric functions.
predictions = best_pipeline.predict(X_test).to_series()

In [23]:
# Hold-out metrics for the best pipeline.
# The `squared` keyword of mean_squared_error is deprecated in recent
# scikit-learn releases, so MSE is computed once with the default behaviour
# and RMSE is derived from it (equivalent for a single target column).
mse = mean_squared_error(y_test, predictions)
rmse = mse ** 0.5

print(f'R2: {r2_score(y_test, predictions)}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mean_absolute_error(y_test, predictions)}')

R2: 0.990026267099184
MSE: 1.0164444676989646
RMSE: 1.0081887063932846
MAE: 0.7641922817756486


In [44]:
# Persist the selected pipeline to disk.
# NOTE(review): the file name says "propensity" while the modelled target is
# 'prosperity_score' — possibly a typo; confirm before renaming, since anything
# that loads the file depends on this exact name.
best_pipeline.save('propensity_pipeline')