In [1]:
import pandas as pd
import evalml
import woodwork as ww
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from featuretools.selection import remove_low_information_features, remove_highly_null_features, remove_single_value_features, remove_highly_correlated_features

In [14]:
# Load the merged dataset and discard the 'Unnamed: 0' column stored in the CSV.
df = pd.read_csv('merged.csv').drop(columns = ['Unnamed: 0'])
df.head()

Unnamed: 0,country,isonum,isocode,year,rank_educ,educ,qslnl,secvocp,teryrs,edqualsc,...,envi,fwateruse,airpollution,wasterwater,fishstock,marineprotect,improved_drinkwater,pestreg,terrestprotect,prosperity_score
0,Afghanistan,4,AFG,2007,143,22.4944,-7.178585,0.714436,0.207426,337.589355,...,39.293343,43.011665,0.14,0.0,35.323078,13.537368,42.6,0,0.366882,37.658975
1,Angola,24,AGO,2007,125,35.916668,-7.178585,39.072861,0.099019,280.802673,...,47.982162,0.476892,0.025,0.0,4.5,0.066623,46.4,3,12.399823,43.88078
2,Albania,8,ALB,2007,72,56.318413,-7.178585,7.345947,0.127648,425.420013,...,51.049442,4.873606,0.275,5.136225,37.099178,1.624084,95.9,5,9.8044,54.893284
3,United Arab Emirates,784,ARE,2007,52,59.942955,-7.178585,0.494192,0.488107,480.829529,...,62.051479,100.0,0.1975,61.3089,30.838837,2.571971,99.6,23,5.616998,61.111192
4,Argentina,32,ARG,2007,92,51.995552,-7.178585,6.997118,0.265608,421.549591,...,64.909729,12.938356,0.005,18.0625,12.28,1.103726,97.7,23,5.466403,58.557573


In [15]:
# Short codes of the sub-index score columns; each one also has a 'rank_<code>' column.
metrics = ['educ', 'soci', 'heal', 'pers', 'busi', 'econ', 'safe', 'gove', 'envi']
ranks = [f'rank_{code}' for code in metrics]

# Everything that must not be used as a predictor: the sub-index scores, their
# ranks, the identifier columns and the target itself.
drop = [*metrics, *ranks, 'country', 'isonum', 'isocode', 'year', 'prosperity_score']

In [16]:
# Target column, read before the frame is reduced to predictors.
y = df['prosperity_score']

# Build the feature matrix under its own name instead of overwriting `df`.
# The original rebound `df` to the reduced frame, so an immediate re-run of
# this cell failed with a KeyError on 'prosperity_score' (already dropped).
# Each selector receives the frame produced by the previous step, in the same
# order as before.
X = (
    df.drop(drop, axis = 1)
      .pipe(remove_low_information_features)
      .pipe(remove_highly_null_features)
      .pipe(remove_single_value_features)
      .pipe(remove_highly_correlated_features)
)

In [17]:
# Search configuration: regression problem, objective left to evalml's 'auto' choice.
problem_type, objective = 'regression', 'auto'

# Hold out 25% of the rows for the final evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# The search is only configured here; it is started in a later cell.
automl = evalml.automl.AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type=problem_type,
    objective=objective,
)

Using default limit of max_batches=1.

Generating pipelines to search over...


In [18]:
# Run the pipeline search configured on `automl`; progress is logged per pipeline.
automl.search()

*****************************
* Beginning pipeline search *
*****************************

Optimizing for R2. 
Greater score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: catboost, xgboost, extra_trees, lightgbm, decision_tree, linear_model, random_forest



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mean Baseline Regression Pipeline        Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean R2: -0.006
High coefficient of variation (cv >= 0.2) within cross validation scores. Mean Baseline Regression Pipeline may not perform as estimated on unseen data.
Batch 1: (2/9) Decision Tree Regressor w/ Imputer       Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean R2: 0.961
Batch 1: (3/9) Extra Trees Regressor w/ Imputer         Elapsed:00:02
	Starting cross validation
	Finished cross validation - mean R2: 0.985
Batch 1: (4/9) XGBoost Regressor w/ Imputer             Elapsed:00:04
	Starting cross validation
	Finished cross validation - mean R2: 0.989
Batch 1: (5/9) CatBoost Regressor w/ Imputer            Elapsed:00:07
	Starting cross validation
	Finished cross validation - mean R2: 0.357
Batch 1: (6/9) Random Forest Regressor w/ Imputer       Elapsed:00:09
	Starting cross validation
	Finished cross validation - mean R2: 0.981


In [19]:
# Display the table of evaluated pipelines with their cross-validation scores.
automl.rankings

Unnamed: 0,id,pipeline_name,score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,3,XGBoost Regressor w/ Imputer,0.989309,0.989416,17381.109248,False,{'Imputer': {'categorical_impute_strategy': 'm...
1,7,Linear Regressor w/ Imputer + Standard Scaler,0.988505,0.989251,17367.073382,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,2,Extra Trees Regressor w/ Imputer,0.984621,0.985117,17299.224899,False,{'Imputer': {'categorical_impute_strategy': 'm...
3,5,Random Forest Regressor w/ Imputer,0.981077,0.978632,17237.319711,False,{'Imputer': {'categorical_impute_strategy': 'm...
4,8,Elastic Net Regressor w/ Imputer + Standard Sc...,0.978784,0.979574,17197.263879,False,{'Imputer': {'categorical_impute_strategy': 'm...
5,6,LightGBM Regressor w/ Imputer,0.966005,0.964431,16974.041069,False,{'Imputer': {'categorical_impute_strategy': 'm...
6,1,Decision Tree Regressor w/ Imputer,0.961312,0.953418,16892.06033,False,{'Imputer': {'categorical_impute_strategy': 'm...
7,4,CatBoost Regressor w/ Imputer,0.357044,0.345445,6336.78705,False,{'Imputer': {'categorical_impute_strategy': 'm...
8,0,Mean Baseline Regression Pipeline,-0.005725,-0.010274,0.0,True,{'Baseline Regressor': {'strategy': 'mean'}}


In [21]:
# Keep the pipeline the search selected as best, plus its feature-importance
# table (the 'feature' and 'importance' columns are used by later cells).
best_pipeline = automl.best_pipeline
imp = best_pipeline.feature_importance

In [28]:
import os

# Map every training CSV in Data/ to the columns it contains, so a feature
# name can later be traced back to the file it came from.
datasets = os.listdir('Data')

# os.listdir returns names in arbitrary order. Sorting makes the insertion
# order of `col` deterministic, which matters for lookups that return the
# first file containing a given column.
train_csvs = sorted(data for data in datasets if "train" in data)

col = dict()
# The loop variable is deliberately not called `df`: that name is used for
# DataFrames elsewhere in this notebook and would be rebound to a filename.
for csv_name in train_csvs:
    col[csv_name] = pd.read_csv(os.path.join('Data', csv_name), index_col = 0).columns

In [32]:
def find_csv(col_name):
    """Return the first key of ``col`` whose column collection contains ``col_name``.

    Entries are checked in the dictionary's iteration order. ``None`` is
    returned when no entry contains the name.
    """
    return next(
        (csv_name for csv_name, columns in col.items() if col_name in columns),
        None,
    )

# Tag each feature in the importance table with the training CSV it belongs to.
imp['dataset'] = imp['feature'].apply(find_csv)

In [43]:
# Total feature importance per source dataset, largest first.
(
    imp.groupby('dataset')['importance']
       .sum()
       .sort_values(ascending = False)
       .to_frame()
)

Unnamed: 0_level_0,importance
dataset,Unnamed: 1_level_1
gove_train.csv,0.518028
pers_train.csv,0.156088
econ_train.csv,0.129436
educ_train.csv,0.081465
heal_train.csv,0.065779
safe_train.csv,0.024753
busi_train.csv,0.01517
envi_train.csv,0.007281
soci_train.csv,0.002


In [22]:
# Predict on the held-out split and convert the result to a pandas Series.
predictions = best_pipeline.predict(X_test).to_series()

In [23]:
# Hold-out metrics, computed in the order they are reported.
holdout_scores = (
    ('R2', r2_score(y_test, predictions)),
    ('MSE', mean_squared_error(y_test, predictions, squared = True)),
    ('RMSE', mean_squared_error(y_test, predictions, squared = False)),
    ('MAE', mean_absolute_error(y_test, predictions)),
)
for label, value in holdout_scores:
    print(label + ': ' + str(value))

R2: 0.990026267099184
MSE: 1.0164444676989646
RMSE: 1.0081887063932846
MAE: 0.7641922817756486


In [44]:
# Persist the selected pipeline to disk.
# NOTE(review): the file name is spelled 'properity' (probably meant
# 'prosperity'). Left unchanged because anything loading the file must use
# this exact name.
best_pipeline.save('properity_pipeline')

In [90]:
# Load the rows to score; the first CSV column is used as the row index.
test = pd.read_csv('test.csv', index_col = 0)

In [91]:
# Display the loaded test frame (pandas truncates the rendered rows and columns).
test

Unnamed: 0,country,isonum,isocode,year,qslnl,secvocp,teryrs,edqualsc,gbrata,secyrs,...,logis,bband,redu,hifi,affs,ippr,insdtf,creddtf,elcostlnl,stardtf
1192,Afghanistan,4,AFG,2015,-7.178585,0.961258,0.210655,337.589355,0.341660,0.998107,...,2.069573,0.004795,26.472107,3.633919,3.852955,2.457773,23.62,45.0,8.151823,93.05
1193,Angola,24,AGO,2015,-7.178585,45.228603,0.099019,280.802673,0.360500,1.820329,...,2.542980,0.412784,31.007937,2.593719,3.236488,2.257915,0.00,5.0,6.421622,76.79
1194,Albania,8,ALB,2015,-7.178585,8.005317,0.050267,425.420013,0.056140,3.402170,...,2.770000,6.573716,20.825397,3.888581,3.737438,2.896221,63.42,65.0,6.197258,90.09
1195,United Arab Emirates,784,ARE,2015,-7.178585,1.820017,0.458515,480.829529,0.034651,3.531861,...,3.539098,11.558400,4.285714,5.053419,5.424749,5.475348,43.74,45.0,3.157000,89.98
1196,Argentina,32,ARG,2015,-3.770851,13.699941,0.280885,421.549591,0.030080,2.664423,...,2.986475,15.573113,30.333333,2.583314,2.923875,2.424609,42.87,50.0,3.214868,73.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485,Vietnam,704,VNM,2016,-7.178585,19.569241,0.180243,515.666687,0.011015,2.937126,...,3.154763,6.483695,24.555556,3.913509,3.605158,3.050503,35.83,70.0,7.187355,81.25
1486,Yemen,887,YEM,2016,-7.178585,0.659351,0.087605,250.055099,0.210390,1.202865,...,2.183950,1.361715,27.396825,3.848516,2.400492,2.314462,28.08,0.0,8.184262,74.22
1487,South Africa,710,ZAF,2016,-4.003088,7.252446,0.097543,310.112213,0.079000,3.382269,...,3.431244,3.211005,9.333333,2.079679,5.310756,5.300020,64.29,60.0,6.508024,81.18
1488,Zambia,894,ZMB,2016,-7.178585,2.697613,0.029222,303.833710,0.065826,1.531926,...,2.462674,0.141622,50.555556,4.544003,3.856999,4.024301,38.96,75.0,6.467388,86.69


In [92]:
# Identifier columns: excluded from the model input and reused later when the
# predictions are exported.
drop = ['country', 'isonum', 'isocode', 'year']
df = test.copy()
df = df.drop(drop, axis = 1)

# Do not re-run the remove_* feature selectors on the test frame. They choose
# columns from the data they are given, so on different data they may keep a
# different set of columns than the one the pipeline was trained on.
# Instead, select exactly the columns of X_train, in the same order.
missing = [name for name in X_train.columns if name not in df.columns]
if missing:
    # Fail loudly rather than predicting from an incomplete feature set.
    raise ValueError(
        'test.csv is missing features the pipeline was trained on: {}'.format(missing)
    )

X = df[list(X_train.columns)]

In [93]:
# Score the test rows. The values are assigned as a plain list, i.e. by
# position rather than by index label.
test['predicted_prosperity_score'] = best_pipeline.predict(X).to_series().tolist()

In [94]:
import numpy as np  # not referenced in this cell

# Copy the row labels into a 'rank' column, order the rows from highest to
# lowest predicted score, and move the old index into an 'index' column.
test = (
    test.assign(rank = test.index)
        .sort_values('predicted_prosperity_score', ascending = False)
        .reset_index()
)

# After reset_index the index is the 0-based position in the sorted order.
test['ranked'] = test.index

In [97]:
# Put the original row labels back as the index (consuming the 'index'
# column) and discard the helper 'rank' column.
test = test.set_index('index').drop(columns = ['rank'])

# 'ranked' was taken from the 0-based positional index; shift it to start at 1.
test['ranked'] = test['ranked'] + 1
test

Unnamed: 0_level_0,country,isonum,isocode,year,qslnl,secvocp,teryrs,edqualsc,gbrata,secyrs,...,redu,hifi,affs,ippr,insdtf,creddtf,elcostlnl,stardtf,predicted_prosperity_score,ranked
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1420,Libya,434,LBY,2016,-7.178585,6.839103,0.705341,405.708649,0.131403,2.762275,...,10.333333,3.590806,2.025786,1.777236,0.00,0.0,5.861640,72.58,64.421074,1
1271,Libya,434,LBY,2015,-7.178585,6.839103,0.705341,405.708649,0.060080,2.762275,...,10.333333,3.590806,2.025786,1.777236,0.00,0.0,5.861640,72.58,64.421074,2
1207,Bahrain,48,BHR,2015,-7.178585,7.871953,0.321131,469.256012,0.028818,2.685482,...,4.285714,4.343962,5.504734,4.650791,44.28,40.0,3.837299,77.09,64.061493,3
1356,Bahrain,48,BHR,2016,-7.178585,7.871953,0.321131,469.256012,0.028818,2.685482,...,4.285714,4.343962,5.504734,4.650791,44.28,40.0,3.837299,77.09,64.061493,4
1398,Croatia,191,HRV,2016,-7.178585,38.065098,0.572472,529.421326,0.027620,4.125605,...,15.111111,2.900662,3.772798,3.611939,53.92,55.0,5.759217,86.21,64.006706,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1323,Chad,148,TCD,2015,-7.178585,1.496614,0.002503,237.933502,0.302740,1.249626,...,13.000000,3.322210,2.903032,2.509468,28.13,30.0,8.943832,41.92,56.213028,294
1213,Central African Republic,140,CAF,2015,-7.178585,3.057812,0.048011,243.561905,0.299030,1.107464,...,21.068802,3.905057,3.078271,2.779756,28.13,30.0,9.637313,31.36,56.065792,295
1362,Central African Republic,140,CAF,2016,-7.178585,3.057812,0.048011,243.561905,0.299030,1.107464,...,21.068802,3.905057,3.078271,2.779756,28.13,30.0,9.637313,31.36,56.065792,296
1282,Mali,466,MLI,2015,-7.178585,13.434841,0.045942,245.937393,0.147130,0.594346,...,13.650000,4.272020,3.609540,2.956167,40.35,30.0,8.160204,66.05,55.714619,297


In [100]:
# Export the identifier columns (the current `drop` list) together with the
# predicted score and its rank.
test[drop + ['predicted_prosperity_score', 'ranked']].to_csv('predictions.csv')