<h1> <center> Economic Well Being Prediction for African Countries</center> </h1>

![Economic Well Being Prediction for African Countries](http://governanceinnovation.org/wordpress/wp-content/uploads/2017/02/WE-Africa-LabsA4.jpg)

## Import Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

import statistics
import scipy
import matplotlib.pyplot as plt 
import seaborn as sns 

import scripts.gaussianize as g 

import os 
import sys 
import numpy as np 
import pandas as pd 
from tqdm.notebook import tqdm


from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler, PowerTransformer, QuantileTransformer, FunctionTransformer, MinMaxScaler, StandardScaler, MaxAbsScaler
from kaggler.preprocessing import FrequencyEncoder

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

from sklearn.linear_model import Ridge

## Training Functions

In [2]:
def metric(x, y):
    """Return the root-mean-squared error (RMSE) between ``x`` and ``y``.

    Both arguments are forwarded unchanged to ``mean_squared_error``; the
    square root of that value is returned.
    """
    mse = mean_squared_error(x, y)
    return np.sqrt(mse)

# Shared 5-fold splitter used by every *_predict helper in this cell. The fixed
# integer seed makes each call to skf.split() yield the same folds, so the
# out-of-fold predictions of different models line up row for row.
skf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=199,
)

def xgb_predict(estimator,train,label,test,estimator_name):
    """Run K-fold CV for an XGBoost-style regressor and collect stacking inputs.

    Parameters
    ----------
    estimator : regressor whose ``fit`` accepts ``early_stopping_rounds``,
        ``eval_metric``, ``eval_set`` and ``verbose``, whose ``predict``
        accepts ``ntree_limit``, and which exposes
        ``get_booster().best_ntree_limit``.
    train, label : pandas objects; rows are selected positionally via ``.iloc``.
    test : hold-out features passed to ``estimator.predict``.
    estimator_name : returned unchanged so callers can label the predictions.

    Returns
    -------
    tuple ``(val_pred, test_pred, estimator_name)`` where ``val_pred`` holds the
    out-of-fold prediction for every row of ``train`` and ``test_pred`` is the
    SUM (not the mean) of the per-fold predictions on ``test``; the caller is
    responsible for dividing by the number of folds.
    """
    mean_train = []
    mean_test_val = []
    test_pred = np.zeros(test.shape[0])
    val_pred = np.zeros(train.shape[0])
    for count, (train_index,test_index) in enumerate(skf.split(train,label)):
        x_train,x_test = train.iloc[train_index],train.iloc[test_index]
        y_train,y_test = label.iloc[train_index],label.iloc[test_index]
        print(f'========================Fold{count +1}==========================')
        # The validation fold doubles as the early-stopping set.
        estimator.fit(x_train, y_train, early_stopping_rounds = 200, eval_metric="rmse",
                           eval_set=[(x_test, y_test)],verbose=2500)
        # Look the best tree count up once per fold and reuse it for all three predicts.
        best_limit = estimator.get_booster().best_ntree_limit
        train_predict = estimator.predict(x_train, ntree_limit = best_limit)
        test_predict = estimator.predict(x_test, ntree_limit = best_limit)
        val_pred[test_index] = test_predict
        test_pred+= estimator.predict(test, ntree_limit = best_limit)

        train_rmse = metric(y_train, train_predict)
        val_rmse = metric(y_test, test_predict)
        print('\nTesting scores', val_rmse)
        print('\nTraining scores', train_rmse)
        mean_train.append(train_rmse)
        mean_test_val.append(val_rmse)
    # metric() is RMSE, so the summary is labelled RMSE (it previously said "ROC").
    print('Average Testing RMSE score for 5 folds split:',np.mean(mean_test_val))
    print('Average Training RMSE score for 5 folds split:',np.mean(mean_train))
    print('standard Deviation for 5 folds split:',np.std(mean_test_val))
    return val_pred, test_pred, estimator_name


def cat_predict(estimator,train,label,test,estimator_name):
    """Run K-fold CV for a CatBoost-style regressor and collect stacking inputs.

    Parameters
    ----------
    estimator : regressor whose ``fit`` accepts ``eval_set``,
        ``early_stopping_rounds``, ``verbose`` and ``use_best_model``.
    train, label : pandas objects; rows are selected positionally via ``.iloc``.
    test : hold-out features passed to ``estimator.predict``.
    estimator_name : returned unchanged so callers can label the predictions.

    Returns
    -------
    tuple ``(val_pred, test_pred, estimator_name)`` where ``val_pred`` holds the
    out-of-fold prediction for every row of ``train`` and ``test_pred`` is the
    SUM (not the mean) of the per-fold predictions on ``test``; the caller is
    responsible for dividing by the number of folds.
    """
    mean_train = []
    mean_test_val = []
    test_pred = np.zeros(test.shape[0])
    val_pred = np.zeros(train.shape[0])
    # Give the hold-out features the same NaN/inf replacement the fold data
    # gets below. Previously `test` was predicted raw, so a missing value was
    # seen as 0 during training but as NaN at prediction time.
    test_features = np.nan_to_num(test)
    for count, (train_index,test_index) in enumerate(skf.split(train,label)):
        x_train,x_test = train.iloc[train_index],train.iloc[test_index]
        y_train,y_test = label.iloc[train_index],label.iloc[test_index]
        x_train = np.nan_to_num(x_train)
        y_train = np.nan_to_num(y_train)
        x_test = np.nan_to_num(x_test)
        y_test = np.nan_to_num(y_test)

        print(f'========================Fold{count +1}==========================')
        # The validation fold doubles as the early-stopping set.
        estimator.fit(x_train,y_train,eval_set=[(x_test,y_test)],early_stopping_rounds=200,
                           verbose=2500,use_best_model=True)
        train_predict = estimator.predict(x_train)
        test_predict = estimator.predict(x_test)
        val_pred[test_index] = test_predict
        test_pred+= estimator.predict(test_features)

        train_rmse = metric(y_train, train_predict)
        val_rmse = metric(y_test, test_predict)
        print('\nTesting scores', val_rmse)
        print('\nTraining scores', train_rmse)
        mean_train.append(train_rmse)
        mean_test_val.append(val_rmse)
    # metric() is RMSE, so the summary is labelled RMSE (it previously said "ROC").
    print('Average Testing RMSE score for 5 folds split:',np.mean(mean_test_val))
    print('Average Training RMSE score for 5 folds split:',np.mean(mean_train))
    print('standard Deviation for 5 folds split:',np.std(mean_test_val))
    return val_pred, test_pred, estimator_name


def lgb_predict(estimator,train,label,test,estimator_name):
    """Run K-fold CV for a LightGBM-style regressor and collect stacking inputs.

    Parameters
    ----------
    estimator : regressor whose ``fit`` accepts ``eval_set``,
        ``early_stopping_rounds`` and ``verbose``, whose ``predict`` accepts
        ``num_iteration``, and which exposes ``best_iteration_`` after fitting.
    train, label : pandas objects; rows are selected positionally via ``.iloc``
        and converted to arrays with ``.values``.
    test : hold-out features passed to ``estimator.predict``.
    estimator_name : returned unchanged so callers can label the predictions.

    Returns
    -------
    tuple ``(val_pred, test_pred, estimator_name)`` where ``val_pred`` holds the
    out-of-fold prediction for every row of ``train`` and ``test_pred`` is the
    SUM (not the mean) of the per-fold predictions on ``test``; the caller is
    responsible for dividing by the number of folds.
    """
    mean_train = []
    mean_test_val = []
    test_pred = np.zeros(test.shape[0])
    val_pred = np.zeros(train.shape[0])
    for count, (train_index,test_index) in enumerate(skf.split(train,label)):
        x_train,x_test = train.iloc[train_index].values,train.iloc[test_index].values
        y_train,y_test = label.iloc[train_index].values,label.iloc[test_index].values
        print(f'========================Fold{count +1}==========================')
        # The validation fold doubles as the early-stopping set.
        estimator.fit(x_train,y_train,eval_set=[(x_test,y_test)],early_stopping_rounds=200,
                               verbose=2500)
        # Read the early-stopped iteration once per fold and reuse it for all three predicts.
        best_iter = estimator.best_iteration_
        train_predict = estimator.predict(x_train, num_iteration = best_iter)
        test_predict = estimator.predict(x_test, num_iteration = best_iter)
        val_pred[test_index] = test_predict
        test_pred+= estimator.predict(test, num_iteration = best_iter)

        train_rmse = metric(y_train, train_predict)
        val_rmse = metric(y_test, test_predict)
        print('\nValidation scores', val_rmse)
        print('\nTraining scores', train_rmse)
        mean_train.append(train_rmse)
        mean_test_val.append(val_rmse)
    # metric() is RMSE, so the summary is labelled RMSE (it previously said "ROC").
    print('Average Testing RMSE score for 5 folds split:',np.mean(mean_test_val))
    print('Average Training RMSE score for 5 folds split:',np.mean(mean_train))
    print('standard Deviation for 5 folds split:',np.std(mean_test_val))
    return val_pred, test_pred, estimator_name

def model_predict(estimator,train,label,test, estimator_name):
    """Cross-validate a plain ``fit``/``predict`` estimator over the shared folds.

    For every fold of ``skf`` the estimator is refit on the training part,
    scored with ``metric`` on both parts, and used to predict ``test``.

    Returns
    -------
    tuple ``(val_pred, test_pred, estimator_name, mean_val_score, mean_train_score)``.
    ``val_pred`` holds the out-of-fold prediction for every row of ``train``;
    ``test_pred`` is the SUM of the per-fold predictions on ``test`` (callers
    divide by the fold count); the two scores are means of the per-fold
    ``metric`` values.
    """
    train_scores, val_scores = [], []
    val_pred = np.zeros((train.shape[0]))
    test_pred = np.zeros((test.shape[0]))

    for fold_no, (fit_idx, hold_idx) in enumerate(skf.split(train, label), start=1):
        x_fit = train.iloc[fit_idx].values
        x_hold = train.iloc[hold_idx].values
        y_fit = label.iloc[fit_idx].values
        y_hold = label.iloc[hold_idx].values

        print(f'========================Fold{fold_no}==========================')
        estimator.fit(x_fit, y_fit)
        fit_pred = estimator.predict(x_fit)
        hold_pred = estimator.predict(x_hold)

        # Flatten to 1-D before storing the out-of-fold predictions.
        val_pred[hold_idx] = hold_pred.reshape(hold_pred.shape[0])
        test_pred += estimator.predict(test.values)

        hold_score = metric(y_hold, hold_pred)
        fit_score = metric(y_fit, fit_pred)
        print('\nValidation scores', hold_score)
        print('\nTraining scores', fit_score)
        train_scores.append(fit_score)
        val_scores.append(hold_score)

    avg_val = np.mean(val_scores)
    avg_train = np.mean(train_scores)
    print('Average Testing RMSE  for 5 folds split:', avg_val)
    print('Average Training RMSE  for 5 folds split:', avg_train)
    print('standard Deviation for 5 folds split:', np.std(val_scores))
    return val_pred, test_pred, estimator_name, avg_val, avg_train

def Create_StackDataFrames(train_preds, test_preds, names, n_folds=5):
    """Assemble base-model predictions into the meta-learner's feature frames.

    Parameters
    ----------
    train_preds : sequence of 1-D out-of-fold prediction arrays, one per model.
    test_preds : sequence of 1-D arrays holding, per model, the SUM of the
        per-fold predictions on the hold-out set.
    names : sequence of column names, paired positionally with the predictions.
    n_folds : number of folds that were summed into each ``test_preds`` entry;
        defaults to 5, the fold count used throughout this notebook.

    Returns
    -------
    tuple ``(Train_stack, Test_stack)`` of DataFrames with one column per model.
    ``Test_stack`` is divided by ``n_folds`` so it holds fold-averaged predictions.
    """
    # `axis` must be passed by keyword: the positional form was deprecated in
    # pandas 1.3 and raises TypeError from pandas 2.0.
    Train_stack = pd.concat(
        [pd.Series(tr_pred, name=name) for tr_pred, name in zip(train_preds, names)],
        axis=1,
    )

    Test_stack = pd.concat(
        [pd.Series(te_pred, name=name) for te_pred, name in zip(test_preds, names)],
        axis=1,
    )

    # Turn the summed per-fold test predictions into their average.
    Test_stack = Test_stack / n_folds

    return Train_stack, Test_stack
    
def Stack(meta_estimator,Train_stack,Test_stack,target,file_name):
    """Fit the meta-estimator on stacked predictions and write a submission CSV.

    Parameters
    ----------
    meta_estimator : estimator with ``fit``/``predict``, cross-validated through
        ``model_predict``.
    Train_stack, Test_stack : DataFrames of base-model predictions.
    target : training labels aligned positionally with ``Train_stack``.
    file_name : path the submission CSV is written to.

    Returns
    -------
    tuple ``(ss, val_pred, test_pred, estimator_name, test_score, train_score)``.
    ``test_pred`` is the un-averaged sum returned by ``model_predict``.

    Side effects
    ------------
    Overwrites the ``Target`` column of the notebook-level ``ss`` frame and
    saves it to ``file_name``.
    """
    # Use the `target` argument; this call previously read the notebook-global
    # `ytrain`, silently ignoring whatever labels the caller passed in.
    val_pred, test_pred, estimator_name, test_score, train_score = model_predict(
        meta_estimator, Train_stack, target, Test_stack, "Ridge"
    )

    # model_predict sums the hold-out predictions over the 5 folds of `skf`.
    prediction = test_pred/5
    ss['Target'] = prediction

    # Submitted values are forced non-negative by taking the absolute value.
    ss.Target=ss.Target.apply(abs)

    ss.to_csv(file_name,index=False)
    return ss, val_pred, test_pred, estimator_name, test_score, train_score

def trainer(model_name, xtrain, ytrain, xtest):
    """Train the CatBoost + LightGBM base models and stack them with Ridge.

    Each base model is cross-validated by its ``*_predict`` helper; their
    predictions become the two input columns of a ``Ridge`` meta-estimator,
    and ``Stack`` writes the result to ``../Submissions/stack_<model_name>.csv``.

    Returns the 6-tuple produced by ``Stack``:
    ``(ss, val_pred, test_pred, estimator_name, test_score, train_score)``.
    """
    # Base learner 1: CatBoost.
    cat_model = CatBoostRegressor(
        random_seed=34,
        use_best_model=True,
        n_estimators=400000,
        silent=True,
        eval_metric='RMSE',
    )
    cat_oof, cat_holdout, cat_label = cat_predict(
        cat_model, xtrain, ytrain, xtest, 'catboost(1)'
    )

    # Base learner 2: LightGBM.
    lgb_model = LGBMRegressor(
        random_state=34,
        n_estimators=100000,
        colsample_bytree=0.9,
        min_child_samples=10,
        subsample=0.7,
        subsample_freq=2,
        num_leaves=120,
        reg_lambda=1,
        reg_alpha=1,
        metric="rmse",
        learning_rate=0.01,
        max_depth=5,
    )
    lgb_oof, lgb_holdout, lgb_label = lgb_predict(
        lgb_model, xtrain, ytrain, xtest, 'lightgbm(1)'
    )

    # Level-2 inputs: one column of predictions per base learner.
    stack_train, stack_test = Create_StackDataFrames(
        [cat_oof, lgb_oof],
        [cat_holdout, lgb_holdout],
        [cat_label, lgb_label],
    )

    submission_path = f'../Submissions/stack_{model_name}.csv'
    return Stack(Ridge(), stack_train, stack_test, ytrain, submission_path)

## Reading Data

In [3]:
# List the data directory (alphabetically) to see which files are available.
sorted(os.listdir("../Data"))

['SampleSubmission.csv', 'Test.csv', 'Train.csv']

In [4]:
# Load each file by name instead of unpacking a sorted directory listing: an
# extra or renamed CSV in ../Data would otherwise break the three-way unpacking
# or silently bind the wrong frame to ss / test / train.
ss, test, train = [
    pd.read_csv(f"../Data/{csv_name}")
    for csv_name in ("SampleSubmission.csv", "Test.csv", "Train.csv")
]

## Preprocessing Function

**Normalization techniques**:
    
    * The Lambert W x F transformation
    * The Box Cox transformation
    * The Yeo-Johnson transformation
    * The Ordered Quantile technique
    * The logarithm of base `b` with an additive offset `a`: $\log_{b}(x + a)$
    * The sqrt transformation
    * The Exponential transformation 
    * The inverse hyperbolic sine (arcsinh) transformation 
    * Modified Box Cox (1964)
    * Manly’s Exponential (1976)
    * John/Draper’s Modulus (1980), 
    * Bickel/Doksum’s Modified Box Cox (1981)
    * Min-Max Normalization
    * Z-Score Standardization
    * Median-Max Normalization (CapitainData)
    * Feature Clipping
    * Quantile Normalization
    * Reciprocal Transformation 
    * Power Transformation 
   + Normalization by Categorical variables (country, year, or urban_or_rural)
    
 - Selecting the best technique

 **Encoding Techniques**:
 
    + Supervised:
    
        ° Target Encoding (Mean, var, min, max, std, ...)
        
        ° Bayesian Target Encoding / Conjugate Prior Encoding
        
        ° CatBoost 
        
        ° Generalized Linear Mixed Model 
        
        ° James-Stein Estimator 
        
        ° LeaveOneOut 
        
        ° M-estimator 
        
        ° Weight of Evidence
        
        ° Hierarchical Bayesian Target Encoding
        
        ° Probability Ratio
        
        ° DecisionTree Encoder
        
        ° RareLabel Encoder
        
        ° Target guided Ordinal Encoding
        
    + Unsupervised:

        ° Backward Difference Contrast 
        
        ° BaseN 
        
        ° Binary 
        
        ° Count 
        
        ° Hashing 
        
        ° Helmert Contrast 
        
        ° Ordinal 
        
        ° One-Hot 
        
        ° Label
        
        ° Polynomial Contrast 
        
        ° Sum Contrast 
        
        ° Thermometer Encoder 


**Discretisation techniques**:

        & Equal Frequency Discretiser
        & EqualWidthDiscretiser
        & ArbitraryDiscretiser 
        & DecisionTreeDiscretiser
        


**Feature Creation techniques**:

        @ MathematicalCombination
        @ CombineWithReferenceFeature
        @ CyclicalTransformer

**Feature Selection techniques**:

        p Drop Features
        p Drop Constant Features 
        p Drop Duplicate Features 
        p Drop Correlated Features 
        p Smart Correlated Features 
        p Select By Shuffling
        p Select By Single Feature Performance
        p Select By Target Mean Performance
        p Recursive Feature Elimination
        p Recursive Feature Addition

In [5]:
def preprocessor(train, test, normalizer=RobustScaler(), encoder=FrequencyEncoder()):
    """Scale numeric columns, encode categoricals and split into model inputs.

    Parameters
    ----------
    train, test : DataFrames containing ``ID``, the columns named in
        ``to_scale`` / ``to_scale_100`` and the categorical columns
        ``country``, ``urban_or_rural`` and ``year``; ``train`` also has ``Target``.
        Neither frame is modified: the function works on copies.
    normalizer : object with ``fit``/``transform``; fitted on the shifted
        ``to_scale`` columns of ``train`` and applied to both frames.
    encoder : object with ``fit_transform``; fitted on the categorical columns
        of ``train`` and ``test`` combined.
        Note: both defaults are single instances created when the function is
        defined and are re-fitted on every call.

    Returns
    -------
    tuple ``(xtrain, ytrain, xtest)``. Feature columns are every column except
    ``ID`` and ``Target``, in the sorted order ``Index.difference`` returns.
    """
    print("\n Procesing started \n")
    to_scale =["ghsl_pop_density", "nighttime_lights", "dist_to_shoreline", "dist_to_capital"]

    to_scale_100 = ["landcover_crops_fraction", "landcover_urban_fraction", "landcover_water_permanent_10km_fraction", "landcover_water_seasonal_10km_fraction"]

    # Work on copies. The column assignments below used to write into the
    # caller's frames, so re-running the cell (or trying a second normalizer)
    # divided by 100 again and re-scaled already-scaled values.
    train = train.copy()
    test = test.copy()

    train[to_scale_100]=train[to_scale_100]/100
    test[to_scale_100]=test[to_scale_100]/100

    # Shift by +8 before normalizing.
    # NOTE(review): presumably this keeps inputs positive for transforms that
    # require it - confirm the intent of the constant.
    train_shifted = train[to_scale] + 8
    normalizer.fit(train_shifted)
    train[to_scale]=normalizer.transform(train_shifted)
    test[to_scale]=normalizer.transform(test[to_scale] + 8)

    train_ids=train.ID.unique()
    test_ids=test.ID.unique()
    all_data=pd.concat([train, test], axis=0)

    cat_cols = ["country", "urban_or_rural", "year"]

    # Encode year as a string category alongside the other categoricals.
    all_data["year"]=all_data["year"].apply(str)

    all_data[cat_cols]=encoder.fit_transform(all_data[cat_cols])

    # Split back by ID membership.
    # NOTE(review): this assumes no ID occurs in both train and test; a shared
    # ID would put that row in both outputs.
    train = all_data.loc[all_data.ID.isin(train_ids)]
    test = all_data.loc[all_data.ID.isin(test_ids)]

    main_cols=train.columns.difference(["ID", 'Target'])
    xtrain = train[main_cols]
    xtest = test[main_cols]
    ytrain=train.Target

    print("Processing completed \n")
    return xtrain, ytrain, xtest

In [6]:
# Build model-ready features with the default normalizer and encoder.
xtrain, ytrain, xtest = preprocessor(train, test)

# Train the CatBoost + LightGBM stack; this also writes ../Submissions/stack_baseline.csv.
ss, val_pred, test_pred, estimator_name, test_score, train_score = trainer("baseline", xtrain, ytrain, xtest)

# Summary statistics of the submission frame, as a check on the predicted range.
ss.describe()


 Procesing started 

Processing completed 

Learning rate set to 0.001887
0:	learn: 0.1943936	test: 0.1931049	best: 0.1931049 (0)	total: 55.2ms	remaining: 6h 7m 40s
2500:	learn: 0.0932023	test: 0.0935957	best: 0.0935957 (2500)	total: 14.9s	remaining: 39m 31s
5000:	learn: 0.0878585	test: 0.0895007	best: 0.0895007 (5000)	total: 28.2s	remaining: 37m 6s
7500:	learn: 0.0846477	test: 0.0878726	best: 0.0878726 (7500)	total: 40.9s	remaining: 35m 39s
10000:	learn: 0.0823027	test: 0.0870770	best: 0.0870770 (10000)	total: 53.6s	remaining: 34m 48s
12500:	learn: 0.0804954	test: 0.0866034	best: 0.0866034 (12500)	total: 1m 6s	remaining: 34m 6s
15000:	learn: 0.0789069	test: 0.0862715	best: 0.0862715 (15000)	total: 1m 19s	remaining: 33m 57s
17500:	learn: 0.0774551	test: 0.0860134	best: 0.0860134 (17500)	total: 1m 50s	remaining: 40m 23s
20000:	learn: 0.0761520	test: 0.0858325	best: 0.0858321 (19998)	total: 2m 3s	remaining: 39m 7s
22500:	learn: 0.0749450	test: 0.0857047	best: 0.0857047 (22500)	total: 2m

[2500]	valid_0's rmse: 0.0863529
Early stopping, best iteration is:
[3475]	valid_0's rmse: 0.0862236

Validation scores 0.08622363499451881

Training scores 0.07172311045613941
Training until validation scores don't improve for 200 rounds
[2500]	valid_0's rmse: 0.0868272
Early stopping, best iteration is:
[4136]	valid_0's rmse: 0.0865136

Validation scores 0.08651356009028356

Training scores 0.07018485693458079
Training until validation scores don't improve for 200 rounds
[2500]	valid_0's rmse: 0.0874155
[5000]	valid_0's rmse: 0.0868777
Early stopping, best iteration is:
[5314]	valid_0's rmse: 0.08685

Validation scores 0.08685004029931383

Training scores 0.06766671840777502
Training until validation scores don't improve for 200 rounds
[2500]	valid_0's rmse: 0.0858238
Early stopping, best iteration is:
[4062]	valid_0's rmse: 0.0854992

Validation scores 0.08549918845916478

Training scores 0.0705489904226157
Training until validation scores don't improve for 200 rounds
[2500]	valid_0

Unnamed: 0,Target
count,7194.0
mean,0.327742
std,0.175333
min,0.080906
25%,0.182593
50%,0.262342
75%,0.477112
max,0.77257
