In [1]:
# Import standard packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import plot_importance
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import graphviz
import lime

In [2]:
# Import the data
X_preprocessed = pd.read_csv('data/training_data_independent.csv', index_col='id')
y_preprocessed = pd.read_csv('data/training_data_labels.csv', index_col='id')
X_testing = pd.read_csv('data/testing_data_independent.csv', index_col='id')

# Concatenate dependent and independent variables
data = pd.concat([X_preprocessed, y_preprocessed.status_group], axis=1)

# columns to label encode
label_encode_cols = ['permit', 'management', 'payment', 'water_quality', 'quantity', 'source', 
                     'waterpoint_type', 'extraction_type']

#Create competition training dataframe
comp = data.copy()
col_maps = {}
# instantiate label encoder and execute
le = LabelEncoder()
for col in label_encode_cols:
    le.fit(data[col])
    comp.loc[:,col] = le.transform(comp[col])
    col_maps[col] = dict(zip(le.transform(le.classes_), le.classes_))
    if col != 'status_group':
        X_testing.loc[:,col] = le.transform(X_testing[col])
        
X = comp.drop(columns=['status_group'])
y = comp.status_group.copy()

# identify

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)
#X_test = replace_unique(X_train, X_test, high_card_cols, 'other')
X_test = X_test.copy()

In [3]:
def target_encode_columns(train, test, features, target):
    df_train = train.copy()
    df_test = test.copy()
    
    n = len(df_train)
    labels = sorted(list(df_train[target].unique()))

    # Get dataset averages of each target class
    label1_avg = len(df_train.loc[df_train[target] == labels[0]])/n
    label2_avg = len(df_train.loc[df_train[target] == labels[1]])/n
    label3_avg = len(df_train.loc[df_train[target] == labels[2]])/n
    
    # Get target dummies and list of new target dummy variable columns names
    df_train1 = pd.get_dummies(df_train, columns=[target])
    targets = list(df_train1.columns[-3:])
    
    for col in features:
        print('Encoding', col, end='... ')
        
        # Get list of feature classes and list of their class counts
        unique_c = list(df_train1.groupby(by=col).mean().index)
        counts = list(df_train1.groupby(by=col).count()[targets[0]])

        # Get list of each target class' mean for the feature's unique classes
        target0_means = list(df_train1.groupby(by=col).mean()[targets[0]])
        target1_means = list(df_train1.groupby(by=col).mean()[targets[1]])
        target2_means = list(df_train1.groupby(by=col).mean()[targets[2]])
      
        # Initialize new training and testing columns for the feature's target encoded columns
        df_train1[col+'_functional'], df_train1[col+'_repair'], df_train1[col+'_nonfunctional'] = 0, 0, 0
        df_test[col+'_functional'], df_test[col+'_repair'], df_test[col+'_nonfunctional'] = 0, 0, 0
        
        # For each unique class of the feature, calculate smoothed target mean (one for each of the 3 target classes)        
        for i, c in enumerate(unique_c):
            if counts[i] <= 10:
                target0_smoothed = (counts[i]-1)/10 * target0_means[i] + (11-counts[i])/10 * label1_avg
                target1_smoothed = (counts[i]-1)/10 * target1_means[i] + (11-counts[i])/10 * label2_avg
                target2_smoothed = (counts[i]-1)/10 * target2_means[i] + (11-counts[i])/10 * label2_avg
            else:
                target0_smoothed = target0_means[i]
                target1_smoothed = target1_means[i]
                target2_smoothed = target2_means[i]
            
            # Assign that value to each to each instance which has the corresponding feature value
            df_train1.loc[df_train1[col]==c,col+'_functional'] = target0_smoothed
            df_train1.loc[df_train1[col]==c,col+'_repair'] = target1_smoothed
            df_train1.loc[df_train1[col]==c,col+'_nonfunctional'] = target2_smoothed
            
            
            if c in df_test[col].unique():
                df_test.loc[df_test[col]==c, col+'_functional'] = target0_smoothed
                df_test.loc[df_test[col]==c, col+'_repair'] = target1_smoothed
                df_test.loc[df_test[col]==c, col+'_nonfunctional'] = target2_smoothed

        print('complete.')
        
    df_train1.drop(columns=features+targets, inplace=True)
    df_test.drop(columns=features, inplace=True)
    return df_train1, df_test

In [4]:
X_train = pd.concat([X_train, y_train], axis=1)
X_train1, X_test1 = target_encode_columns(X_train, X_test, ['ward', 'lga'], 'status_group')

Encoding ward... complete.
Encoding lga... complete.


In [7]:
X_train1.status_group

AttributeError: 'DataFrame' object has no attribute 'status_group'

In [6]:
X_test1.columns

Index(['amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer',
       'longitude', 'latitude', 'wpt_name', 'num_private', 'basin',
       'subvillage', 'region', 'region_code', 'district_code', 'population',
       'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name',
       'permit', 'construction_year', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group',
       'ward_functional', 'ward_repair', 'ward_nonfunctional',
       'lga_functional', 'lga_repair', 'lga_nonfunctional'],
      dtype='object')

In [6]:
df_train0 = X_train
df_test0 = X_test
features = ['lga', 'ward']
target='status_group'
df_train = df_train0.copy()
df_test = df_test0.copy()

n = len(df_train)
labels = sorted(list(df_train[target].unique()))

# Get dataset averages of each target class
label1_avg = len(df_train.loc[df_train[target] == labels[0]])/n
label2_avg = len(df_train.loc[df_train[target] == labels[1]])/n
label3_avg = len(df_train.loc[df_train[target] == labels[2]])/n

# Get target dummies and list of new target dummy variable columns names
df_train1 = pd.get_dummies(df_train, columns=[target])
targets = list(df_train1.columns[-3:])

In [7]:
df_train1

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group_functional,status_group_functional needs repair,status_group_non functional
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34470,0.0,2011-07-29,Hesawa,0,DWE,33.037573,-2.503828,Kwa Mbisu,0,Lake Victoria,...,2,insufficient,7,shallow well,groundwater,6,other,0,0,1
55171,200.0,2013-01-20,Rvemp,1155,DWE,33.378965,-2.154466,Kwachisaku Mwndu,0,Lake Victoria,...,1,enough,7,shallow well,groundwater,4,hand pump,0,0,1
72157,0.0,2011-04-09,Government Of Tanzania,0,Central Government,33.429493,-9.026036,Church Of God,0,Lake Rukwa,...,3,seasonal,5,rainwater harvesting,surface,1,communal standpipe,1,0,0
53472,0.0,2011-03-12,Amref,13,AMREF,39.213282,-7.211929,Kwa Mbwela,0,Wami / Ruvu,...,1,enough,7,shallow well,groundwater,4,hand pump,1,0,0
14717,0.0,2011-03-02,,-37,,39.655339,-7.916720,Kwa Kasimu,0,Rufiji,...,1,enough,3,borehole,groundwater,1,communal standpipe,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19959,0.0,2012-10-10,Dwsp,0,DWE,33.469236,-3.607610,Shuleni,0,Internal,...,3,seasonal,5,rainwater harvesting,surface,1,communal standpipe,0,1,0
31378,0.0,2013-01-31,Finw,279,FinW,39.587734,-10.682716,Shuleni,0,Ruvuma / Southern Coast,...,0,dry,3,borehole,groundwater,2,communal standpipe,0,0,1
73223,0.0,2011-03-04,Rc Ch,1736,RC Ch,34.875985,-9.591650,none,0,Lake Nyasa,...,1,enough,8,spring,groundwater,1,communal standpipe,1,0,0
49904,0.0,2013-03-20,World Vision,1322,Community,36.797282,-3.355298,Nikodemo Risa,0,Pangani,...,1,enough,8,spring,groundwater,1,communal standpipe,0,0,1


In [16]:
#for col in features:
col = 'ward'

In [17]:
# Get list of feature classes and list of their class counts
unique_c = list(df_train1.groupby(by=col).mean().index)
counts = list(df_train1.groupby(by=col).count()[targets[0]])

# Get list of each target class' mean for the feature's unique classes
target0_means = list(df_train1.groupby(by=col).mean()[targets[0]])
target1_means = list(df_train1.groupby(by=col).mean()[targets[1]])
target2_means = list(df_train1.groupby(by=col).mean()[targets[2]])

# Create dictionary to map the above lists together
my_dict = {'classes':unique_c,
           target[0]:target0_means,
           target[1]: target1_means,
           target[2]:target2_means,
}

# Initialize new training and testing columns for the feature's target encoded columns
df_train1[col+'_functional'], df_train1[col+'_repair'], df_train1[col+'_nonfunctional'] = 0, 0, 0
df_test[col+'_functional'], df_test[col+'_repair'], df_test[col+'_nonfunctional'] = 0, 0, 0

In [18]:
df_train1

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,waterpoint_type_group,status_group_functional,status_group_functional needs repair,status_group_non functional,lga_functional,lga_repair,lga_nonfunctional,ward_functional,ward_repair,ward_nonfunctional
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34470,0.0,2011-07-29,Hesawa,0,DWE,33.037573,-2.503828,Kwa Mbisu,0,Lake Victoria,...,other,0,0,1,0.334906,0.086478,0.578616,0,0,0
55171,200.0,2013-01-20,Rvemp,1155,DWE,33.378965,-2.154466,Kwachisaku Mwndu,0,Lake Victoria,...,hand pump,0,0,1,0.310976,0.009146,0.679878,0,0,0
72157,0.0,2011-04-09,Government Of Tanzania,0,Central Government,33.429493,-9.026036,Church Of God,0,Lake Rukwa,...,communal standpipe,1,0,0,0.624277,0.057803,0.317919,0,0,0
53472,0.0,2011-03-12,Amref,13,AMREF,39.213282,-7.211929,Kwa Mbwela,0,Wami / Ruvu,...,hand pump,1,0,0,0.476658,0.007371,0.515971,0,0,0
14717,0.0,2011-03-02,,-37,,39.655339,-7.916720,Kwa Kasimu,0,Rufiji,...,communal standpipe,1,0,0,0.712871,0.099010,0.188119,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19959,0.0,2012-10-10,Dwsp,0,DWE,33.469236,-3.607610,Shuleni,0,Internal,...,communal standpipe,0,1,0,0.603896,0.045455,0.350649,0,0,0
31378,0.0,2013-01-31,Finw,279,FinW,39.587734,-10.682716,Shuleni,0,Ruvuma / Southern Coast,...,communal standpipe,0,0,1,0.287879,0.045455,0.666667,0,0,0
73223,0.0,2011-03-04,Rc Ch,1736,RC Ch,34.875985,-9.591650,none,0,Lake Nyasa,...,communal standpipe,1,0,0,0.802017,0.038217,0.159766,0,0,0
49904,0.0,2013-03-20,World Vision,1322,Community,36.797282,-3.355298,Nikodemo Risa,0,Pangani,...,communal standpipe,0,0,1,0.647292,0.034346,0.318362,0,0,0


In [13]:
df_train1

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group_functional,status_group_functional needs repair,status_group_non functional,lga_functional,lga_repair,lga_nonfunctional
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34470,0.0,2011-07-29,Hesawa,0,DWE,33.037573,-2.503828,Kwa Mbisu,0,Lake Victoria,...,shallow well,groundwater,6,other,0,0,1,0,0,0
55171,200.0,2013-01-20,Rvemp,1155,DWE,33.378965,-2.154466,Kwachisaku Mwndu,0,Lake Victoria,...,shallow well,groundwater,4,hand pump,0,0,1,0,0,0
72157,0.0,2011-04-09,Government Of Tanzania,0,Central Government,33.429493,-9.026036,Church Of God,0,Lake Rukwa,...,rainwater harvesting,surface,1,communal standpipe,1,0,0,0,0,0
53472,0.0,2011-03-12,Amref,13,AMREF,39.213282,-7.211929,Kwa Mbwela,0,Wami / Ruvu,...,shallow well,groundwater,4,hand pump,1,0,0,0,0,0
14717,0.0,2011-03-02,,-37,,39.655339,-7.916720,Kwa Kasimu,0,Rufiji,...,borehole,groundwater,1,communal standpipe,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19959,0.0,2012-10-10,Dwsp,0,DWE,33.469236,-3.607610,Shuleni,0,Internal,...,rainwater harvesting,surface,1,communal standpipe,0,1,0,0,0,0
31378,0.0,2013-01-31,Finw,279,FinW,39.587734,-10.682716,Shuleni,0,Ruvuma / Southern Coast,...,borehole,groundwater,2,communal standpipe,0,0,1,0,0,0
73223,0.0,2011-03-04,Rc Ch,1736,RC Ch,34.875985,-9.591650,none,0,Lake Nyasa,...,spring,groundwater,1,communal standpipe,1,0,0,0,0,0
49904,0.0,2013-03-20,World Vision,1322,Community,36.797282,-3.355298,Nikodemo Risa,0,Pangani,...,spring,groundwater,1,communal standpipe,0,0,1,0,0,0


In [19]:
for i, c in enumerate(my_dict['classes']):
    if counts[i] <= 10:
        target0_smoothed = (counts[i]-1)/10 * my_dict[target[0]][i] + (11-counts[i])/10 * label1_avg
        target1_smoothed = (counts[i]-1)/10 * my_dict[target[1]][i] + (11-counts[i])/10 * label2_avg
        target2_smoothed = (counts[i]-1)/10 * my_dict[target[2]][i] + (11-counts[i])/10 * label2_avg
    else:
        target0_smoothed = my_dict[target[0]][i]
        target1_smoothed = my_dict[target[1]][i]
        target2_smoothed = my_dict[target[2]][i]

    # Assign that value to each to each instance which has the corresponding feature value
    df_train1.loc[df_train1[col]==c,col+'_functional'] = target0_smoothed
    df_train1.loc[df_train1[col]==c,col+'_repair'] = target1_smoothed
    df_train1.loc[df_train1[col]==c,col+'_nonfunctional'] = target2_smoothed


    if c in df_test[col].unique():
        df_test.loc[df_test[col]==c, col+'_functional'] = target0_smoothed
        df_test.loc[df_test[col]==c, col+'_repair'] = target1_smoothed
        df_test.loc[df_test[col]==c, col+'_nonfunctional'] = target2_smoothed

print('complete.')

complete.


In [20]:
df_train1

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,waterpoint_type_group,status_group_functional,status_group_functional needs repair,status_group_non functional,lga_functional,lga_repair,lga_nonfunctional,ward_functional,ward_repair,ward_nonfunctional
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34470,0.0,2011-07-29,Hesawa,0,DWE,33.037573,-2.503828,Kwa Mbisu,0,Lake Victoria,...,other,0,0,1,0.334906,0.086478,0.578616,0.225000,0.025000,0.750000
55171,200.0,2013-01-20,Rvemp,1155,DWE,33.378965,-2.154466,Kwachisaku Mwndu,0,Lake Victoria,...,hand pump,0,0,1,0.310976,0.009146,0.679878,0.438979,0.036139,0.369473
72157,0.0,2011-04-09,Government Of Tanzania,0,Central Government,33.429493,-9.026036,Church Of God,0,Lake Rukwa,...,communal standpipe,1,0,0,0.624277,0.057803,0.317919,0.772312,0.036139,0.036139
53472,0.0,2011-03-12,Amref,13,AMREF,39.213282,-7.211929,Kwa Mbwela,0,Wami / Ruvu,...,hand pump,1,0,0,0.476658,0.007371,0.515971,0.375000,0.000000,0.625000
14717,0.0,2011-03-02,,-37,,39.655339,-7.916720,Kwa Kasimu,0,Rufiji,...,communal standpipe,1,0,0,0.712871,0.099010,0.188119,0.833333,0.027778,0.138889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19959,0.0,2012-10-10,Dwsp,0,DWE,33.469236,-3.607610,Shuleni,0,Internal,...,communal standpipe,0,1,0,0.603896,0.045455,0.350649,0.585366,0.024390,0.390244
31378,0.0,2013-01-31,Finw,279,FinW,39.587734,-10.682716,Shuleni,0,Ruvuma / Southern Coast,...,communal standpipe,0,0,1,0.287879,0.045455,0.666667,0.272727,0.000000,0.727273
73223,0.0,2011-03-04,Rc Ch,1736,RC Ch,34.875985,-9.591650,none,0,Lake Nyasa,...,communal standpipe,1,0,0,0.802017,0.038217,0.159766,1.000000,0.000000,0.000000
49904,0.0,2013-03-20,World Vision,1322,Community,36.797282,-3.355298,Nikodemo Risa,0,Pangani,...,communal standpipe,0,0,1,0.647292,0.034346,0.318362,0.428571,0.000000,0.571429
