In [1]:
#import statements
import pandas as pd
from sklearn.model_selection import train_test_split
import time

In [2]:
# import preprocessed data
X_preprocessed = pd.read_csv('data/training_data_independent.csv', index_col='id')
y_preprocessed = pd.read_csv('data/training_data_labels.csv', index_col='id')

# Concatenate dependent and independent variables
data = pd.concat([X_preprocessed, y_preprocessed.status_group], axis=1)

In [14]:
# Separate predictors and target
X = data.drop(columns=['status_group'])
y = data[['status_group']].copy()

# Split the training data into more testing /  training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)

# Add the target feature back to the training data and target encode the data
X_train = pd.concat([X_train, y_train], axis=1)
X_test = pd.concat([X_test, y_test], axis=1)
X_train_two_class = X_train.loc[(X_train.status_group == 'functional')|(X_train.status_group == 'non functional')].copy()
X_test_two_class = X_test.loc[(X_test.status_group == 'functional')|(X_test.status_group == 'non functional')].copy()

In [22]:
def target_encode_columns_new(train, test, features, target, remove_features=True):
    '''
    Target encode the features of the training set and its associated testing set.
    
    Parameters
    ----------
    train: dataframe, the training data which includes the target feature
    test: dataframe, the testing data which does not include the target feature
    cols: list, columns to target encode
    target: string, name of the target feature
    remove_features: boolean, indicates whether or not to return original features
    
    Returns
    ---------
    DataFrame
        Training dataframe with new target encoded feature columns, one column for each target class
        Testing dataframe with new target encoded feature columns, one column for each target class
    '''
        
    df_train = train.copy()
    df_test = test.copy()
    
    n = len(df_train)
    labels = sorted(list(df_train[target].unique()))
    num_target_classes = len(labels)

    # Get dataset averages of each target class
    label_avgs = []
    for label in labels:
        label_avgs.append(len(df_train.loc[df_train[target] == label])/n)
    
    # Get target dummies and list of new target dummy variable columns names
    df_train1 = pd.get_dummies(df_train, columns=[target])
    targets = list(df_train1.columns[-num_target_classes:])
    
    for col in features:
        print('Encoding', col, end='... ')
        
        # Get list of feature classes and list of their class counts
        unique_c = list(df_train1.groupby(by=col).mean().index)
        counts = list(df_train1.groupby(by=col).count()[targets[0]])

        # Get list of each target class' mean for the feature's unique classes
        target_means = []
        for target in targets:
            target_means.append(df_train1.groupby(by=col).mean()[target])
        
        # Initialize training testing columns for feature's target encoded columns
        for label in labels:
            df_train[col+'_'+label] = 0
            df_test[col+'_'+label] = 0
        
        # For each unique class of the feature, calculate smoothed target mean (one for each of the 3 target classes)        
        for i, c in enumerate(unique_c):
            class_prob = []
            if counts[i] <= 10:
                for t_i, label in enumerate(labels):
                    class_prob.append((counts[i]-1)/10 * target_means[t_i][i] + (11-counts[i])/10 * label_avgs[t_i])
            else:
                for t_i, label in enumerate(labels):
                    class_prob.append(target_means[t_i][i])
            
            # Assign (smoothed) probability value to each to each instance of the corresponding feature value of df_train
            for t_i, label in enumerate(labels):
                df_train.loc[df_train[col]==c,col+'_'+label] = class_prob[t_i]
            
            # Assign (smoothed) probability value to each to each instance of the corresponding feature value of df_test
            if c in df_test[col].unique():
                for l_i, label in enumerate(labels):
                    df_test.loc[df_test[col]==c,col+'_'+label] = class_prob[l_i]
        print('complete.')
    
    if remove_features:
        df_train.drop(columns=features, inplace=True)
        df_test.drop(columns=features, inplace=True)
    return df_train, df_test

In [16]:
X_train_new, X_test_new = target_encode_columns_new(X_train_two_class, X_test, ['lga'], 'status_group', remove_features=False)

Encoding lga... 326 0.3374233128834356 0.41294459230583114
complete.


In [29]:
X_train3, X_test3 = target_encode_columns_new(X_train, X_test,['lga'], 'management_group')

Encoding lga... complete.


In [30]:
X_train3

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,lga_commercial,lga_other,lga_parastatal,lga_unknown,lga_user-group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34470,0.0,2011-07-29,Hesawa,0,DWE,33.037573,-2.503828,Kwa Mbisu,0,Lake Victoria,...,shallow well,groundwater,other,other,non functional,0.001572,0.004717,0.011006,0.000000,0.982704
55171,200.0,2013-01-20,Rvemp,1155,DWE,33.378965,-2.154466,Kwachisaku Mwndu,0,Lake Victoria,...,shallow well,groundwater,hand pump,hand pump,non functional,0.015244,0.000000,0.210366,0.006098,0.768293
72157,0.0,2011-04-09,Government Of Tanzania,0,Central Government,33.429493,-9.026036,Church Of God,0,Lake Rukwa,...,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,0.000000,0.000000,0.002890,0.000000,0.997110
53472,0.0,2011-03-12,Amref,13,AMREF,39.213282,-7.211929,Kwa Mbwela,0,Wami / Ruvu,...,shallow well,groundwater,hand pump,hand pump,functional,0.122850,0.000000,0.004914,0.000000,0.872236
14717,0.0,2011-03-02,,-37,,39.655339,-7.916720,Kwa Kasimu,0,Rufiji,...,borehole,groundwater,communal standpipe,communal standpipe,functional,0.000000,0.000000,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19959,0.0,2012-10-10,Dwsp,0,DWE,33.469236,-3.607610,Shuleni,0,Internal,...,rainwater harvesting,surface,communal standpipe,communal standpipe,functional needs repair,0.090909,0.000000,0.240260,0.019481,0.649351
31378,0.0,2013-01-31,Finw,279,FinW,39.587734,-10.682716,Shuleni,0,Ruvuma / Southern Coast,...,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional,0.000000,0.000000,0.000000,0.000000,1.000000
73223,0.0,2011-03-04,Rc Ch,1736,RC Ch,34.875985,-9.591650,none,0,Lake Nyasa,...,spring,groundwater,communal standpipe,communal standpipe,functional,0.006900,0.000000,0.003185,0.005308,0.984607
49904,0.0,2013-03-20,World Vision,1322,Community,36.797282,-3.355298,Nikodemo Risa,0,Pangani,...,spring,groundwater,communal standpipe,communal standpipe,non functional,0.015852,0.000000,0.000000,0.017173,0.966975


In [28]:
X_train.management_group.value_counts()

user-group    39371
commercial     2728
parastatal     1313
other           715
unknown         423
Name: management_group, dtype: int64

In [17]:
def target_encode_columns(train, test, features, target, remove_features=True):
    '''
    Target encode the features of the training set and its associated testing set. This function considers a ternary target
    only.
    
    Parameters
    ----------
    train: dataframe, the training data which includes the target feature
    test: dataframe, the testing data which does not include the target feature
    cols: list, columns to target encode
    target: string, name of the target feature
    
    Returns
    ---------
    DataFrame
        Training dataframe with new target encoded feature columns, one column for each target class
        Testing dataframe with new target encoded feature columns, one column for each target class
    '''
    
    df_train = train.copy()
    df_test = test.copy()
    
    n = len(df_train)
    labels = sorted(list(df_train[target].unique()))

    # Get dataset averages of each target class
    label1_avg = len(df_train.loc[df_train[target] == labels[0]])/n
    label2_avg = len(df_train.loc[df_train[target] == labels[1]])/n
    label3_avg = len(df_train.loc[df_train[target] == labels[2]])/n
    
    # Get target dummies and list of new target dummy variable columns names
    df_train1 = pd.get_dummies(df_train, columns=[target])
    targets = list(df_train1.columns[-3:])
    
    for col in features:
        print('Encoding', col, end='... ')
        
        # Get list of feature classes and list of their class counts
        unique_c = list(df_train1.groupby(by=col).mean().index)
        counts = list(df_train1.groupby(by=col).count()[targets[0]])

        # Get list of each target class' mean for the feature's unique classes
        target0_means = list(df_train1.groupby(by=col).mean()[targets[0]])
        target1_means = list(df_train1.groupby(by=col).mean()[targets[1]])
        target2_means = list(df_train1.groupby(by=col).mean()[targets[2]])
      
        # Initialize new training and testing columns for the feature's target encoded columns
        df_train[col+'_functional'], df_train[col+'_repair'], df_train[col+'_nonfunctional'] = 0, 0, 0
        df_test[col+'_functional'], df_test[col+'_repair'], df_test[col+'_nonfunctional'] = 0, 0, 0
        
        # For each unique class of the feature, calculate smoothed target mean (one for each of the 3 target classes)        
        for i, c in enumerate(unique_c):
            if counts[i] <= 10:
                target0_smoothed = (counts[i]-1)/10 * target0_means[i] + (11-counts[i])/10 * label1_avg
                target1_smoothed = (counts[i]-1)/10 * target1_means[i] + (11-counts[i])/10 * label2_avg
                target2_smoothed = (counts[i]-1)/10 * target2_means[i] + (11-counts[i])/10 * label3_avg
            else:
                target0_smoothed = target0_means[i]
                target1_smoothed = target1_means[i]
                target2_smoothed = target2_means[i]
                if c == 'Mbeya Rural':
                    print(counts[i],target0_means[i],label1_avg)
            
            # Assign that value to each to each instance which has the corresponding feature value
            df_train.loc[df_train1[col]==c,col+'_functional'] = target0_smoothed
            df_train.loc[df_train1[col]==c,col+'_repair'] = target1_smoothed
            df_train.loc[df_train1[col]==c,col+'_nonfunctional'] = target2_smoothed
            
            
            if c in df_test[col].unique():
                df_test.loc[df_test[col]==c, col+'_functional'] = target0_smoothed
                df_test.loc[df_test[col]==c, col+'_repair'] = target1_smoothed
                df_test.loc[df_test[col]==c, col+'_nonfunctional'] = target2_smoothed

        print('complete.')
    if remove_features:
        df_train.drop(columns=features, inplace=True)
        df_test.drop(columns=features, inplace=True)
    return df_train, df_test, unique_c

In [18]:
X_train_old, X_test_old, classes = target_encode_columns(X_train, X_test, ['lga'], 'status_group', remove_features=False)

Encoding lga... 346 0.6242774566473989 0.5446240179573513
complete.


In [16]:
false_indexes = X_train2.loc[X_train2.status_2 ==False].index
df_missmatched = pd.concat([X_train2.loc[false_indexes,'lga_functional'], X_train4.loc[false_indexes,'lga_functional']],axis=1)
df_missmatched

Unnamed: 0_level_0,lga_functional,lga_functional
id,Unnamed: 1_level_1,Unnamed: 2_level_1
72157,0.624277,0.662577
53472,0.476658,0.480198
14717,0.712871,0.791209
42001,0.710638,0.724512
37990,0.700599,0.709091
...,...,...
45212,0.739677,0.750455
68461,0.570020,0.659817
49598,0.689076,0.735426
22544,0.644068,0.678571


In [28]:
X_train2.loc[X_train2.lga=='Mbeya Rural']

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,lga_functional,lga_functional needs repair,lga_non functional,status_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
72157,0.0,2011-04-09,Government Of Tanzania,0,Central Government,33.429493,-9.026036,Church Of God,0,Lake Rukwa,...,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,0.624277,0.057803,0.317919,False
67313,0.0,2011-04-03,Government Of Tanzania,0,Central Government,33.638365,-8.856877,Kwa Muislam,0,Rufiji,...,river,river/lake,surface,communal standpipe,communal standpipe,non functional,0.624277,0.057803,0.317919,
12343,0.0,2011-03-26,Danida,0,DANIDA,33.260579,-8.958039,Kwa Mchungaji,0,Lake Rukwa,...,river,river/lake,surface,communal standpipe,communal standpipe,non functional,0.624277,0.057803,0.317919,
73125,0.0,2011-03-31,Village Council,0,Village Council,33.274530,-8.853075,Kiliope,0,Lake Rukwa,...,river,river/lake,surface,communal standpipe,communal standpipe,functional,0.624277,0.057803,0.317919,False
27969,0.0,2011-03-28,Danida,0,DANIDA,33.325134,-8.932825,Ezekiel,0,Lake Rukwa,...,river,river/lake,surface,communal standpipe,communal standpipe,non functional,0.624277,0.057803,0.317919,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71009,0.0,2011-04-13,Rc Church,0,RC Church,33.664295,-9.007891,Kiela Simaya,0,Lake Nyasa,...,river,river/lake,surface,communal standpipe,communal standpipe,functional needs repair,0.624277,0.057803,0.317919,
27642,0.0,2011-04-12,Rc Church,0,RC Church,33.667766,-9.002189,Kwa Sisa,0,Lake Nyasa,...,river,river/lake,surface,communal standpipe,communal standpipe,functional,0.624277,0.057803,0.317919,False
12161,0.0,2011-04-05,Government Of Tanzania,0,Central Government,33.660561,-8.996184,Mwangoka,0,Lake Nyasa,...,river,river/lake,surface,communal standpipe,communal standpipe,non functional,0.624277,0.057803,0.317919,
4799,0.0,2011-03-30,Danida,0,DANIDA,33.359746,-8.873739,Kanisani Carvali Tabarneko,0,Lake Rukwa,...,river,river/lake,surface,communal standpipe,communal standpipe,functional,0.624277,0.057803,0.317919,False


In [None]:
X_train4.loc