In [1]:
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import seaborn as sns

from scipy.stats import randint

import category_encoders as ce
from category_encoders.target_encoder import TargetEncoder

from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import eli5
from eli5.sklearn import PermutationImportance

# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option("display.max_columns", 500)

In [2]:
# Read the four competition CSVs (paths are relative to the notebook's directory).
data = pd.read_csv('train_features.csv')
feature_info = pd.read_csv('train_labels.csv')   # training labels, despite the name
X_val = pd.read_csv('test_features.csv')
sample_submi = pd.read_csv('sample_submission.csv')

# Drop `id` from the labels, then attach what remains to the features
# column-wise. The concat aligns on the row index, not on `id`.
feature_info = feature_info.drop(columns=['id'])
df = pd.concat([data, feature_info], axis=1, sort=False)

print(df.shape, '\n')
df.head()

(59400, 41) 



Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [3]:
def null_wrangle(X):
    """Normalise missing-value markers and impute numeric gaps by location.

    Works on a copy of ``X`` and applies, in order:

    1. Every ``0`` in a numeric column, and the latitude value ``-2e-08``,
       is replaced by NaN.
    2. ``lon_bin`` / ``lat_bin`` are added: longitude / latitude rounded to
       two decimals.
    3. Remaining numeric NaNs are filled with the mean of the row's group,
       grouping first by ``lon_bin``, then by ``lat_bin``, then by
       ``region_code``. Later passes only fill what earlier ones left empty.
    4. String values in non-numeric columns are lower-cased.
    5. ``quantity`` is mapped to an ordinal code (enough=4 ... unknown=0).
    6. A fixed list of placeholder strings ('unknown', 'none', '-', ...) is
       replaced by NaN.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``longitude``, ``latitude``, ``region_code`` and
        ``quantity``.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``X`` with the extra ``lon_bin`` / ``lat_bin`` columns.
    """
    Xn = X.copy()

    # Step 1: zeros (and the latitude placeholder) stand for "missing".
    numeric_cols = Xn.select_dtypes('number').columns
    Xn[numeric_cols] = Xn[numeric_cols].replace(0, np.nan)
    Xn['latitude'] = Xn['latitude'].replace(-2.000000e-08, np.nan)

    # Step 2: coarse location bins used as imputation groups.
    Xn['lon_bin'] = Xn['longitude'].round(2)
    Xn['lat_bin'] = Xn['latitude'].round(2)

    # Re-select so the two new bin columns are imputed as well.
    numeric_cols = Xn.select_dtypes('number').columns

    # Step 3: the groupby is rebuilt for every feature on purpose, so each
    # fill sees the values written by the previous ones.
    for group_key in ('lon_bin', 'lat_bin', 'region_code'):
        for feature in numeric_cols:
            group_means = Xn.groupby(group_key)[feature].transform('mean')
            Xn[feature] = Xn[feature].fillna(group_means)

    # Step 4: lower-case only real strings. The `.str` accessor is not safe
    # here: non-numeric columns also include True/False flags, which it
    # either rejects or turns into NaN.
    for feature in Xn.select_dtypes(exclude='number').columns:
        Xn[feature] = Xn[feature].map(
            lambda value: value.lower() if isinstance(value, str) else value
        )

    # Step 5: runs after step 1, so the numeric 0 assigned to 'unknown' is kept.
    Xn['quantity'] = Xn['quantity'].replace({'enough': 4, 'insufficient': 3,
                                             'seasonal': 2, 'dry': 1, 'unknown': 0})

    # Step 6: these are strings, so the numeric codes above are not affected.
    nan_list = ['0', 'not known', 'unknown', 'none', '-', '##', 'not kno', 'unknown installer']
    Xn = Xn.replace(nan_list, np.nan)

    return Xn

In [4]:
def date_wrangle(X):
    """Derive year-based features from ``date_recorded`` and drop the raw date.

    Adds ``year_recorded`` (the year of ``date_recorded`` as an integer) and
    ``years_in_operation`` (``year_recorded - construction_year``), then
    removes ``date_recorded``.

    The dates are read from ``X`` itself. Reading them from the global
    training frame would give any other frame (such as the test features)
    the training dates of whichever rows share its index labels.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``date_recorded`` and ``construction_year``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``X`` is left unmodified.
    """
    Xn = X.copy()

    # `infer_datetime_format` is deprecated in pandas and no longer needed.
    recorded = pd.to_datetime(Xn['date_recorded'])

    Xn['year_recorded'] = recorded.dt.year.astype(int)
    Xn['years_in_operation'] = Xn['year_recorded'] - Xn['construction_year']

    return Xn.drop(columns=['date_recorded'])

In [5]:
def drop_features(X):
    """Return ``X`` without the columns excluded from modelling.

    The list includes the ``lat_bin`` / ``lon_bin`` helper columns. Every
    listed column must be present, otherwise ``DataFrame.drop`` raises
    ``KeyError``.
    """
    unwanted = [
        'num_private', 'latitude', 'longitude', 'public_meeting',
        'scheme_name', 'scheme_management', 'permit', 'payment',
        'payment_type', 'subvillage', 'wpt_name', 'lat_bin', 'lon_bin',
    ]
    return X.drop(columns=unwanted)

In [6]:
def low_occurance_wrangle(X, cardinality_threshold=20, min_count=5):
    """Blank out rare categories in high-cardinality non-numeric columns.

    For every non-numeric column with more than ``cardinality_threshold``
    distinct values, each value that occurs fewer than ``min_count`` times
    in that column is replaced by NaN. Other columns are left as they are.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to clean.
    cardinality_threshold : int, default 20
        A column is processed only if it has more distinct values than this.
    min_count : int, default 5
        Values occurring fewer times than this are set to NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``X`` is left unmodified.
    """
    # Work on a copy so the caller's frame is never changed in place.
    Xn = X.copy()

    for feature in Xn.select_dtypes(exclude='number').columns:
        if Xn[feature].nunique() <= cardinality_threshold:
            continue

        # Per-row frequency of that row's own category. Existing NaNs map to
        # NaN, compare False below, and therefore stay NaN.
        occurrences = Xn[feature].map(Xn[feature].value_counts())
        Xn[feature] = Xn[feature].mask(occurrences < min_count, np.nan)

    return Xn

In [7]:
def dropping_nulls(X):
    """Return ``X`` restricted to the rows that contain no missing value."""
    complete_rows = X.dropna(axis=0, how='any')
    return complete_rows

In [8]:
def ord_encode(X):
    """Ordinal-encode ``X`` with a freshly fitted ``ce.OrdinalEncoder``.

    The encoder is fitted on ``X`` and applied to it in one step. The result
    is returned as a DataFrame carrying the original column labels of ``X``.
    """
    original_columns = X.columns

    encoded = ce.OrdinalEncoder().fit_transform(X)

    return pd.DataFrame(encoded, columns=original_columns)

In [9]:
from category_encoders.target_encoder import TargetEncoder
def loo_encode_val(X, y, val):
    """
    Encode the categorical columns of ``val`` with encoders fitted on ``X``/``y``.

    The target has three classes, so one ``ce.LeaveOneOutEncoder`` is fitted
    per class against a binary "is this class" version of ``y``. Each encoder
    is then applied to ``val``, and the three sets of encoded columns are
    concatenated.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features. Its non-numeric columns define which columns
        are encoded.

    y : iterable of str
        Training labels, one per row of ``X``. Expected values are
        'functional', 'non functional' and 'functional needs repair'.

    val : pandas.DataFrame
        Frame to transform. It must contain the same non-numeric columns
        as ``X``.

    Returns
    ----------
    val : pandas.DataFrame
        ``val`` with each encoded column replaced by three numeric columns
        named ``<col>_te1``, ``<col>_te2`` and ``<col>_te3`` (works, broken,
        needs repair).

    """
    nominal_features = X.select_dtypes(exclude='number').columns

    # Fit on a private copy so the caller's training frame is never touched
    train_cats = X.copy()[nominal_features]

    # The position in this tuple (1-based) becomes the column-name suffix
    status_labels = ('functional', 'non functional', 'functional needs repair')

    encoded_parts = []
    for suffix, label in enumerate(status_labels, start=1):
        # One-vs-rest target for the current class
        binary_target = [1.0 if status == label else 0.0 for status in y]

        # Learn the encoding from the training data only ...
        encoder = ce.LeaveOneOutEncoder()
        encoder.fit(X=train_cats, y=binary_target)

        # ... and apply it to the categorical columns of `val`
        part = encoder.transform(val[nominal_features])

        # Distinct names per target vector so the three sets do not collide
        part = part.rename(columns=lambda name: name + '_te' + str(suffix))
        encoded_parts.append(part)

    new_features = pd.concat(encoded_parts, sort=False, axis=1)

    remaining = val.drop(columns=nominal_features)
    return pd.concat([remaining, new_features], sort=False, axis=1)


def loo_encode_cats(X, y):
    """
    Leave-one-out encode the categorical columns of the training frame ``X``.

    The target has three classes, so one ``ce.LeaveOneOutEncoder`` is fitted
    per class against a binary "is this class" version of ``y``, and the
    three sets of encoded columns are concatenated.

    The training rows are encoded with ``fit_transform(X, y)``, i.e. with
    the target supplied. Calling ``transform(X)`` without ``y`` is the
    test-set path: it would encode each training row with a statistic that
    includes the row's own label, leaking the target into the features.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features. Its non-numeric columns are the ones encoded.

    y : iterable of str
        Training labels, one per row of ``X`` and in the same row order.
        Expected values are 'functional', 'non functional' and
        'functional needs repair'.

    Returns
    ----------
    Xn : pandas.DataFrame
        Copy of ``X`` with each encoded column replaced by three numeric
        columns named ``<col>_te1``, ``<col>_te2`` and ``<col>_te3`` (works,
        broken, needs repair).

    """
    nominal_features = X.select_dtypes(exclude='number').columns

    # Make a clean copy, to ensure we're not changing the original DF
    Xn = X.copy()

    # The position in this tuple (1-based) becomes the column-name suffix
    status_labels = ('functional', 'non functional', 'functional needs repair')

    encoded_parts = []
    for suffix, label in enumerate(status_labels, start=1):
        # One-vs-rest target, explicitly aligned to the rows of Xn. The index
        # of Xn may have gaps after rows were dropped upstream.
        binary_target = pd.Series(
            [1.0 if status == label else 0.0 for status in y],
            index=Xn.index,
        )

        # Passing the target makes this a true leave-one-out encoding
        encoder = ce.LeaveOneOutEncoder()
        part = encoder.fit_transform(Xn[nominal_features], binary_target)

        # Distinct names per target vector so the three sets do not collide
        part = part.rename(columns=lambda name: name + '_te' + str(suffix))
        encoded_parts.append(part)

    new_features = pd.concat(encoded_parts, sort=False, axis=1)

    Xn = Xn.drop(columns=nominal_features)
    Xn = pd.concat([Xn, new_features], sort=False, axis=1)

    return Xn

In [10]:
def wrangle(X):
    """Run the full cleaning pipeline on a copy of ``X`` and return the result.

    Steps, in order: ``null_wrangle``, ``date_wrangle``, ``drop_features``,
    ``low_occurance_wrangle``, ``dropping_nulls``. The last step removes
    every row that still has a missing value, so the result may have fewer
    rows than ``X``.
    """
    pipeline = (
        null_wrangle,
        date_wrangle,
        drop_features,
        low_occurance_wrangle,
        dropping_nulls,
    )

    Xn = X.copy()
    for step in pipeline:
        Xn = step(Xn)

    return Xn

In [11]:
# Clean the labelled training frame and the unlabelled test features the same way.
# NOTE: wrangle() ends with a dropna, so rows are removed from X_val as well.
# NOTE: X_val is overwritten here; reload the data before re-running this cell.
X = wrangle(df)
X_val = wrangle(X_val)

# Split the target off the training frame.
y = X['status_group']
X = X.drop(columns=['status_group'])

# Encode X_val first: it needs the still un-encoded training X to fit on.
X_val = loo_encode_val(X, y, X_val)
X = loo_encode_cats(X, y)

In [13]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((39978, 70), (9995, 70), (39978,), (9995,))

In [None]:
# Search space: integers drawn from scipy's randint(low, high)
param_distributions = dict(
    n_estimators=randint(50, 300),
    max_depth=randint(1, 5),
)

# 10 random candidates x 3-fold CV, scored on accuracy
Gradient_Boost_TE = RandomizedSearchCV(
    XGBClassifier(random_state=42, n_jobs=-1),
    param_distributions,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=10,
    return_train_score=True,
    random_state=42,
)

# The search is fitted on the full X / y, not on the X_train split
Gradient_Boost_TE.fit(X, y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   58.5s


In [None]:
# Keep the best estimator found by the search for the later cells
best_GB_TE = Gradient_Boost_TE.best_estimator_

# Best cross-validated accuracy score found by the search
Gradient_Boost_TE.best_score_

In [None]:
# Predict on the encoded test features and show the predicted class shares
y_pred = best_GB_TE.predict(X_val)
pd.DataFrame(y_pred)[0].value_counts(normalize=True)

In [None]:
# Class shares in the training labels, for comparison with the predictions
y.value_counts(normalize=True)

In [None]:
# Horizontal bar chart of the model's feature importances, sorted ascending
figsize = (5, 10)
importances = pd.Series(data=best_GB_TE.feature_importances_, index=X.columns)
importances.sort_values().plot.barh(color='gray', figsize=figsize);