In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate
from sklearn.preprocessing import PowerTransformer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

import xgboost as xgb

%matplotlib inline

In [2]:
# load data
X_train = pd.read_csv('train_features.csv')
y_train = pd.read_csv('train_labels.csv')
X_test = pd.read_csv('test_features.csv')
y_test = pd.read_csv('submission_format.csv')

# merge features and labels on train set
train = X_train.copy()
train = train.merge(y_train, how = 'left', on = 'id')

In [3]:
# column to always drop
columns_to_drop = [
    'id',
    'subvillage',
    'region_code',
    'district_code',
    'wpt_name',
    'recorded_by',
    'scheme_name',
    'management_group',
    'payment',
    'extraction_type_group',
    'extraction_type_class',
    'waterpoint_type_group',
    'quality_group',
    'quantity_group',
    'source_type',
    'source_class',
    'num_private', 
    'date_recorded',
    'scheme_management',
    'ward'
]

In [4]:
# drop columns
X_train.drop(columns_to_drop, axis = 1, inplace = True)
X_test.drop(columns_to_drop, axis = 1, inplace = True)

In [5]:
# show remaining columns
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 20 columns):
amount_tsh           59400 non-null float64
funder               55765 non-null object
gps_height           59400 non-null int64
installer            55745 non-null object
longitude            59400 non-null float64
latitude             59400 non-null float64
basin                59400 non-null object
region               59400 non-null object
lga                  59400 non-null object
population           59400 non-null int64
public_meeting       56066 non-null object
permit               56344 non-null object
construction_year    59400 non-null int64
extraction_type      59400 non-null object
management           59400 non-null object
payment_type         59400 non-null object
water_quality        59400 non-null object
quantity             59400 non-null object
source               59400 non-null object
waterpoint_type      59400 non-null object
dtypes: float64(3), int64(3), o

## Data Cleaning

In [6]:
# create a column storing the info whether construction year was recorded or not
X_train['construction_year_recorded'] = np.where(X_train.construction_year == 0, False, True)
X_test['construction_year_recorded'] = np.where(X_test.construction_year == 0, False, True)

# replace construction_year == 0 with the mean construction year
mean_construction_year = round(X_train.loc[X_train.construction_year != 0, 'construction_year'].mean(), 0)
X_train.loc[X_train.construction_year == 0, 'construction_year'] = mean_construction_year
X_test.loc[X_test.construction_year == 0, 'construction_year'] = mean_construction_year

In [7]:
# create a column storing the info whether longitude/latitude was recorded or not
X_train['longitude_recorded'] = np.where(abs(X_train.longitude) < 0.1, False, True)
X_train['latitude_recorded'] = np.where(abs(X_train.latitude) < 0.1, False, True)

X_test['longitude_recorded'] = np.where(X_test.longitude < 0.1, False, True)
X_test['latitude_recorded'] = np.where(X_test.latitude < 0.1, False, True)

# calculate the mean longitude/latitude for each region
mean_longitude = [X_train.loc[X_train.region == region,'longitude'].mean() for region in X_train.region.unique()]
mean_latitude = [X_train.loc[X_train.region == region,'latitude'].mean() for region in X_train.region.unique()]

mean_location = pd.DataFrame(data = {'mean_longitude' : mean_longitude,
                                     'mean_latitude' : mean_latitude},
                             index = X_train.region.unique())

# replace longitudes/latitudes close to 0 with the mean longitude/latitude for the region
for i in range(0, X_train.shape[0]):
    
    # replace longitudes around 0 with the mean for the respective region of the observation
    if abs(X_train.loc[i, 'longitude']) < 0.1:
        X_train.loc[i, 'longitude'] = mean_location.loc[X_train.loc[i, 'region'], 'mean_longitude']
        
    # do the same for the latitude
    if abs(X_train.loc[i, 'latitude']) < 0.1:
        X_train.loc[i, 'latitude'] = mean_location.loc[X_train.loc[i, 'region'], 'mean_latitude']

# same for test set
for i in range(0, X_test.shape[0]):
    
    # replace longitudes around 0 with the mean for the respective region of the observation
    if abs(X_test.loc[i, 'longitude']) < 0.1:
        X_test.loc[i, 'longitude'] = mean_location.loc[X_test.loc[i, 'region'], 'mean_longitude']
        
    # do the same for the latitude
    if abs(X_test.loc[i, 'latitude']) < 0.1:
        X_test.loc[i, 'latitude'] = mean_location.loc[X_test.loc[i, 'region'], 'mean_latitude']

In [8]:
# replace missing values in public_meeting with the majority category (True)
X_train.loc[X_train.public_meeting.isna(), 'public_meeting'] = True
X_test.loc[X_test.public_meeting.isna(), 'public_meeting'] = True

# replace missing values in permit with the majority category (True)
X_train.loc[X_train.permit.isna(), 'permit'] = True
X_test.loc[X_test.permit.isna(), 'permit'] = True

## Encode Target

In [29]:
# create a mapping for the multinomial classes
multinomial_classes = {
    'functional' : 0,
    'non functional' : 1,
    'functional needs repair' : 2
}

# create the inverse mapping
classes_inv = {v: k for k, v in multinomial_classes.items()}

# map the target to numerical
y_train_multinomial = y_train.status_group.map(multinomial_classes).copy()

# create binary classes for functional versus non-function
y_train_binary = np.where(y_train_multinomial == 1, 1, 0)

## Feature Creation

In [10]:
# create series of feature 'lga'
lgas_train = X_train.lga.copy()
lgas_test = X_test.lga.copy()

# create value count for lga
lga_counts = lgas_train.value_counts()

# create a mask filtering value counts lower than 200
mask_train = lgas_train.isin(lga_counts[lga_counts > 200].index)
mask_test = lgas_test.isin(lga_counts[lga_counts > 200].index)

# replace values
lgas_train[~mask_train] = 'Other'
lgas_test[~mask_test] = 'Other'

# dummy encode
lgas_train_dummy = pd.get_dummies(lgas_train)
lgas_test_dummy = pd.get_dummies(lgas_test)

# create and fit naive bayes model
nb_lga = GaussianNB()
nb_lga.fit(lgas_train_dummy, y_train_binary)

# create predictions and add them as new feature
X_train['lga'] = nb_lga.predict_proba(lgas_train_dummy)[:,1]
X_test['lga'] = nb_lga.predict_proba(lgas_test_dummy)[:,1]

In [11]:
# create series of feature 'installer'
installers_train = X_train.installer.copy()
installers_test = X_test.installer.copy()

# create value count for installer
installer_counts = installers_train.value_counts()

# create a mask filtering value counts lower than 100
mask_train = installers_train.isin(installer_counts[installer_counts > 100].index)
mask_test = installers_test.isin(installer_counts[installer_counts > 100].index)

# replace values
installers_train[~mask_train] = 'Other'
installers_test[~mask_test] = 'Other'

# dummy encode
installers_train_dummy = pd.get_dummies(installers_train)
installers_test_dummy = pd.get_dummies(installers_test)

# create and fit naive bayes model
nb_installer = GaussianNB()
nb_installer.fit(installers_train_dummy, y_train_binary)

# create predictions and add them as new feature
X_train['installer'] = nb_installer.predict_proba(installers_train_dummy)[:,1]
X_test['installer'] = nb_installer.predict_proba(installers_test_dummy)[:,1]

In [12]:
# create series of feature 'funder'
funders_train = X_train.funder.copy()
funders_test = X_test.installer.copy()

# create value count for funder
funder_counts = funders_train.value_counts()

# create a mask filtering value counts lower than 200
mask_train = funders_train.isin(funder_counts[funder_counts > 50].index)
mask_test = funders_test.isin(funder_counts[funder_counts > 50].index)

# replace values
funders_train[~mask_train] = 'Other'
funders_test[~mask_test] = 'Other'

# dummy encode
funders_train_dummy = pd.get_dummies(funders_train)
funders_test_dummy = pd.get_dummies(funders_test)

# create and fit naive bayes model
nb_funders = GaussianNB()
nb_funders.fit(funders_train_dummy, y_train_binary)

# create predictions and add them as new feature
X_train['funder'] = nb_installer.predict_proba(installers_train_dummy)[:,1]
X_test['funder'] = nb_installer.predict_proba(installers_test_dummy)[:,1]

## One-hot Encoding and Scaling

In [13]:
# one-hot encoding
X_train = pd.get_dummies(X_train, 
                         prefix = X_train.select_dtypes('object').columns, 
                         columns = X_train.select_dtypes('object').columns,
                         drop_first = False
                        )

X_test = pd.get_dummies(X_test, 
                         prefix = X_test.select_dtypes('object').columns, 
                         columns = X_test.select_dtypes('object').columns,
                         drop_first = False
                        )

# add columns to test set that only exist in train set
X_test[list(set(X_train.columns).difference(set(X_test.columns)))[0]] = 0

# make sure columns are in the same order
X_train = X_train[sorted(X_train.columns)].copy()
X_test = X_test[sorted(X_test.columns)].copy()

# convert boolean into numerical
for column in X_train.select_dtypes('bool').columns:
    X_train[column] = X_train[column].astype(int)
    X_test[column] = X_test[column].astype(int)

## Linear Discriminant Analysis

In [24]:
# create and fit an LDA model
lda = LinearDiscriminantAnalysis(solver = 'svd')
lda.fit(X_train, y_train_multinomial)

# transform X_train and X_test
lda_train = lda.transform(X_train)
lda_test = lda.transform(X_test)

In [25]:
# add linear discriminants to the train and test set
X_train['LD1'] = lda_train[:,0]
X_train['LD2'] = lda_train[:,1]
X_test['LD1'] = lda_test[:,0]
X_test['LD2'] = lda_test[:,1]

In [26]:
# create and fit a kNN model
knn = KNeighborsClassifier(n_neighbors = 21)
knn.fit(X_train[['LD1', 'LD2']], y_train_multinomial)

# predict probabilities
X_train['knn_proba_0'] = knn.predict_proba(X_train[['LD1', 'LD2']])[:,0]
X_train['knn_proba_1'] = knn.predict_proba(X_train[['LD1', 'LD2']])[:,1]
X_train['knn_proba_2'] = knn.predict_proba(X_train[['LD1', 'LD2']])[:,2]

X_test['knn_proba_0'] = knn.predict_proba(X_test[['LD1', 'LD2']])[:,0]
X_test['knn_proba_1'] = knn.predict_proba(X_test[['LD1', 'LD2']])[:,1]
X_test['knn_proba_2'] = knn.predict_proba(X_test[['LD1', 'LD2']])[:,2]

## Modelling

In [17]:
# define parameter grid
params = {
    'max_depth' : [10, 12, 14],
    'min_child_weight' : [3, 5, 7],
    'subsample' : [0.6, 0.7, 0.8] 
}

# create xgboost model
xgb_model = xgb.XGBClassifier(objective = 'multi:softmax',
                              learning_rate = 0.1,
                              num_classes = 3,
                              gamma = 0.001,
                              colsample_bytree = 0.6,
                              colsample_bylevel = 0.6,
                              colsample_bynode = 0.6,  
                              seed = 27)

# create grid search object
grid_xgb = GridSearchCV(estimator = xgb_model, 
                       param_grid = params, 
                       scoring='accuracy',
                       n_jobs=-1,
                       cv=5,
                       refit = True,
                       return_train_score = True,
                       verbose = 1)

# fit the model
grid_xgb.fit(X_train, y_train_multinomial)

# read results of grid search into dataframe
cv_results_df = pd.DataFrame(grid_xgb.cv_results_)

# print results
cv_results_df[['params', 'mean_train_score', 'mean_test_score']].sort_values(by = ['mean_test_score'], ascending = False)

Unnamed: 0,params,mean_train_score,mean_test_score
20,"{'max_depth': 14, 'min_child_weight': 3, 'subs...",0.913102,0.805825
23,"{'max_depth': 14, 'min_child_weight': 5, 'subs...",0.898005,0.80532
21,"{'max_depth': 14, 'min_child_weight': 5, 'subs...",0.887327,0.805269
11,"{'max_depth': 12, 'min_child_weight': 3, 'subs...",0.887201,0.805135
14,"{'max_depth': 12, 'min_child_weight': 5, 'subs...",0.875484,0.805067
10,"{'max_depth': 12, 'min_child_weight': 3, 'subs...",0.883392,0.804764
26,"{'max_depth': 14, 'min_child_weight': 7, 'subs...",0.887189,0.804529
19,"{'max_depth': 14, 'min_child_weight': 3, 'subs...",0.908603,0.804461
22,"{'max_depth': 14, 'min_child_weight': 5, 'subs...",0.893173,0.804394
17,"{'max_depth': 12, 'min_child_weight': 7, 'subs...",0.867689,0.804209


In [30]:
# make a prediction on test set
y_pred = grid_xgb.best_estimator_.predict(X_test)

# map back to string classes
y_pred = pd.Series(y_pred).map(classes_inv)

# create submission data frame
y_test.loc[:,'status_group'] = y_pred

# write to csv
y_test.to_csv('submission8.csv', index = False)

In [32]:
y_test.status_group.value_counts()

functional                 8487
non functional             5525
functional needs repair     838
Name: status_group, dtype: int64