# 1. Load libraries and datasets.

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OrdinalEncoder

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [3]:
train = pd.read_csv('inputs/Train.csv')
test = pd.read_csv('inputs/Test.csv')
ss = pd.read_csv('inputs/SampleSubmission.csv')
variable_data = pd.read_csv('inputs/variable_descriptions.csv')

In [4]:
train.head()

Unnamed: 0,ward,total_households,total_individuals,target,dw_00,dw_01,dw_02,dw_03,dw_04,dw_05,...,pw_03,pw_04,pw_05,pw_06,pw_07,pw_08,ADM4_PCODE,lat,lon,NL
0,41601001: Ward 1,1674.45058,5888.2075,16.773757,0.933841,0.000846,0.00549,0.000676,0.0,0.001372,...,0.002848,0.007537,0.0,0.012928,0,0,ZA4161001,-29.68227,24.734743,0.292039
1,41601002: Ward 2,1736.9923,6735.33812,21.496661,0.69694,0.001253,0.004402,0.0,0.002301,0.001323,...,0.014566,0.057127,0.019092,0.004131,0,0,ZA4161002,-29.119311,24.757737,3.207775
2,41601003: Ward 3,2403.57591,7273.04995,10.931425,0.810545,0.004517,0.008891,0.003986,0.007735,0.000956,...,0.05756,0.010358,0.001421,0.040881,0,0,ZA4161003,-29.142276,25.094093,0.0
3,41601004: Ward 4,1740.78737,5734.49046,23.119257,0.659914,0.0,0.006129,0.0,0.000813,0.037245,...,0.0,0.000669,0.0,0.005011,0,0,ZA4161004,-29.372052,24.942867,2.038778
4,41601005: Ward 5,1730.51451,6657.23835,13.652252,0.950575,0.000655,0.001473,0.000598,0.006999,0.000818,...,0.004859,0.00129,0.000673,0.017629,0,0,ZA4161005,-29.409381,25.290165,0.0


In [5]:
variable_data

Unnamed: 0,Column,Description,Unnamed: 2,Unnamed: 3
0,dw_00,Percentage of dwellings of type: House or bric...,,
1,dw_01,Percentage of dwellings of type: Traditional d...,,
2,dw_02,Percentage of dwellings of type: Flat or apart...,,
3,dw_03,Percentage of dwellings of type: Cluster house...,,
4,dw_04,Percentage of dwellings of type: Townhouse (se...,,
5,dw_05,Percentage of dwellings of type: Semi-detached...,,
6,dw_06,Percentage of dwellings of type: House/flat/ro...,,
7,dw_07,Percentage of dwellings of type: Informal dwel...,in backyard),
8,dw_08,Percentage of dwellings of type: Informal dwel...,not in backyard,e.g. in an informal/squatter settlement or on...
9,dw_09,Percentage of dwellings of type: Room/flatlet ...,,


In [6]:
dw_cols = [cols for cols in train if cols.startswith('dw')]
psa_cols = [cols for cols in train if cols.startswith('psa')]
stv_cols = [cols for cols in train if cols.startswith('stv')]
car_cols = [cols for cols in train if cols.startswith('car')]
lln_cols = [cols for cols in train if cols.startswith('lln')]
lan_cols = [cols for cols in train if cols.startswith('lan')]
pg_cols = [cols for cols in train if cols.startswith('pg')]
pw_cols = [cols for cols in train if cols.startswith('pw')]

# 2. Exploratory Data Analysis.

In [None]:
train.plot(kind = 'scatter', x = 'lat', y = 'lon', alpha = 0.6, label = 'Percentage of women households with income under R19.6k out of total number of households.',
           c = 'target', cmap = plt.get_cmap('jet'), colorbar = True,
           figsize = (20, 8))

plt.title('Percentage of women households with income under R19.6k out of total number of households.')
plt.legend()

In [None]:
corr = train[dw_cols].corr()
sns.set_style('darkgrid')
plt.figure(figsize = (20, 7))
sns.heatmap(corr.abs(), annot = True)
plt.show()

In [None]:
corr = train[psa_cols].corr()
plt.figure(figsize = (20, 7))
sns.heatmap(corr, annot = True)
plt.show()

In [None]:
sns.pairplot(train[stv_cols])

In [None]:
sns.pairplot(train[car_cols])

In [None]:
sns.pairplot(train[lln_cols])

In [None]:
corr = train[lan_cols].corr()
plt.figure(figsize = (12, 8))
sns.heatmap(corr, cmap = 'bwr', annot = True)
plt.show()

In [None]:
sns.pairplot(train[pg_cols])

In [None]:
plt.figure(figsize = (12, 6))
sns.heatmap(train[pg_cols].corr(), annot = True)

In [None]:
corr = train[pw_cols].corr()
plt.figure(figsize = (12, 5))
sns.heatmap(corr.abs(), annot = True, cmap = 'Greens')
plt.show()

In [None]:
sns.pairplot(train[psa_cols])

In [None]:
corr = train[psa_cols].corr()
plt.figure(figsize = (12, 8))
sns.heatmap(corr)
plt.show()

In [None]:
train.plot(kind = 'scatter', x = 'lat', y = 'lon', alpha = 0.4, label = 'Night lights distribution.',
           c = 'NL', cmap = plt.get_cmap('jet'), colorbar = True,
           figsize = (20, 8))

plt.title('Night lights distribution.')
plt.legend()

# 3. Feature engineering.

In [7]:
# Merge columns with high correlation.
for dataset in (train, test):

    dataset['sum_of_stv_cols'] = dataset[stv_cols].sum(axis = 1)

    dataset['sum_of_car_cols'] = dataset[car_cols].sum(axis = 1)

    dataset['sum_of_lln_cols'] = dataset[lln_cols].sum(axis = 1)
    
    dataset['sum_of_dw_cols '] = dataset[dw_cols].sum(axis = 1)
    
    dataset['ratio_stv'] = dataset['stv_01'] / dataset['stv_00']
    
    dataset['ratio_car'] = dataset['car_01'] / dataset['car_00']
    
    dataset['ratio_lln'] = dataset['lln_01'] / dataset['lln_00']
    
    dataset['ratio_psa'] = dataset['psa_01'] / dataset['psa_00']

In [8]:
PATTERN = r'[\d\s]+:'

for dataset in (train, test):
    dataset['ward_cat'] = dataset['ward'].str.replace(PATTERN, '', regex=True)

In [9]:
#Implement data transformation steps.
categorical_cols = [cols for cols in train.columns if train[cols].dtype == 'object']
categorical_cols

['ward', 'ADM4_PCODE', 'ward_cat']

In [10]:
label_train = train.copy()
label_test = test.copy()

In [11]:
#Preprocessing categorical data.
encoder = OrdinalEncoder(handle_unknown= 'error')

In [12]:
#Encode categorical values.
label_train[categorical_cols] = encoder.fit_transform(label_train[categorical_cols])
label_test[categorical_cols] = encoder.fit_transform(label_test[categorical_cols])

# #Changing option to use infinite as nan.
# pd.set_option('mode.use_inf_as_na', True)
# df.dropna(inplace = True)

#Replace infinite or Nan values and drop Nan values.
label_train.replace([np.inf, -np.inf], np.nan, inplace = True)
label_train.dropna(inplace = True)

label_test.replace([np.inf, -np.inf], np.nan, inplace = True)
label_test.dropna(inplace = True)

In [13]:
cols_to_drop = ['ADM4_PCODE', 'ward']
#'stv_01', 'lln_01', 'car_01', 'total_households', 'psa_01', 'psa_02', 'psa_03', 'dw_03', 'lan_13', 'lan_06', 'psa_04', 'pw_07', 'pg_00'

In [14]:
label_train = label_train.drop(cols_to_drop, axis = 1)
label_test = label_test.drop(cols_to_drop, axis = 1 )

X = label_train.drop('target', axis = 1)
y = label_train.target
X.shape, y.shape

((2821, 69), (2821,))

In [17]:
#Create pipeline
rfe = RFE(estimator = RandomForestRegressor(), verbose = 1)
model = RandomForestRegressor()
pipeline = Pipeline(
    steps = [
        ('s', rfe), ('m', model)
    ])

In [18]:
cv = RepeatedKFold(n_splits = 3, n_repeats = 1, random_state = 42)
n_scores = cross_val_score(pipeline, X, y, scoring = 'neg_mean_absolute_error',
                           cv = cv, n_jobs = 6, verbose = 1)

print(f'MAE: {np.mean(-1 * n_scores)}, STD_DEV: {np.std(n_scores)}')

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


MAE: 2.6502596796072733, STD_DEV: 0.07520246092303219


[Parallel(n_jobs=6)]: Done   3 out of   3 | elapsed:  8.9min finished


In [20]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

model = RandomForestRegressor(n_estimators = 100, random_state = 42, n_jobs = 6)
model.fit(X_train, y_train)

preds = model.predict(X_test)

mse = mean_absolute_error(preds, y_test)
print(f'MAE : {mean_absolute_error(preds, y_test)}')
print(f'MSE: {np.sqrt(mean_absolute_error(preds, y_test))}')

MAE : 2.5934435144875594
MSE: 1.6104171864729833


In [None]:
#label_test = label_test.drop(cols_to_drop, axis = 1)
label_test.shape

(1013, 69)

In [None]:
predictions = model.predict(label_test)

In [None]:
sub_file = ss.copy()
sub_file['target'] = predictions

In [None]:
sub_file.to_csv('submissions/76th_submission.csv', index = False)

In [None]:
corr_matrix = label_train_1[dw_cols].corr().abs()
mask = np.triu(np.ones_like(corr_matrix, dtype = bool))

#Columns with highly correlated features. (r > 0.6)
to_drop = [col for col in label_train_1[dw_cols].columns if any (label_train_1[col] >  0.90)]
to_drop

0        0
1       42
2       53
3       64
4       75
        ..
2817     0
2818    42
2819    53
2820    64
2821    75
Length: 2822, dtype: int64