# **Costa Rican Household Poverty Level Prediction - Kaggle**
### *Can you identify which households have the highest need for social welfare assistance?*
Efraín García Valencia - UdeA

# **1. Importing the DataSet from Kaggle**
### (Note that you will need your own API token from Kaggle)

In [1]:
# Install the Kaggle command-line client (quiet mode); the download cell
# further down invokes it from the shell as `kaggle`.
!pip install -q kaggle

In [2]:
from google.colab import files

# Opens Colab's upload widget so the user can supply their own kaggle.json.
# files.upload() returns the uploaded files' raw contents, which for
# kaggle.json is the API token itself. The trailing semicolon stops IPython
# from echoing that return value into the notebook's saved output.
# NOTE: a token that has already been displayed in a shared notebook should be
# regenerated on Kaggle before the notebook is published again.
files.upload();

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [3]:
# Move the uploaded token into ~/.kaggle and restrict it to the current user.
# `mkdir -p` succeeds when the directory already exists, so the cell can be
# re-run without first deleting ~/.kaggle.
!mkdir -p ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
# 600 = read/write for the owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
# Fetch the competition archive into the current working directory. The CLI is
# expected to pick up the kaggle.json placed under ~/.kaggle by the previous cell.
!kaggle competitions download -c costa-rican-household-poverty-prediction # Download the competition DataSet from Kaggle

Downloading costa-rican-household-poverty-prediction.zip to /content
  0% 0.00/2.33M [00:00<?, ?B/s]
100% 2.33M/2.33M [00:00<00:00, 162MB/s]


In [5]:
# -p: do not fail when the folder is already there (plain mkdir errors on re-run).
!mkdir -p DataSet # Create the "DataSet" folder if it doesn't already exist
# -o: overwrite previously extracted files instead of prompting, so a re-run does not stall.
!unzip -o -q costa-rican-household-poverty-prediction.zip train.csv test.csv -d DataSet # Unzip only the train.csv and test.csv files into the "DataSet" folder

In [6]:
# Output folder for artefacts written by this notebook (the pickled model).
# -p keeps the cell re-runnable when the folder already exists.
!mkdir -p working

In [7]:
# Folder locations used by the rest of the notebook (relative to the working directory).
working_dir = 'working/'   # where generated artefacts are written
input_dir = 'DataSet/'     # where train.csv / test.csv were extracted

# **2. Data Cleaning**

In [8]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
# Read the two competition splits from the extracted data folder.
train_csv = os.path.join(input_dir, 'train.csv')
test_csv = os.path.join(input_dir, 'test.csv')
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Label every row by its Id so that later label-based writes and
# index-aligned concatenations address rows by Id.
train.index, test.index = train['Id'].values, test['Id'].values


In [10]:
def data_cleaning(data):
    """Clean one competition split and return the cleaned frame.

    The steps, in order:

    1. ``dependency`` is rebuilt as the square root of ``SQBdependency``.
    2. Missing ``rez_esc``, ``v18q1`` and ``v2a1`` values become 0.
    3. ``edjefe`` / ``edjefa`` are collapsed into one integer column
       ``edjefx`` and the two source columns are dropped:

       * both ``'no'``                          -> 0
       * one is ``'yes'`` and the other ``'no'`` -> 1
       * ``edjefe`` holds a value, ``edjefa`` is ``'no'`` -> that ``edjefe`` value
       * ``edjefe`` is ``'no'``, ``edjefa`` holds a value -> that ``edjefa`` value
       * any other combination                  -> 0 (``np.select`` default)
    4. Rows with missing ``meaneduc`` receive the mean ``escolari`` of the
       rows of the same ``idhogar`` that are also missing it; ``SQBmeaned``
       is set to the square of that value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least ``SQBdependency``, ``rez_esc``, ``v18q1``,
        ``v2a1``, ``edjefe``, ``edjefa``, ``meaneduc``, ``idhogar`` and
        ``escolari``.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy. The frame passed in is left unmodified, so the raw
        data stays available to the caller.
    """
    # Work on a copy: the previous in-place edits silently altered the
    # caller's frame (including dropping columns from it).
    cleaned = data.copy()

    cleaned['dependency'] = np.sqrt(cleaned['SQBdependency'])
    for column in ('rez_esc', 'v18q1', 'v2a1'):
        cleaned[column] = cleaned[column].fillna(0)

    edjefe = cleaned['edjefe']
    edjefa = cleaned['edjefa']
    conditions = [
        (edjefe == 'no') & (edjefa == 'no'),
        (edjefe == 'yes') & (edjefa == 'no'),
        (edjefe == 'no') & (edjefa == 'yes'),
        (edjefe != 'no') & (edjefe != 'yes') & (edjefa == 'no'),
        (edjefe == 'no') & (edjefa != 'no'),
    ]
    choices = [0, 1, 1, edjefe, edjefa]
    # np.select takes the first matching condition per row (0 when none match);
    # the result mixes ints and numeric strings, hence the cast afterwards.
    cleaned['edjefx'] = np.select(conditions, choices)
    cleaned['edjefx'] = cleaned['edjefx'].astype(int)
    cleaned = cleaned.drop(columns=['edjefe', 'edjefa'])

    # Vectorised replacement for the former row-by-row lookup: compute one
    # mean per household over the affected rows, then map it back.
    missing = cleaned['meaneduc'].isnull()
    if missing.any():
        household_mean = cleaned.loc[missing].groupby('idhogar')['escolari'].mean()
        fill = cleaned.loc[missing, 'idhogar'].map(household_mean).to_numpy()
        # Positional (array) assignment avoids any dependence on index uniqueness.
        cleaned.loc[missing, 'meaneduc'] = fill
        cleaned.loc[missing, 'SQBmeaned'] = fill * fill

    return cleaned

In [11]:
# Run the same cleaning routine over both splits (train first, then test).
train, test = map(data_cleaning, (train, test))

In [12]:
# Training uses only the rows flagged parentesco1 == 1; the test split keeps
# every row. The flag column itself is then removed from both splits.
train = train[train['parentesco1'] == 1].drop(columns='parentesco1')
test = test.drop(columns='parentesco1')

In [13]:
def get_numeric(data, status_name):
    """Append one column summarising a group of indicator columns.

    Every column whose name contains ``status_name`` is selected (and the
    selection is printed). For each row, the 0-based position of the selected
    column holding the largest value is stored in a new column named
    ``status_name``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the indicator columns.
    status_name : str
        Substring identifying the indicator columns; also the new column's name.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the extra column appended on the right.
    """
    matched = [name for name in data.columns.tolist() if status_name in name]
    print('status column names')
    print(matched)

    # Relabel the selected columns 0..k-1 so idxmax yields a position, not a name.
    indicators = data[matched]
    indicators = indicators.set_axis(list(range(indicators.shape[1])), axis=1)

    position = indicators.idxmax(axis=1)
    position.name = status_name
    return pd.concat([data, position], axis=1)

In [14]:
# Prefixes of the indicator-column groups to summarise. For each prefix,
# get_numeric appends a column named after the prefix to both splits
# (train is processed before test, which fixes the order of the printed lists).
status_name_list = ['epared', 'etecho', 'eviv', 'instlevel']
for status_name in status_name_list:
    train, test = get_numeric(train, status_name), get_numeric(test, status_name)

status column names
['epared1', 'epared2', 'epared3']
status column names
['epared1', 'epared2', 'epared3']
status column names
['etecho1', 'etecho2', 'etecho3']
status column names
['etecho1', 'etecho2', 'etecho3']
status column names
['eviv1', 'eviv2', 'eviv3']
status column names
['eviv1', 'eviv2', 'eviv3']
status column names
['instlevel1', 'instlevel2', 'instlevel3', 'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 'instlevel9']
status column names
['instlevel1', 'instlevel2', 'instlevel3', 'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 'instlevel9']


In [15]:
# Columns removed before modelling:
#   * a hand-picked list, which includes the epared/etecho/eviv/instlevel
#     indicator columns already summarised by the columns added above,
#   * every column whose name contains 'SQB',
#   * every remaining column whose name contains 'parentesco'.
# The name-based lists are derived from train and applied to both splits.
SQB_cols = [name for name in train.columns.tolist() if 'SQB' in name]
parentesco_cols = [name for name in train.columns.tolist() if 'parentesco' in name]

needless_cols = [
    'r4t3', 'tamhog', 'hogar_total', 'hhsize', 'v18q', 'sanitario1', 'agesq',
    'mobilephone', 'area1', 'female',
    'epared1', 'epared2', 'epared3',
    'etecho1', 'etecho2', 'etecho3',
    'eviv1', 'eviv2', 'eviv3',
    'instlevel1', 'instlevel2', 'instlevel3', 'instlevel4', 'instlevel5',
    'instlevel6', 'instlevel7', 'instlevel8', 'instlevel9',
    'abastaguafuera',
] + SQB_cols + parentesco_cols

train = train.drop(columns=needless_cols)
test = test.drop(columns=needless_cols)

In [16]:
# Compare the number of feature columns in the untouched CSV with the number
# left after cleaning (identifier and target columns excluded on both sides).
non_feature_cols = ['Id', 'Target', 'idhogar']

ori_train = pd.read_csv(os.path.join(input_dir, 'train.csv'))
ori_train_X = ori_train.drop(columns=non_feature_cols)
train_X = train.drop(columns=non_feature_cols)

n_before, n_after = ori_train_X.shape[1], train_X.shape[1]
print('feature columns \n {} -> {}'.format(n_before, n_after))

feature columns 
 140 -> 94


In [17]:
# Separate identifiers, target and feature matrix for the training split ...
train_Id, train_idhogar, train_y = train['Id'], train['idhogar'], train['Target']
train_X = train.drop(columns=['Id', 'Target', 'idhogar'])

# ... and identifiers and feature matrix for the test split (it has no Target).
test_Id, test_idhogar = test['Id'], test['idhogar']
test_X = test.drop(columns=['Id', 'idhogar'])

# Train rows stacked on top of test rows, for work that needs both splits together.
all_Id = pd.concat([train_Id, test_Id], sort=False)
all_idhogar = pd.concat([train_idhogar, test_idhogar], sort=False)
all_X = pd.concat([train_X, test_X], sort=False)

# **3. LGBM Classifier model training**

In [18]:
# Splitting helpers, metrics and the LightGBM estimator used from here on.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, make_scorer
import lightgbm as lgb

# Hold out 10% of the labelled rows for a local evaluation; random_state=0
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.1, random_state=0)

# Macro-averaged F1 wrapped as a scorer object.
# NOTE(review): neither F1_scorer nor GridSearchCV is used in the cells visible
# here - presumably intended for a hyper-parameter search; confirm or remove.
F1_scorer = make_scorer(f1_score, greater_is_better=True, average='macro')

# Classifier with the library's default hyper-parameters.
gbm = lgb.LGBMClassifier()

# Fitted on the 90% training part only; the held-out 10% stays unseen.
gbm.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001689 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 590
[LightGBM] [Info] Number of data points in the train set: 2675, number of used features: 77
[LightGBM] [Info] Start training from score -2.593387
[LightGBM] [Info] Start training from score -1.915354
[LightGBM] [Info] Start training from score -2.132803
[LightGBM] [Info] Start training from score -0.416365


In [20]:
import pickle

# Persist the fitted classifier inside the working folder.
model_path = os.path.join(working_dir, '20180801_lgbm.pickle')
with open(model_path, 'wb') as f:
    pickle.dump(gbm, f)

In [21]:
# Predict a class for every test row and label the predictions with the test
# Ids so they line up with test_Id when the two are placed side by side.
pred = pd.Series(gbm.predict(test_X), index=test_Id.values, name='Target')
pred = pd.concat([test_Id, pred], axis=1)

# Two columns (Id, Target), written without the index.
submission = pred
submission.to_csv('submission.csv', index=False)

In [22]:
# Score the classifier on the held-out split produced by train_test_split.
y_test_pred = gbm.predict(X_test)
f1 = f1_score(y_test, y_test_pred, average='macro')
cm = confusion_matrix(y_test, y_test_pred)

# Per scikit-learn's convention, rows of the matrix are true classes and
# columns are predicted classes.
print("confusion matrix: \n", cm)
print("macro F1 score: \n", f1)

confusion matrix: 
 [[  5   7   1   9]
 [  4  12   2  30]
 [  3   6   3  26]
 [  1   7   6 176]]
macro F1 score: 
 0.38060490553529996
