In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [48]:
df = pd.read_csv("../input/costa-rican-household-poverty-prediction/train.csv")

In [49]:
df.shape

(9557, 143)

## Cleaning Columns

In [50]:
df.columns[df.isnull().sum() > 0]

Index(['v2a1', 'v18q1', 'rez_esc', 'meaneduc', 'SQBmeaned'], dtype='object')

In [51]:
df.fillna(method='bfill',inplace=True)

In [52]:
df.columns[df.isnull().sum() > 0]

Index(['v18q1', 'rez_esc'], dtype='object')

In [53]:
df.fillna(method='ffill',inplace=True)

In [54]:
df.columns[df.isnull().sum() > 0]

Index([], dtype='object')

In [55]:
# finding numeric and categorical columns
Numeric_columns=df.select_dtypes(include=np.number).columns.tolist()
categorical_col=set(df.columns).difference(set(Numeric_columns))
# numeric cols to numeric
print(categorical_col)

{'dependency', 'edjefa', 'Id', 'idhogar', 'edjefe'}


In [56]:
df['edjefe'] = df['edjefe'].replace({'no': 0, 'yes':1}).astype(float)
df['edjefa'] = df['edjefa'].replace({'no': 0, 'yes':1}).astype(float)

In [57]:
# finding numeric and categorical columns
Numeric_columns=df.select_dtypes(include=np.number).columns.tolist()
categorical_col=set(df.columns).difference(set(Numeric_columns))
# numeric cols to numeric
print(categorical_col)

{'Id', 'dependency', 'idhogar'}


In [58]:
df['edjefe'].unique()

array([10., 12.,  0., 11.,  9., 15.,  4.,  6.,  8., 17.,  7., 16., 14.,
        5., 21.,  2., 19.,  1.,  3., 18., 13., 20.])

In [59]:
df['edjefa'].unique()

array([ 0., 11.,  4., 10.,  9., 15.,  7., 14., 13.,  8., 17.,  6.,  5.,
        3., 16., 19.,  1., 21., 12.,  2., 20., 18.])

In [60]:
df['edjefe'].value_counts()

0.0     3762
6.0     1845
11.0     751
9.0      486
3.0      307
15.0     285
8.0      257
7.0      234
5.0      222
14.0     208
17.0     202
2.0      194
4.0      137
16.0     134
1.0      123
12.0     113
10.0     111
13.0     103
21.0      43
18.0      19
19.0      14
20.0       7
Name: edjefe, dtype: int64

In [61]:
df['dependency'] = np.sqrt(df['SQBdependency'])

In [62]:
df.isna().sum().sum()

0

In [63]:
col_drops = ['Id','idhogar']
df.drop(col_drops,axis=1,inplace=True)

In [64]:
# finding numeric and categorical columns
Numeric_columns=df.select_dtypes(include=np.number).columns.tolist()
categorical_col=set(df.columns).difference(set(Numeric_columns))
# numeric cols to numeric
print(categorical_col)

set()


## increasing features

In [65]:
train_set = df

In [66]:
df['adult'] = df['hogar_adul'] - df['hogar_mayor']
df['dependency_count'] = train_set['hogar_nin'] + df['hogar_mayor']
df['dependency'] = df['dependency_count'] / df['adult']
df['child_percent'] = df['hogar_nin']/df['hogar_total']
df['elder_percent'] = df['hogar_mayor']/df['hogar_total']
df['adult_percent'] = df['hogar_adul']/df['hogar_total']

df['rent_per_adult'] = df['v2a1']/df['hogar_adul']
df['rent_per_person'] = df['v2a1']/df['hhsize']

df['overcrowding_room_and_bedroom'] = (df['hacdor'] + df['hacapo'])/2

df['no_appliances'] = df['refrig'] + df['computer'] + df['television']

df['r4h1_percent_in_male'] = df['r4h1'] / df['r4h3']
df['r4m1_percent_in_female'] = df['r4m1'] / df['r4m3']
df['r4h1_percent_in_total'] = df['r4h1'] / df['hhsize']
df['r4m1_percent_in_total'] = df['r4m1'] / df['hhsize']
df['r4t1_percent_in_total'] = df['r4t1'] / df['hhsize']

df['rent_per_room'] = df['v2a1']/df['rooms']
df['bedroom_per_room'] = df['bedrooms']/df['rooms']
df['elder_per_room'] = df['hogar_mayor']/df['rooms']
df['adults_per_room'] = df['adult']/df['rooms']
df['child_per_room'] = df['hogar_nin']/df['rooms']
df['male_per_room'] = df['r4h3']/df['rooms']
df['female_per_room'] = df['r4m3']/df['rooms']
df['room_per_person_household'] = df['hhsize']/df['rooms']

df['rent_per_bedroom'] = df['v2a1']/df['bedrooms']
df['adults_per_bedroom'] = df['adult']/df['bedrooms']
df['child_per_bedroom'] = df['hogar_nin']/df['bedrooms']
df['male_per_bedroom'] = df['r4h3']/df['bedrooms']
df['female_per_bedroom'] = df['r4m3']/df['bedrooms']
df['bedrooms_per_person_household'] = df['hhsize']/df['bedrooms']

df['tablet_per_person_household'] = df['v18q1']/df['hhsize']
df['phone_per_person_household'] = df['qmobilephone']/df['hhsize']

df['age_12_19'] = df['hogar_nin'] - df['r4t1']

df['escolari_age'] = df['escolari']/df['age']

df['rez_esc_escolari'] = df['rez_esc']/df['escolari']
df['rez_esc_r4t1'] = df['rez_esc']/df['r4t1']
df['rez_esc_r4t2'] = df['rez_esc']/df['r4t2']
df['rez_esc_r4t3'] = df['rez_esc']/df['r4t3']
df['rez_esc_age'] = df['rez_esc']/df['age']


In [67]:
from lightgbm import LGBMClassifier

In [68]:
from xgboost import XGBClassifier

In [69]:
from sklearn.model_selection import cross_val_score
#from xgboost import XGBClassifier

In [70]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [71]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [72]:
!pip install bayesian-optimization



In [73]:
x_1 = df.drop('Target',axis=1)

In [74]:
df.shape

(9557, 178)

In [75]:
x_1.shape

(9557, 177)

In [76]:
y = df['Target']

In [77]:
!pip install catboost



In [78]:
from catboost import CatBoostClassifier

In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_validate,cross_val_score

In [80]:
clf = CatBoostClassifier()

In [81]:
clf.fit(x_1,y)

Learning rate set to 0.088781
0:	learn: 1.2893543	total: 75.9ms	remaining: 1m 15s
1:	learn: 1.2198498	total: 101ms	remaining: 50.2s
2:	learn: 1.1620522	total: 125ms	remaining: 41.4s
3:	learn: 1.1179098	total: 149ms	remaining: 37.1s
4:	learn: 1.0803137	total: 176ms	remaining: 35s
5:	learn: 1.0486370	total: 203ms	remaining: 33.7s
6:	learn: 1.0175333	total: 227ms	remaining: 32.2s
7:	learn: 0.9913358	total: 251ms	remaining: 31.1s
8:	learn: 0.9711927	total: 274ms	remaining: 30.2s
9:	learn: 0.9540126	total: 296ms	remaining: 29.3s
10:	learn: 0.9357050	total: 321ms	remaining: 28.8s
11:	learn: 0.9220292	total: 341ms	remaining: 28.1s
12:	learn: 0.9088937	total: 363ms	remaining: 27.6s
13:	learn: 0.8973815	total: 390ms	remaining: 27.5s
14:	learn: 0.8850544	total: 416ms	remaining: 27.3s
15:	learn: 0.8743844	total: 440ms	remaining: 27.1s
16:	learn: 0.8658712	total: 463ms	remaining: 26.8s
17:	learn: 0.8576454	total: 485ms	remaining: 26.5s
18:	learn: 0.8493876	total: 509ms	remaining: 26.3s
19:	learn: 

169:	learn: 0.5056941	total: 3.86s	remaining: 18.8s
170:	learn: 0.5044500	total: 3.88s	remaining: 18.8s
171:	learn: 0.5021039	total: 3.9s	remaining: 18.8s
172:	learn: 0.5011219	total: 3.92s	remaining: 18.8s
173:	learn: 0.4997894	total: 3.95s	remaining: 18.7s
174:	learn: 0.4982313	total: 3.97s	remaining: 18.7s
175:	learn: 0.4966500	total: 3.99s	remaining: 18.7s
176:	learn: 0.4962977	total: 4.01s	remaining: 18.6s
177:	learn: 0.4952616	total: 4.03s	remaining: 18.6s
178:	learn: 0.4948911	total: 4.05s	remaining: 18.6s
179:	learn: 0.4938718	total: 4.07s	remaining: 18.6s
180:	learn: 0.4920481	total: 4.09s	remaining: 18.5s
181:	learn: 0.4907938	total: 4.12s	remaining: 18.5s
182:	learn: 0.4889360	total: 4.14s	remaining: 18.5s
183:	learn: 0.4870129	total: 4.16s	remaining: 18.5s
184:	learn: 0.4859946	total: 4.18s	remaining: 18.4s
185:	learn: 0.4853625	total: 4.21s	remaining: 18.4s
186:	learn: 0.4843652	total: 4.23s	remaining: 18.4s
187:	learn: 0.4827304	total: 4.25s	remaining: 18.4s
188:	learn: 0

334:	learn: 0.3488489	total: 7.5s	remaining: 14.9s
335:	learn: 0.3484974	total: 7.53s	remaining: 14.9s
336:	learn: 0.3476388	total: 7.55s	remaining: 14.9s
337:	learn: 0.3470215	total: 7.57s	remaining: 14.8s
338:	learn: 0.3466679	total: 7.59s	remaining: 14.8s
339:	learn: 0.3463420	total: 7.61s	remaining: 14.8s
340:	learn: 0.3455722	total: 7.63s	remaining: 14.7s
341:	learn: 0.3449511	total: 7.65s	remaining: 14.7s
342:	learn: 0.3445724	total: 7.67s	remaining: 14.7s
343:	learn: 0.3440364	total: 7.69s	remaining: 14.7s
344:	learn: 0.3427142	total: 7.72s	remaining: 14.7s
345:	learn: 0.3420517	total: 7.75s	remaining: 14.6s
346:	learn: 0.3415505	total: 7.77s	remaining: 14.6s
347:	learn: 0.3410987	total: 7.79s	remaining: 14.6s
348:	learn: 0.3409790	total: 7.81s	remaining: 14.6s
349:	learn: 0.3400404	total: 7.83s	remaining: 14.5s
350:	learn: 0.3395164	total: 7.85s	remaining: 14.5s
351:	learn: 0.3377462	total: 7.88s	remaining: 14.5s
352:	learn: 0.3368930	total: 7.9s	remaining: 14.5s
353:	learn: 0.

494:	learn: 0.2665752	total: 11.1s	remaining: 11.3s
495:	learn: 0.2663420	total: 11.1s	remaining: 11.3s
496:	learn: 0.2657585	total: 11.1s	remaining: 11.3s
497:	learn: 0.2655678	total: 11.2s	remaining: 11.3s
498:	learn: 0.2646290	total: 11.2s	remaining: 11.2s
499:	learn: 0.2643374	total: 11.2s	remaining: 11.2s
500:	learn: 0.2641374	total: 11.2s	remaining: 11.2s
501:	learn: 0.2639459	total: 11.3s	remaining: 11.2s
502:	learn: 0.2632166	total: 11.3s	remaining: 11.1s
503:	learn: 0.2626158	total: 11.3s	remaining: 11.1s
504:	learn: 0.2621888	total: 11.3s	remaining: 11.1s
505:	learn: 0.2620633	total: 11.3s	remaining: 11.1s
506:	learn: 0.2616467	total: 11.4s	remaining: 11s
507:	learn: 0.2612629	total: 11.4s	remaining: 11s
508:	learn: 0.2610026	total: 11.4s	remaining: 11s
509:	learn: 0.2607143	total: 11.4s	remaining: 11s
510:	learn: 0.2600759	total: 11.5s	remaining: 11s
511:	learn: 0.2596147	total: 11.5s	remaining: 10.9s
512:	learn: 0.2594645	total: 11.5s	remaining: 10.9s
513:	learn: 0.2591231	

656:	learn: 0.2103544	total: 14.9s	remaining: 7.79s
657:	learn: 0.2101949	total: 15s	remaining: 7.77s
658:	learn: 0.2097202	total: 15s	remaining: 7.75s
659:	learn: 0.2089157	total: 15s	remaining: 7.73s
660:	learn: 0.2086561	total: 15s	remaining: 7.7s
661:	learn: 0.2083853	total: 15s	remaining: 7.68s
662:	learn: 0.2081586	total: 15.1s	remaining: 7.66s
663:	learn: 0.2080025	total: 15.1s	remaining: 7.63s
664:	learn: 0.2076774	total: 15.1s	remaining: 7.61s
665:	learn: 0.2068761	total: 15.1s	remaining: 7.59s
666:	learn: 0.2067279	total: 15.1s	remaining: 7.56s
667:	learn: 0.2066064	total: 15.2s	remaining: 7.54s
668:	learn: 0.2063036	total: 15.2s	remaining: 7.51s
669:	learn: 0.2058802	total: 15.2s	remaining: 7.49s
670:	learn: 0.2053045	total: 15.2s	remaining: 7.47s
671:	learn: 0.2048921	total: 15.3s	remaining: 7.45s
672:	learn: 0.2042847	total: 15.3s	remaining: 7.42s
673:	learn: 0.2040548	total: 15.3s	remaining: 7.4s
674:	learn: 0.2036588	total: 15.3s	remaining: 7.38s
675:	learn: 0.2030757	to

815:	learn: 0.1715027	total: 18.8s	remaining: 4.23s
816:	learn: 0.1713481	total: 18.8s	remaining: 4.21s
817:	learn: 0.1712076	total: 18.8s	remaining: 4.18s
818:	learn: 0.1709922	total: 18.8s	remaining: 4.16s
819:	learn: 0.1707655	total: 18.9s	remaining: 4.14s
820:	learn: 0.1704979	total: 18.9s	remaining: 4.12s
821:	learn: 0.1703170	total: 18.9s	remaining: 4.09s
822:	learn: 0.1700928	total: 18.9s	remaining: 4.07s
823:	learn: 0.1699910	total: 18.9s	remaining: 4.04s
824:	learn: 0.1697871	total: 19s	remaining: 4.02s
825:	learn: 0.1694027	total: 19s	remaining: 4s
826:	learn: 0.1693198	total: 19s	remaining: 3.98s
827:	learn: 0.1692167	total: 19s	remaining: 3.95s
828:	learn: 0.1691371	total: 19.1s	remaining: 3.93s
829:	learn: 0.1688513	total: 19.1s	remaining: 3.91s
830:	learn: 0.1682821	total: 19.1s	remaining: 3.88s
831:	learn: 0.1680718	total: 19.1s	remaining: 3.86s
832:	learn: 0.1679638	total: 19.1s	remaining: 3.84s
833:	learn: 0.1674341	total: 19.2s	remaining: 3.81s
834:	learn: 0.1673553	t

982:	learn: 0.1419600	total: 22.4s	remaining: 388ms
983:	learn: 0.1418187	total: 22.4s	remaining: 365ms
984:	learn: 0.1415495	total: 22.5s	remaining: 342ms
985:	learn: 0.1412467	total: 22.5s	remaining: 319ms
986:	learn: 0.1411397	total: 22.5s	remaining: 297ms
987:	learn: 0.1410071	total: 22.5s	remaining: 274ms
988:	learn: 0.1409357	total: 22.6s	remaining: 251ms
989:	learn: 0.1407098	total: 22.6s	remaining: 228ms
990:	learn: 0.1405897	total: 22.6s	remaining: 205ms
991:	learn: 0.1401282	total: 22.6s	remaining: 182ms
992:	learn: 0.1398699	total: 22.6s	remaining: 160ms
993:	learn: 0.1396800	total: 22.7s	remaining: 137ms
994:	learn: 0.1395285	total: 22.7s	remaining: 114ms
995:	learn: 0.1394229	total: 22.7s	remaining: 91.3ms
996:	learn: 0.1393538	total: 22.7s	remaining: 68.4ms
997:	learn: 0.1392621	total: 22.8s	remaining: 45.6ms
998:	learn: 0.1391513	total: 22.8s	remaining: 22.8ms
999:	learn: 0.1389972	total: 22.8s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f3bde9ad890>

In [86]:
df_test = pd.read_csv("../input/costa-rican-household-poverty-prediction/test.csv")
df_test.fillna(method='bfill',inplace=True)
df_test.fillna(method='ffill',inplace=True)
df_test['edjefe'] = df_test['edjefe'].replace({'no': 0, 'yes':1}).astype(float)
df_test['edjefa'] = df_test['edjefa'].replace({'no': 0, 'yes':1}).astype(float)
col_drops = ['Id','idhogar']
df_test.drop(col_drops,axis=1,inplace=True)
df_test['adult'] = df_test['hogar_adul'] - df_test['hogar_mayor']
df_test['dependency_count'] = df_test['hogar_nin'] + df_test['hogar_mayor']
df_test['dependency'] = df_test['dependency_count'] / df_test['adult']
df_test['child_percent'] = df_test['hogar_nin']/df_test['hogar_total']
df_test['elder_percent'] = df_test['hogar_mayor']/df_test['hogar_total']
df_test['adult_percent'] = df_test['hogar_adul']/df_test['hogar_total']

df_test['rent_per_adult'] = df_test['v2a1']/df_test['hogar_adul']
df_test['rent_per_person'] = df_test['v2a1']/df_test['hhsize']

df_test['overcrowding_room_and_bedroom'] = (df_test['hacdor'] + df_test['hacapo'])/2

df_test['no_appliances'] = df_test['refrig'] + df_test['computer'] + df_test['television']

df_test['r4h1_percent_in_male'] = df_test['r4h1'] / df_test['r4h3']
df_test['r4m1_percent_in_female'] = df_test['r4m1'] / df_test['r4m3']
df_test['r4h1_percent_in_total'] = df_test['r4h1'] / df_test['hhsize']
df_test['r4m1_percent_in_total'] = df_test['r4m1'] / df_test['hhsize']
df_test['r4t1_percent_in_total'] = df_test['r4t1'] / df_test['hhsize']

df_test['rent_per_room'] = df_test['v2a1']/df_test['rooms']
df_test['bedroom_per_room'] = df_test['bedrooms']/df['rooms']
df_test['elder_per_room'] = df_test['hogar_mayor']/df_test['rooms']
df_test['adults_per_room'] = df_test['adult']/df_test['rooms']
df_test['child_per_room'] = df_test['hogar_nin']/df_test['rooms']
df_test['male_per_room'] = df_test['r4h3']/df_test['rooms']
df_test['female_per_room'] = df_test['r4m3']/df_test['rooms']
df_test['room_per_person_household'] = df_test['hhsize']/df_test['rooms']

df_test['rent_per_bedroom'] = df_test['v2a1']/df_test['bedrooms']
df_test['adults_per_bedroom'] = df_test['adult']/df_test['bedrooms']
df_test['child_per_bedroom'] = df_test['hogar_nin']/df_test['bedrooms']
df_test['male_per_bedroom'] = df_test['r4h3']/df_test['bedrooms']
df_test['female_per_bedroom'] = df_test['r4m3']/df_test['bedrooms']
df_test['bedrooms_per_person_household'] = df_test['hhsize']/df_test['bedrooms']

df_test['tablet_per_person_household'] = df_test['v18q1']/df_test['hhsize']
df_test['phone_per_person_household'] = df_test['qmobilephone']/df_test['hhsize']

df_test['age_12_19'] = df_test['hogar_nin'] - df_test['r4t1']

df_test['escolari_age'] = df_test['escolari']/df_test['age']

df_test['rez_esc_escolari'] = df_test['rez_esc']/df_test['escolari']
df_test['rez_esc_r4t1'] = df_test['rez_esc']/df_test['r4t1']
df_test['rez_esc_r4t2'] = df_test['rez_esc']/df_test['r4t2']
df_test['rez_esc_r4t3'] = df_test['rez_esc']/df_test['r4t3']
df_test['rez_esc_age'] = df_test['rez_esc']/df_test['age']


In [101]:
df_tt = pd.read_csv('../input/costa-rican-household-poverty-prediction/test.csv')

In [102]:
ids = df_tt['Id']


In [103]:
yy = clf.predict(df_test)

In [104]:
yy.shape

(23856, 1)

In [109]:
pred = []
for i in yy:
    pred.append(int(i))

In [110]:
data = {
    'Id' :ids,
    'Target':pred
}
df_new = pd.DataFrame(data)

In [115]:
df_new.to_csv('submission.csv',index=False)

In [114]:
df_new

Unnamed: 0,Id,Target
0,ID_2f6873615,4
1,ID_1c78846d2,4
2,ID_e5442cf6a,4
3,ID_a8db26a79,4
4,ID_a62966799,4
...,...,...
23851,ID_a065a7cad,2
23852,ID_1a7c6953b,2
23853,ID_07dbb4be2,2
23854,ID_34d2ed046,2


In [None]:
def get_stacking():
    level0 = list()
    level0.append(('XGBOOST', XGBClassifier()))
    level0.append(('LGBM',LGBMClassifier(n_estimators=100,learning_rate=0.1,
                            random_state=42,num_leaves=200)))
    # level0.append(('extra',ExtraTreesRegressor(n_estimators=int(366.84350004540295),
    #                      min_samples_split=int(2.0),
    #                      max_features=0.999,
    #                      random_state = 2)))
    level0.append(('Cataboost',CatBoostClassifier(depth=int(9.253),n_estimators=int(514.1))))

#     level0.append(('Adaboost',AdaBoostRegressor(random_state=0,n_estimators=1000,learning_rate=0.01)))
#     level0.append(('SVM', SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)))
#     level0.append(('cart', RandomForestRegressor()))
#     level0.append(('cart', GradientBoostingRegressor()))
#     level0.append(('knn', KNeighborsRegressor()))
    # level0.append(('BaggingRegressor',BaggingRegressor(n_estimators=395,max_features=13)))
    level1=LogisticRegression()
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

def get_models():
	models = dict()
	# models['XGBOOST'] = XGBClassifier(n_estimators=400,learning_rate=0.045,max_depth=7,
  #             max_bin=200,booster='gbtree')
	# models['LGBM']=LGBMClassifier(min_child_samples=int(24.763103326425835), n_estimators=int(496.83258647534456), 
  #                                           num_leaves=int(77.94008810764333), subsample=0.9167856541166559)
# 	models['CatBoostRegressor']=CatBoostRegressor()
# 	models['Adaboost']=AdaBoostRegressor(random_state=0,n_estimators=1000,learning_rate=0.01
# 	models['cart1'] = DecisionTreeRegressor()
# 	models['cart'] = GradientBoostingRegressor()
# 	models['SVM'] = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
# 	models['knn'] = KNeighborsRegressor()

	models['stacking'] = get_stacking()
	return models
 
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
	cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
	scores = cross_validate(model, X, y, scoring=('accuracy','f1'
                                                 ),cv=cv, n_jobs=-1, error_score='raise')
	return scores
 
# define dataset
X, y = x_1,y
# get the models to evaluate
models = get_models()
# evaluate the modelns and store results
results, names = list(), list()
for name, model in models.items():
	# start=time.time()
	scores = evaluate_model(model, X, y)
	print(scores)
	# end=time.time()
# 	results.append(scores)
	names.append(name)
	print('>%s %.3f' % (name,end-start))