In [1]:
import numpy as np
import pandas as pd

In [2]:
TRAIN_SET_PATH = './datasets/Train_v2.csv'
TEST_SET_PATH = './datasets/Test_v2.csv'

In [3]:
df = pd.read_csv(TRAIN_SET_PATH)
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [4]:
df.describe()

Unnamed: 0,year,household_size,age_of_respondent
count,23524.0,23524.0,23524.0
mean,2016.975939,3.797483,38.80522
std,0.847371,2.227613,16.520569
min,2016.0,1.0,16.0
25%,2016.0,2.0,26.0
50%,2017.0,3.0,35.0
75%,2018.0,5.0,49.0
max,2018.0,21.0,100.0


In [5]:
from sklearn.utils import shuffle
new_df = shuffle(df)

new_df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
17062,Tanzania,2017,uniqueid_2260,No,Urban,Yes,2,29,Female,Spouse,Single/Never Married,Primary education,Informally employed
1997,Kenya,2018,uniqueid_1998,No,Rural,Yes,4,33,Male,Child,Married/Living together,Primary education,Informally employed
15067,Tanzania,2017,uniqueid_265,No,Rural,No,2,28,Female,Head of Household,Single/Never Married,Primary education,Self employed
7053,Rwanda,2016,uniqueid_986,No,Rural,Yes,5,58,Female,Spouse,Married/Living together,No formal education,Informally employed
11307,Rwanda,2016,uniqueid_5240,No,Rural,Yes,5,37,Female,Spouse,Married/Living together,Primary education,Farming and Fishing


In [6]:
train_data = new_df.copy()

In [7]:
train_data = train_data.drop(columns=['cellphone_access', 'household_size'])
y = train_data['uniqueid']

In [8]:
y.head()

17062    uniqueid_2260
1997     uniqueid_1998
15067     uniqueid_265
7053      uniqueid_986
11307    uniqueid_5240
Name: uniqueid, dtype: object

In [9]:
X = train_data.drop(columns=['year'])
X['relationship'] = X['relationship_with_head'] + X['marital_status']

X = X.drop(columns=['relationship_with_head','marital_status'])

In [10]:
X.head()

Unnamed: 0,country,uniqueid,bank_account,location_type,age_of_respondent,gender_of_respondent,education_level,job_type,relationship
17062,Tanzania,uniqueid_2260,No,Urban,29,Female,Primary education,Informally employed,SpouseSingle/Never Married
1997,Kenya,uniqueid_1998,No,Rural,33,Male,Primary education,Informally employed,ChildMarried/Living together
15067,Tanzania,uniqueid_265,No,Rural,28,Female,Primary education,Self employed,Head of HouseholdSingle/Never Married
7053,Rwanda,uniqueid_986,No,Rural,58,Female,No formal education,Informally employed,SpouseMarried/Living together
11307,Rwanda,uniqueid_5240,No,Rural,37,Female,Primary education,Farming and Fishing,SpouseMarried/Living together


In [11]:
from sklearn.preprocessing import LabelBinarizer

In [12]:
label_binarizer = LabelBinarizer()
y_train = label_binarizer.fit_transform(y)

In [13]:
from sklearn.feature_extraction import DictVectorizer
 
def encode_onehot(df, cols):
    vec = DictVectorizer()
    
    vec_data = pd.DataFrame(vec.fit_transform(df[cols].to_dict('records')).toarray())
    vec_data.columns = vec.get_feature_names()
    vec_data.index = df.index
    
    df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df

In [14]:
X_train = encode_onehot(X, cols=X.columns)
X_train.shape

(23524, 8789)

In [15]:
y_train.shape

(23524, 8735)

In [16]:
train_data_2 = new_df.copy()
train_data_2.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
17062,Tanzania,2017,uniqueid_2260,No,Urban,Yes,2,29,Female,Spouse,Single/Never Married,Primary education,Informally employed
1997,Kenya,2018,uniqueid_1998,No,Rural,Yes,4,33,Male,Child,Married/Living together,Primary education,Informally employed
15067,Tanzania,2017,uniqueid_265,No,Rural,No,2,28,Female,Head of Household,Single/Never Married,Primary education,Self employed
7053,Rwanda,2016,uniqueid_986,No,Rural,Yes,5,58,Female,Spouse,Married/Living together,No formal education,Informally employed
11307,Rwanda,2016,uniqueid_5240,No,Rural,Yes,5,37,Female,Spouse,Married/Living together,Primary education,Farming and Fishing


In [None]:
X_2 = train_data_2.drop(columns=[
    'year', 'household_size', 'relationship_with_head', 'marital_status'
])
X_train_2 = encode_onehot(X_2, cols=X_2.columns)
X_train_2.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier

model_one = RandomForestClassifier()
model_two = RandomForestClassifier()

model_one.fit(X_train, y_train)
model_two.fit(X_train_2, y_train)

In [None]:
test_data = pd.read_csv('./datasets/Test_v2.csv')

X_test = test_data.drop(columns=[
    'year', 'household_size', 'relationship_with_head', 'marital_status'
])

X_test_one_hot = encode_onehot(X_test, cols=X_test.columns)

In [None]:
test_data.head()

In [None]:
y_2 = test_data.uniqueid

y_test = label_binarizer.fit_transform(y_2)