In [10]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each cell
# executes, so edits to project code (e.g. the smoker_status package) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder

from smoker_status.config import RAW_DATA_DIR

In [12]:
# Read the raw train/test splits from the project's raw-data directory.
train_csv = RAW_DATA_DIR / 'train.csv'
test_csv = RAW_DATA_DIR / 'test.csv'
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,44,93,15.4,1,0.8,19,13,17,0,1


In [13]:
# Target vector: the binary `smoking` label.
y: pd.Series = df_train['smoking']
# Feature matrix: every remaining column except the target and the row identifier.
X: pd.DataFrame = df_train.drop(columns=['smoking', 'id'])

Looking at the dataset, the following features appear to be categorical (class codes rather than measurements):
`hearing(left)`, `hearing(right)`, `Urine protein`, `dental caries`

In [14]:
# Columns that hold category codes rather than continuous measurements.
cat_feats = [
    'hearing(left)',
    'hearing(right)',
    'Urine protein',
    'dental caries',
]

# Human-readable one-hot column names for each categorical feature, listed in ascending
# order of the raw category code. OneHotEncoder sorts the categories it discovers, so
# this is the order in which it emits the output columns.
# NOTE(review): the label meanings (e.g. lowest hearing code == "normal") are assumed
# from the original column naming — confirm against the dataset description.
cat_feat_labels = {
    'hearing(left)': [
        'hearing(left) - normal',
        'hearing(left) - abnormal',
    ],
    'hearing(right)': [
        'hearing(right) - normal',
        'hearing(right) - abnormal',
    ],
    'Urine protein': [
        'Urine protein -',
        'Urine protein +/-',
        'Urine protein +1',
        'Urine protein +2',
        'Urine protein +3',
        'Urine protein +4',
    ],
    'dental caries': [
        'dental caries - nonpresent',
        'dental caries - present',
    ],
}

X_no_cat_feat = X.drop(cat_feats, axis=1)
X_only_cat_feat = X[cat_feats]

enc = OneHotEncoder()
enc.fit(X_only_cat_feat)

# Validate the label lists per feature. A flat list of names only checks the total
# column count, so one feature with an extra category and another with a missing one
# would silently shift labels onto the wrong columns.
for feat, categories in zip(cat_feats, enc.categories_):
    if len(categories) != len(cat_feat_labels[feat]):
        raise ValueError(
            f"Feature {feat!r} has {len(categories)} categories {list(categories)}, "
            f"but {len(cat_feat_labels[feat])} labels are defined for it."
        )

X_only_cat_feat_trans = pd.DataFrame(
    data=enc.transform(X_only_cat_feat).toarray(),
    columns=[name for feat in cat_feats for name in cat_feat_labels[feat]],
    # Keep the source index so the axis=1 concat below aligns row-for-row even when
    # X does not carry a default RangeIndex.
    index=X_only_cat_feat.index,
)
X_trans = pd.concat([X_no_cat_feat, X_only_cat_feat_trans], axis=1)
X_trans.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),systolic,relaxation,fasting blood sugar,Cholesterol,...,hearing(right) - normal,hearing(right) - abnormal,Urine protein -,Urine protein +/-,Urine protein +1,Urine protein +2,Urine protein +3,Urine protein +4,dental caries - nonpresent,dental caries - present
0,55,165,60,81.0,0.5,0.6,135,87,94,172,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,70,165,65,89.0,0.6,0.7,146,83,147,194,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,20,170,75,81.0,0.4,0.5,118,75,79,178,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,35,180,95,105.0,1.5,1.2,131,88,91,180,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,30,165,60,80.5,1.5,1.0,121,76,91,155,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
# Fix the seed so the fitted forest is reproducible across kernel restarts. Clones of
# this estimator (e.g. those made by cross-validation) inherit the same seed, so any
# score derived from it is reproducible as well.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf = clf.fit(X_trans, y)

In [26]:
# 10-fold cross-validated accuracy of the forest on the encoded training features;
# the folds are evaluated in parallel worker processes.
scores = cross_val_score(
    estimator=clf,
    X=X_trans,
    y=y,
    scoring='accuracy',
    cv=10,
    n_jobs=15,
)
# Mean accuracy across the folds.
scores.mean()

np.float64(0.7749974925238018)