In [59]:
pip install SMOTE

Note: you may need to restart the kernel to use updated packages.


In [60]:
import json
import os
import warnings

import joblib
import pandas
import sklearn.metrics
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

### Load data

In [61]:
# Read the train and test CSV files and index both frames by their 'id' column.
train_csv = '../data/salary.train.processed.csv'
test_csv = '../data/salary.test.processed.csv'

data_train_full = pandas.read_csv(train_csv).set_index('id')
data_test_rf = pandas.read_csv(test_csv).set_index('id')

# Preview the first five training rows.
data_train_full.head(5)

Unnamed: 0_level_0,age-group,education-num,native-country,capitalgain,capitalloss,hoursperweek,occupation_prof-specialty,occupation_tech-support,occupation_exec-managerial,occupation_machine-op-inspct,...,relationship_own-child,relationship_unmarried,race_white,race_amer-indian-eskimo,race_black,race_asian-pac-islander,race_other,sex_male,sex_female,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26890,3.0,16.0,5,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
933,3.0,14.0,5,4.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
25596,2.0,10.0,5,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12949,2.0,13.0,5,0.0,0.0,3.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
6681,1.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Train model

- Train the model with the best parameters listed below, on SMOTE-resampled training data (no `class_weight` is passed to the classifier)
```
'n_estimators': 350,
 'max_depth': 13,
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'criterion': 'entropy',
 'max_features': 0.475336315954348
 ```

In [62]:
# Separate the target column from the feature matrix.
y_full = data_train_full['label']
X_full = data_train_full.drop(columns=['label'])

In [63]:
# Report the size of the training set and how many rows fall in each class.
original_shape = X_full.shape
original_counts = y_full.value_counts()
print(f"Original training data shape: {original_shape}")
print(f"Original label distribution:\n{original_counts}")

Original training data shape: (16720, 48)
Original label distribution:
label
0.0    9719
1.0    7001
Name: count, dtype: int64


- SMOTE

As shown above, the class labels are imbalanced (9,719 vs. 7,001 rows), so we use SMOTE to oversample the minority class and balance the training set.


In [64]:
# `n_jobs` is deprecated on SMOTE since imbalanced-learn 0.10 and slated for removal; it only
# parallelised the neighbour search, so leaving it out does not change the resampled data.
smote = SMOTE(random_state=42)

# Silence FutureWarnings for this step only, rather than muting every warning
# for the remainder of the kernel session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=FutureWarning)
    X_resampled, y_resampled = smote.fit_resample(X_full, y_full)

In [65]:
# Report the size of the resampled training set and its per-class row counts.
resampled_shape = X_resampled.shape
resampled_counts = y_resampled.value_counts()
print(f"New resampled training data shape: {resampled_shape}")
print(f"New resampled label distribution:\n{resampled_counts}")

New resampled training data shape: (19438, 48)
New resampled label distribution:
label
1.0    9719
0.0    9719
Name: count, dtype: int64


### Best parameters

In [66]:
# Random-forest hyper-parameters, unpacked into RandomForestClassifier in the next cell.
best_rf_params = dict(
    n_estimators=350,
    max_depth=13,
    min_samples_leaf=2,
    min_samples_split=6,
    criterion='entropy',
    max_features=0.475336315954348,
)

In [67]:
# Build the forest from the tuned hyper-parameters and fit it on the SMOTE-balanced data.
# fit() returns the estimator itself, so the construction and training can be chained.
rf_model_SMOTE = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    **best_rf_params,
).fit(X_resampled, y_resampled)

# Show the fitted estimator as the cell output.
rf_model_SMOTE

### Calculate and add prediction column

In [68]:
# Build the feature matrix from the test frame. Besides the target, also drop a
# 'prediction' column left over from a previous run of this cell, so re-running
# the cell does not feed the old predictions to the model as an extra feature.
X_test = (
    data_test_rf
    .drop(columns=['label'])
    .drop(columns=['prediction'], errors='ignore')
)
data_test_rf['prediction'] = rf_model_SMOTE.predict(X_test)


In [69]:
# Inspect the test frame, which now carries the 'prediction' column next to 'label'.
data_test_rf

Unnamed: 0_level_0,age-group,education-num,native-country,capitalgain,capitalloss,hoursperweek,occupation_prof-specialty,occupation_tech-support,occupation_exec-managerial,occupation_machine-op-inspct,...,relationship_unmarried,race_white,race_amer-indian-eskimo,race_black,race_asian-pac-islander,race_other,sex_male,sex_female,label,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12849,1.0,13.0,5,2.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1460,0.0,1.0,3,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
13594,2.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
14400,3.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
14333,2.0,9.0,5,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21121,3.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
12348,2.0,13.0,5,4.0,0.0,2.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
22298,3.0,13.0,5,0.0,0.0,2.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
15636,1.0,10.0,5,3.0,0.0,3.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0


In [70]:
# Write the test frame (features, label and prediction) to disk.
predictions_path = './results/predictions.rf.SMOTE.csv'

# Create the output folder first so the cell also works when it does not exist yet.
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
data_test_rf.to_csv(predictions_path)

### Evaluate result

In [71]:
# Confusion matrix on the test set: rows are the true classes, columns the predicted ones.
actual_labels = data_test_rf['label']
predicted_labels = data_test_rf['prediction']
sklearn.metrics.confusion_matrix(y_true=actual_labels, y_pred=predicted_labels)

array([[1924,  492],
       [ 254, 1510]], dtype=int64)

In [72]:
# Per-class precision / recall / F1 on the test set. With output_dict=True the report
# is returned as a nested dict; `digits` only affects the text form of the report.
report_scores = classification_report(
    data_test_rf['label'],
    data_test_rf['prediction'],
    digits=6,
    output_dict=True,
)

# One row per class / average, one column per metric.
df_score = pandas.DataFrame(report_scores).T
df_score

Unnamed: 0,precision,recall,f1-score,support
0.0,0.883379,0.796358,0.837614,2416.0
1.0,0.754246,0.856009,0.801912,1764.0
accuracy,0.821531,0.821531,0.821531,0.821531
macro avg,0.818813,0.826183,0.819763,4180.0
weighted avg,0.828884,0.821531,0.822548,4180.0


In [73]:
# Persist the fitted model and its evaluation scores.
model_path = './model/rf_model.SMOTE.joblib'
scores_path = './results/scores.rf.SMOTE.json'

# Create the output folders first so the cell also works when they do not exist yet.
for output_path in (model_path, scores_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

joblib.dump(value=rf_model_SMOTE, filename=model_path)

# Store the classification report dict as indented JSON.
with open(scores_path, 'w') as f:
    json.dump(obj=report_scores, fp=f, indent=4)