In [11]:
import json as json
import os

import joblib
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

### Load data

In [2]:
# Read the pre-processed train and test splits; the `id` column becomes the row index.
train_path = '../data/salary.train.processed.csv'
test_path = '../data/salary.test.processed.csv'
train_df = pd.read_csv(train_path, index_col='id')
test_df = pd.read_csv(test_path, index_col='id')

- Split features (`X`) and labels (`y`) for the train and test sets

In [3]:
# Separate the target column (`label`) from the feature matrix of the training split.
y_train = train_df['label']
X_train = train_df.drop(columns='label')
X_train.head()

Unnamed: 0_level_0,age-group,education-num,native-country,capitalgain,capitalloss,hoursperweek,occupation_prof-specialty,occupation_tech-support,occupation_exec-managerial,occupation_machine-op-inspct,...,relationship_wife,relationship_own-child,relationship_unmarried,race_white,race_amer-indian-eskimo,race_black,race_asian-pac-islander,race_other,sex_male,sex_female
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26890,3.0,16.0,5,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
933,3.0,14.0,5,4.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
25596,2.0,10.0,5,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
12949,2.0,13.0,5,0.0,0.0,3.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
6681,1.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:
# Separate the target column (`label`) from the feature matrix of the test split.
y_test = test_df['label']
X_test = test_df.drop(columns='label')
X_test.head()

Unnamed: 0_level_0,age-group,education-num,native-country,capitalgain,capitalloss,hoursperweek,occupation_prof-specialty,occupation_tech-support,occupation_exec-managerial,occupation_machine-op-inspct,...,relationship_wife,relationship_own-child,relationship_unmarried,race_white,race_amer-indian-eskimo,race_black,race_asian-pac-islander,race_other,sex_male,sex_female
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12849,1.0,13.0,5,2.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1460,0.0,1.0,3,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
13594,2.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
14400,3.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
14333,2.0,9.0,5,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


---

### Default model

- Train a baseline logistic regression with default parameters, except:
```
`max_iter = 1000`,
`random_state = 42`
```

In [5]:
# Baseline logistic regression: fixed seed, with the iteration cap set to 1000.
logreg_params = {'max_iter': 1000, 'random_state': 42}
log_reg = LogisticRegression(**logreg_params)
log_reg.fit(X_train, y_train)

In [6]:
# Persist the fitted baseline model so it can be reloaded without retraining.
# The target folder is created first: on a fresh checkout `./model` may not
# exist yet, and the dump would fail instead of writing the file.
model_path = './model/base_Logreg_model.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(
    value = log_reg,
    filename = model_path
)

['./model/base_Logreg_model.joblib']

### Evaluation

- Calculate and add prediction column

In [7]:
# Preview the raw predictions on the test features.
# X_test (features only) is used instead of re-dropping `label` from test_df:
# a later cell adds a `prediction` column to test_df, so re-deriving the
# features from it on a re-run would pass an extra non-feature column.
log_reg.predict(X_test)

array([1., 0., 0., ..., 1., 1., 0.])

In [8]:
# Attach the model's predictions to the test frame for export and scoring.
# Predict from X_test (features only) so the cell is idempotent: dropping only
# `label` from test_df would, on a second run, feed the previously added
# `prediction` column back into the model as if it were a feature.
test_df['prediction'] = log_reg.predict(X_test)
test_df.head(5)

Unnamed: 0_level_0,age-group,education-num,native-country,capitalgain,capitalloss,hoursperweek,occupation_prof-specialty,occupation_tech-support,occupation_exec-managerial,occupation_machine-op-inspct,...,relationship_unmarried,race_white,race_amer-indian-eskimo,race_black,race_asian-pac-islander,race_other,sex_male,sex_female,label,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12849,1.0,13.0,5,2.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
1460,0.0,1.0,3,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
13594,2.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
14400,3.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
14333,2.0,9.0,5,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0


In [9]:
# Export the test frame (features, label and prediction) for later inspection.
# The output folder is created first: on a fresh checkout `./results` may not
# exist yet, and the write would fail instead of producing the file.
predictions_path = './results/predictions.base.model.csv'
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
test_df.to_csv(predictions_path)

- Calculate confusion matrix

In [13]:
# Cross-tabulate the true labels against the model's predictions on the test set.
true_labels = test_df['label']
predicted_labels = test_df['prediction']
confusion_matrix(y_true=true_labels, y_pred=predicted_labels)

array([[2048,  368],
       [ 383, 1381]], dtype=int64)

In [14]:
# Per-class precision / recall / F1 as a dict, then shown as one row per class or average.
# NOTE(review): sklearn documents `digits` as ignored when output_dict=True;
# it is kept here so the call matches the original exactly.
report_kwargs = {'digits': 6, 'output_dict': True}
report_scores = sklearn.metrics.classification_report(
    y_true=test_df['label'],
    y_pred=test_df['prediction'],
    **report_kwargs,
)
# Build the frame column-wise first (the scalar `accuracy` entry is broadcast), then flip it.
df_score = pd.DataFrame(report_scores).T
df_score

Unnamed: 0,precision,recall,f1-score,support
0.0,0.842452,0.847682,0.845059,2416.0
1.0,0.789594,0.78288,0.786223,1764.0
accuracy,0.820335,0.820335,0.820335,0.820335
macro avg,0.816023,0.815281,0.815641,4180.0
weighted avg,0.820145,0.820335,0.820229,4180.0
