In [1]:
import pandas as pd
import numpy as np

import pickle

dataset = pd.read_csv('train.csv')



In [2]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

dataset['wage_class'] = le.fit_transform(dataset['wage_class'])

dataset = dataset.replace('?', np.nan)

columns_with_nan = ['workclass', 'occupation', 'native_country']

for col in columns_with_nan:
    dataset[col].fillna(dataset[col].mode()[0], inplace=True)
dataset.head()


Unnamed: 0,Person_Id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,1,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,2,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,3,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,4,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,5,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [3]:
dataset["marital_status"].value_counts()

 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: marital_status, dtype: int64

In [4]:
dataset['age'].value_counts()

36    898
31    888
34    886
23    877
35    876
     ... 
83      6
85      3
88      3
86      1
87      1
Name: age, Length: 73, dtype: int64

In [5]:
dataset = dataset.replace({"marital_status":{'Married-civ-spouse' : 1,
                             'Never-married' : 2,
                             'Divorced' : 3,
                             'Separated' : 4,
                             'Widowed' : 5,
                             'Married-spouse-absent' : 6,
                             'Married-AF-spouse' : 7
                             }})
for col in dataset.columns:
    if dataset[col].dtypes == 'object':
        encoder = LabelEncoder()
        dataset[col] = encoder.fit_transform(dataset[col])



In [6]:
X = dataset.drop('wage_class', axis=1)
Y = dataset['wage_class']

X = X.drop(["Person_Id",'workclass', 'education', 'race', 'sex',
            'capital_loss', 'native_country', 'fnlwgt', 'relationship',
            ], axis=1)

X.head()

Unnamed: 0,age,education_num,marital_status,occupation,capital_gain,hours_per_week
0,39,13,4,1,2174,40
1,50,13,2,4,0,13
2,38,9,0,6,0,40
3,53,7,2,6,0,40
4,28,13,2,10,0,40


In [7]:
X["capital_gain"].value_counts()

0        29849
15024      347
7688       284
7298       246
99999      159
         ...  
1639         1
5060         1
6097         1
1455         1
7978         1
Name: capital_gain, Length: 119, dtype: int64

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X = scaler.fit_transform(X)

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)



In [9]:
ros.fit(X, Y)

X_resampled, Y_resampled = ros.fit_resample(X, Y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, Y_resampled, test_size=0.3, random_state=0)


In [10]:

from sklearn.ensemble import RandomForestClassifier
ran = RandomForestClassifier(random_state=0)

In [11]:
from sklearn.model_selection import RandomizedSearchCV

In [12]:
n_estimators = [int(x) for x in np.linspace(start=40, stop=150, num=15)]
max_depth = [int(x) for x in np.linspace(40, 150, num=15)]

In [13]:
param_dist = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
}

In [14]:
rf_tuned = RandomForestClassifier(random_state=42)

In [15]:
rf_cv = RandomizedSearchCV(
    estimator=rf_tuned, param_distributions=param_dist, cv=5, random_state=42)

In [16]:
rf_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
                   param_distributions={'max_depth': [40, 47, 55, 63, 71, 79,
                                                      87, 95, 102, 110, 118,
                                                      126, 134, 142, 150],
                                        'n_estimators': [40, 47, 55, 63, 71, 79,
                                                         87, 95, 102, 110, 118,
                                                         126, 134, 142, 150]},
                   random_state=42)

In [17]:
rf_cv.best_score_

0.8808367609001759

In [18]:
rf_best = RandomForestClassifier(
    max_depth=102, n_estimators=40, random_state=42)

In [19]:
rf_best.fit(X_train, y_train)

RandomForestClassifier(max_depth=102, n_estimators=40, random_state=42)

In [20]:
Y_pred_rf_best = rf_best.predict(X_test)
Y_pred_rf_best_for_train = rf_best.predict(X_train)

In [21]:
from sklearn.metrics import accuracy_score
print('Random Forest Classifier:')
print("Accourcy score with all feature on test_data: {0:0.4f}".format(accuracy_score(y_test,Y_pred_rf_best)))


Random Forest Classifier:
Accourcy score with all feature on test_data: 0.8877


In [23]:
pickle.dump(rf_best, open('classifier.pkl','wb'))

model = pickle.load(open('classifier.pkl','rb'))

prediction = model.predict(scaler.transform(np.array([[52,9,1,3,15024,40]])))
print(prediction)

[1]
