In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings 
import seaborn as sns
from sklearn.model_selection import cross_val_score

original data

In [2]:
df = pd.read_csv('adult.data')

In [3]:
df.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


modified data for model building

In [4]:
data = pd.read_csv('adult_cleaned.csv')

In [5]:
data.drop(['Unnamed: 0'],axis=1,inplace=True)

In [6]:
data.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
2470,18,5,57413,0,10,4,13.0,5,3,0,0,0,15,1,0
10503,37,1,34996,3,9,3,13.0,3,3,1,0,0,40,1,0
15050,46,5,224202,0,10,2,10.0,4,3,0,0,0,50,1,0
29194,34,5,167893,0,10,2,5.0,4,3,0,0,0,64,1,1
12297,59,5,182062,12,13,2,5.0,4,3,0,5013,0,40,1,0


In [7]:
data.dropna(inplace=True)

In [8]:
x = data.drop(['income'],axis=1)
y = data['income']

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train,X_test,y_train,y_test = train_test_split(x,y,train_size=0.33,random_state=7)

In [11]:
data.shape

(29517, 15)

In [12]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
14791,38,5,275223,0,10,2,10.0,4,3,0,7298,0,40,1
21102,28,5,214858,3,9,2,11.0,4,3,0,0,0,40,1
5954,39,4,106297,3,9,4,3.0,5,3,0,0,0,42,1
23523,47,5,298037,14,11,4,5.0,3,3,1,0,0,44,1
2717,42,5,121264,14,11,2,5.0,4,3,0,0,0,40,1


# random forest classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  accuracy_score

In [14]:
Rf_model = RandomForestClassifier()

In [15]:
Rf_model.fit(X_train,y_train)

RandomForestClassifier()

In [16]:
y_predict_Rf = Rf_model.predict(X_test)

In [17]:
accuracy_score(y_test,y_predict_Rf)

0.8536178389037771

Hyperparameter tuning

In [18]:
grid_param = {
    "n_estimators":[90,100,115,130],
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(2,20,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'max_features':['auto','log2']
}

In [19]:
from sklearn.model_selection import GridSearchCV
grid_search=GridSearchCV(estimator=Rf_model,param_grid=grid_param,cv=3,verbose=2,n_jobs=-1)

In [20]:
grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 20736 candidates, totalling 62208 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 20),
                         'max_features': ['auto', 'log2'],
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10),
                         'n_estimators': [90, 100, 115, 130]},
             verbose=2)

In [21]:
#best parameter 
grid_search.best_params_

{'criterion': 'entropy',
 'max_depth': 17,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 100}

Model with best parameter

In [22]:
Rf_model_best_params = RandomForestClassifier(n_estimators=130,
    criterion= 'entropy',
    max_depth= 15,
    min_samples_leaf= 1,
    min_samples_split= 8,
    max_features='log2')

In [23]:
Rf_model_best_params.fit(X_train,y_train)

RandomForestClassifier(criterion='entropy', max_depth=15, max_features='log2',
                       min_samples_split=8, n_estimators=130)

In [24]:
y_predict_best = Rf_model_best_params.predict(X_test)

In [25]:
accuracy_score(y_test,y_predict_best)

0.8595843656773019

we had increased accuracy by just 1 percent approx

# Bagging Classifier

In [43]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [44]:
model = BaggingClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=100,max_samples=0.25,bootstrap=True, random_state=7).fit(X_train, y_train)

In [45]:
y_test_predict = model.predict(X_test)

In [46]:
accuracy_score(y_test,y_test_predict)

0.8568033574354048

In [47]:
results = cross_val_score(model, X_train, y_train, cv = 20)
print("accuracy :")
print(results.mean())

accuracy :
0.850924024640657


# Extra tree classifier

In [48]:
from sklearn.ensemble import ExtraTreesClassifier

In [49]:
model = ExtraTreesClassifier(n_estimators=100, random_state=7).fit(X_train, y_train)

In [50]:
y_predict_extra = model.predict(X_test)

In [51]:
accuracy_score(y_test,y_predict_extra)

0.8426454972948374

In [52]:
results = cross_val_score(model, X_train, y_train, cv = 20)
print("accuracy :")
print(results.mean())

accuracy :
0.8395277207392198


# voting classifier

In [53]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [54]:
clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = SVC(probability=True)
model1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2) , ('svc',clf3)], voting='hard').fit(X_train,y_train)
model2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),('svc',clf3)],voting='soft').fit(X_train,y_train)

In [55]:
y_predict_model1 = model1.predict(X_test)
y_predict_model2 = model2.predict(X_test)

In [56]:
accuracy_score(y_test,y_predict_model1)

0.8097284724680184

In [57]:
accuracy_score(y_test,y_predict_model2)

0.8113465136269404

In [58]:
results = cross_val_score(model1, X_train, y_train, cv = 20)
print("accuracy :")
print(results.mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

accuracy :
0.8093429158110883
