In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings("ignore")
np.random.seed(100)
%matplotlib inline

In [2]:
df=pd.read_csv("adult.csv",na_values=['#NAME?'])
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,,0,0,40,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df.isnull().sum()

age                48
workclass           0
fnlwgt            107
education           0
education_num      57
marital_status      0
occupation          0
relationship        0
race              264
sex                47
capital_gain        0
capital_loss        0
hours_per_week      0
native_country      0
income              0
dtype: int64

In [4]:
df.workclass.unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', '?', 'Self-emp-inc', 'Without-pay'], dtype=object)

In [5]:
df['workclass']=df['workclass'].replace('?','Unknown')
df['education']=df['education'].replace('?','Unknown')
df['education_num']=df['education_num'].fillna(0)

In [6]:
print(df['education'].unique())
print(df['education_num'].unique())

['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Unknown' 'Prof-school' '5th-6th'
 '10th' '1st-4th' 'Preschool' '12th']
[13.  9.  7. 14.  5. 10. 12. 11.  4. 16.  0. 15.  3.  6.  2.  1.  8.]


In [7]:
df=df.dropna(axis=0,subset=['sex','age'])

In [8]:
df.isna().sum()

age                 0
workclass           0
fnlwgt            106
education           0
education_num       0
marital_status      0
occupation          0
relationship        0
race              259
sex                 0
capital_gain        0
capital_loss        0
hours_per_week      0
native_country      0
income              0
dtype: int64

In [9]:
df.race.value_counts()

White                 3948
Black                  479
Asian-Pac-Islander     144
Amer-Indian-Eskimo      47
Other                   28
Name: race, dtype: int64

In [10]:
df['race']=df['race'].fillna(df.race.value_counts().index[0])

In [11]:
df.isna().sum()

age                 0
workclass           0
fnlwgt            106
education           0
education_num       0
marital_status      0
occupation          0
relationship        0
race                0
sex                 0
capital_gain        0
capital_loss        0
hours_per_week      0
native_country      0
income              0
dtype: int64

In [12]:
df['fnlwgt']=df['fnlwgt'].fillna(df['fnlwgt'].mean())

In [13]:
b=df['education_num'].unique()

for i,y in (zip(df['education'],df['education_num'])):
    if(i == 'Unknown'):
        a = 0
    elif(i == 'Preschool'):
        a = 1
    elif(i == '1st-4th'):
        a = 2
    elif(i == '5th-6th'):
        a = 3
    elif(i == '7th-8th'):
        a = 4
    elif(i == '9th'):
        a = 5
    elif(i == '10th'):
        a = 6
    elif(i == '11th'):
        a = 7
    elif(i == '12th'):
        a = 8
    elif(i == 'HS-grad'):
        a = 9
    elif(i == 'Some-college'):
        a = 10
    elif(i == 'Assoc-voc'):
        a = 11
    elif(i == 'Assoc-acdm'):
        a = 12
    elif(i == 'Bachelors'):
        a = 13
    elif(i == 'Masters'):
        a = 14
    elif('Prof-school'):
        a = 15
    elif('Doctorate'):
        a = 16
        
    if(a != int(y)):
        print (1)

In [14]:
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [15]:
print(df['workclass'].unique())
print(df['occupation'].unique())
print(df['native_country'].unique())
print(df['relationship'].unique())
print(df['race'].unique())
print(df['sex'].unique())

['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov'
 'Unknown' 'Self-emp-inc' 'Without-pay']
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Machine-op-inspct' 'Farming-fishing' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']
['United-States' 'Cuba' 'Jamaica' 'India' '?' 'Mexico' 'South'
 'Puerto-Rico' 'Honduras' 'England' 'Canada' 'Germany' 'Iran'
 'Philippines' 'Italy' 'Poland' 'Columbia' 'Cambodia' 'Thailand' 'Ecuador'
 'Laos' 'Taiwan' 'Haiti' 'Portugal' 'Dominican-Republic' 'France'
 'El-Salvador' 'Guatemala' 'China' 'Japan' 'Yugoslavia' 'Peru'
 'Outlying-US(Guam-USVI-etc)' 'Scotland' 'Trinadad&Tobago' 'Greece'
 'Nicaragua' 'Vietnam' 'Hong' 'Ireland']
['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']
['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']
['Male' 'Female']


In [16]:
df=df.drop("education_num",axis=1)

In [17]:
df['occupation']=df['occupation'].replace('?','Unknown')
df['native_country']=df['native_country'].replace('?','Unknown')

In [18]:
print(df['native_country'].unique())

['United-States' 'Cuba' 'Jamaica' 'India' 'Unknown' 'Mexico' 'South'
 'Puerto-Rico' 'Honduras' 'England' 'Canada' 'Germany' 'Iran'
 'Philippines' 'Italy' 'Poland' 'Columbia' 'Cambodia' 'Thailand' 'Ecuador'
 'Laos' 'Taiwan' 'Haiti' 'Portugal' 'Dominican-Republic' 'France'
 'El-Salvador' 'Guatemala' 'China' 'Japan' 'Yugoslavia' 'Peru'
 'Outlying-US(Guam-USVI-etc)' 'Scotland' 'Trinadad&Tobago' 'Greece'
 'Nicaragua' 'Vietnam' 'Hong' 'Ireland']


In [19]:
categor=pd.get_dummies(df[['workclass','education','marital_status','native_country','race','sex','relationship','occupation']])

In [20]:
data = pd.concat((df.drop(['workclass','education','marital_status','native_country','race','sex','relationship','occupation'],axis=1),categor),axis=1)

In [21]:
y=data.pop('income')

In [22]:
y.head()

0    <=50K
1    <=50K
2    <=50K
4    <=50K
5    <=50K
Name: income, dtype: object

## Scaling the data

In [23]:
std=StandardScaler()
scaled=std.fit_transform(data)

In [24]:
SX_train,SX_test,sy_train,sy_test=train_test_split(scaled,y,test_size=0.2)

## Logistic Regression

In [25]:
X_train,X_test,y_train,y_test=train_test_split(data,y,test_size=0.2)

In [26]:
log=LogisticRegression()
log.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
y_pred=log.predict(X_test)

In [28]:
print("Accuracy Score in Logistic Regression:",accuracy_score(y_test,y_pred))

Accuracy Score in Logistic Regression: 0.7798165137614679


### With Scaling

In [29]:
log.fit(SX_train,sy_train)
sy_pred=log.predict(SX_test)

In [30]:
print("Accuracy Score in Logistic Regression with Scaling:",accuracy_score(sy_test,sy_pred))

Accuracy Score in Logistic Regression with Scaling: 0.8379204892966361


## Decision Tree

In [31]:
dt=DecisionTreeClassifier()
X_train,X_test,y_train,y_test=train_test_split(data,y,test_size=0.2)

In [32]:
dt.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [33]:
y_pred=dt.predict(X_test)
print("Accuracy Score in Decision Tree:",accuracy_score(y_test,y_pred))

Accuracy Score in Decision Tree: 0.7900101936799184


### With Scaling

In [34]:
log.fit(SX_train,sy_train)
y_pred=log.predict(SX_test)

In [35]:
print("Accuracy Score in Decision Tree with Scaling:",accuracy_score(sy_test,sy_pred))

Accuracy Score in Decision Tree with Scaling: 0.8379204892966361


## Random Forest

In [36]:
rf=RandomForestClassifier()
X_train,X_test,y_train,y_test=train_test_split(data,y,test_size=0.2)

In [37]:
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)

In [38]:
print("Accuracy Score in  Random Forest:",accuracy_score(y_test,y_pred))

Accuracy Score in  Random Forest: 0.8501529051987767


### With Scaling

In [39]:
log.fit(SX_train,sy_train)
sy_pred=log.predict(SX_test)

In [40]:
print("Accuracy Score in Random Forest with Scaling:",accuracy_score(sy_test,sy_pred))

Accuracy Score in Random Forest with Scaling: 0.8379204892966361


## kNN Classifier

In [41]:
X_train,X_test,y_train,y_test=train_test_split(data,y,test_size=0.2)
knn=KNeighborsClassifier()
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [42]:
y_pred=knn.predict(X_test)

In [43]:
print("Accuracy Score in kNN:",accuracy_score(y_test,y_pred))

Accuracy Score in kNN: 0.7543323139653415


### With Scaling

In [44]:
log.fit(SX_train,sy_train)
sy_pred=log.predict(SX_test)

In [45]:
print("Accuracy Score in kNN with Scaling:",accuracy_score(sy_test,sy_pred))

Accuracy Score in kNN with Scaling: 0.8379204892966361


## Tuning the parameters:

In [46]:
def fit_knn (n_neighbors):
    knn=KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(SX_train,sy_train)
    sy_pred=knn.predict(SX_test)
    print(accuracy_score(sy_test,sy_pred))

In [47]:
for i in range(1,50):
    print("Accuracy using K =",i,"\t")
    fit_knn(i)

Accuracy using K = 1 	
0.7298674821610601
Accuracy using K = 2 	
0.764525993883792
Accuracy using K = 3 	
0.7828746177370031
Accuracy using K = 4 	
0.7828746177370031
Accuracy using K = 5 	
0.7910295616717635
Accuracy using K = 6 	
0.7859327217125383
Accuracy using K = 7 	
0.7869520897043832
Accuracy using K = 8 	
0.7900101936799184
Accuracy using K = 9 	
0.7940876656472987
Accuracy using K = 10 	
0.8053007135575942
Accuracy using K = 11 	
0.8093781855249745
Accuracy using K = 12 	
0.8022426095820592
Accuracy using K = 13 	
0.8042813455657493
Accuracy using K = 14 	
0.7961264016309888
Accuracy using K = 15 	
0.7981651376146789
Accuracy using K = 16 	
0.7951070336391437
Accuracy using K = 17 	
0.7961264016309888
Accuracy using K = 18 	
0.7961264016309888
Accuracy using K = 19 	
0.7981651376146789
Accuracy using K = 20 	
0.7981651376146789
Accuracy using K = 21 	
0.8083588175331294
Accuracy using K = 22 	
0.7981651376146789
Accuracy using K = 23 	
0.8103975535168195
Accuracy using K = 24

In [48]:
def fit_predict(train,test,y_train,y_test,n_estimators,max_depth=None,min_samples_leaf=1):
    rf=RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,min_samples_leaf=min_samples_leaf)
    rf.fit(train,y_train)
    y_pred=rf.predict(test)
    print("Accuracy Score:",accuracy_score(y_test,y_pred))

In [49]:
for i in [400,600,800,1000]:
    print("Accuracy Score with estimators:",i)
    fit_predict(X_train,X_test,y_train,y_test,n_estimators=i)

Accuracy Score with estimators: 400
Accuracy Score: 0.8328236493374108
Accuracy Score with estimators: 600
Accuracy Score: 0.8267074413863404
Accuracy Score with estimators: 800
Accuracy Score: 0.8277268093781855
Accuracy Score with estimators: 1000
Accuracy Score: 0.8318042813455657


In [50]:
for i in [500,550,600,650,700,750,800,850,900,950,1000]:
    print("Accuracy Score with estimators:",i)
    fit_predict(X_train,X_test,y_train,y_test,n_estimators=i)

Accuracy Score with estimators: 500
Accuracy Score: 0.8256880733944955
Accuracy Score with estimators: 550
Accuracy Score: 0.8277268093781855
Accuracy Score with estimators: 600
Accuracy Score: 0.8277268093781855
Accuracy Score with estimators: 650
Accuracy Score: 0.8287461773700305
Accuracy Score with estimators: 700
Accuracy Score: 0.8307849133537207
Accuracy Score with estimators: 750
Accuracy Score: 0.8277268093781855
Accuracy Score with estimators: 800
Accuracy Score: 0.8297655453618756
Accuracy Score with estimators: 850
Accuracy Score: 0.8297655453618756
Accuracy Score with estimators: 900
Accuracy Score: 0.8287461773700305
Accuracy Score with estimators: 950
Accuracy Score: 0.8246687054026504
Accuracy Score with estimators: 1000
Accuracy Score: 0.8277268093781855


In [51]:
for i in [400,430,450,480,510]:
    print("Accuracy Score with estimators:",i)
    fit_predict(X_train,X_test,y_train,y_test,n_estimators=i)

Accuracy Score with estimators: 400
Accuracy Score: 0.8287461773700305
Accuracy Score with estimators: 430
Accuracy Score: 0.8297655453618756
Accuracy Score with estimators: 450
Accuracy Score: 0.8256880733944955
Accuracy Score with estimators: 480
Accuracy Score: 0.8287461773700305
Accuracy Score with estimators: 510
Accuracy Score: 0.8287461773700305


In [52]:
for i in [400,402,404,406,408]:
    print("Accuracy Score with estimators:",i)
    fit_predict(X_train,X_test,y_train,y_test,n_estimators=i)

Accuracy Score with estimators: 400
Accuracy Score: 0.8287461773700305
Accuracy Score with estimators: 402
Accuracy Score: 0.8297655453618756
Accuracy Score with estimators: 404
Accuracy Score: 0.8297655453618756
Accuracy Score with estimators: 406
Accuracy Score: 0.8246687054026504
Accuracy Score with estimators: 408
Accuracy Score: 0.8297655453618756


In [53]:
for i in range(3,25):
    print("Accuracy with Max_depth:",i)
    fit_predict(X_train,X_test,y_train,y_test,n_estimators=406,max_depth=i)

Accuracy with Max_depth: 3
Accuracy Score: 0.7900101936799184
Accuracy with Max_depth: 4
Accuracy Score: 0.8042813455657493
Accuracy with Max_depth: 5
Accuracy Score: 0.8216106014271152
Accuracy with Max_depth: 6
Accuracy Score: 0.8348623853211009
Accuracy with Max_depth: 7
Accuracy Score: 0.836901121304791
Accuracy with Max_depth: 8
Accuracy Score: 0.8419979612640163
Accuracy with Max_depth: 9
Accuracy Score: 0.8419979612640163
Accuracy with Max_depth: 10
Accuracy Score: 0.8481141692150866
Accuracy with Max_depth: 11
Accuracy Score: 0.8470948012232415
Accuracy with Max_depth: 12
Accuracy Score: 0.855249745158002
Accuracy with Max_depth: 13
Accuracy Score: 0.8511722731906218
Accuracy with Max_depth: 14
Accuracy Score: 0.8532110091743119
Accuracy with Max_depth: 15
Accuracy Score: 0.8521916411824668
Accuracy with Max_depth: 16
Accuracy Score: 0.8501529051987767
Accuracy with Max_depth: 17
Accuracy Score: 0.8470948012232415
Accuracy with Max_depth: 18
Accuracy Score: 0.8460754332313966
A

In [54]:
for i in range(1,15):
    print("Accuracy with min_sample_leaf:",i)
    fit_predict(X_train,X_test,y_train,y_test,n_estimators=406,max_depth=13,min_samples_leaf=i)

Accuracy with min_sample_leaf: 1
Accuracy Score: 0.8532110091743119
Accuracy with min_sample_leaf: 2
Accuracy Score: 0.8491335372069317
Accuracy with min_sample_leaf: 3
Accuracy Score: 0.8501529051987767
Accuracy with min_sample_leaf: 4
Accuracy Score: 0.8481141692150866
Accuracy with min_sample_leaf: 5
Accuracy Score: 0.8481141692150866
Accuracy with min_sample_leaf: 6
Accuracy Score: 0.8419979612640163
Accuracy with min_sample_leaf: 7
Accuracy Score: 0.8481141692150866
Accuracy with min_sample_leaf: 8
Accuracy Score: 0.836901121304791
Accuracy with min_sample_leaf: 9
Accuracy Score: 0.8430173292558614
Accuracy with min_sample_leaf: 10
Accuracy Score: 0.835881753312946
Accuracy with min_sample_leaf: 11
Accuracy Score: 0.8389398572884812
Accuracy with min_sample_leaf: 12
Accuracy Score: 0.8419979612640163
Accuracy with min_sample_leaf: 13
Accuracy Score: 0.8389398572884812
Accuracy with min_sample_leaf: 14
Accuracy Score: 0.8399592252803262


In [56]:
print("Accuracy with Random Forest:")
rf=RandomForestClassifier(n_estimators=400)
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
print(accuracy_score(y_test,y_pred))

Accuracy with Random Forest:
0.8256880733944955


## Grid Search CV

In [57]:
from sklearn.model_selection import GridSearchCV
param={'n_estimators':[500,550,600],'max_depth':[18,20,22],'min_sample_leaf':[2,3,4]}

In [58]:
gscv=GridSearchCV(estimator=RandomForestClassifier(),param_grid=param,verbose=3)

In [59]:
gscv.fit(X_train,y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] max_depth=18, min_sample_leaf=2, n_estimators=500 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ValueError: Invalid parameter min_sample_leaf for estimator RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=18, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [60]:
rf_400=RandomForestClassifier(n_estimators=400)
rf_400.fit(X_train,y_train)
y_pred=rf_400.predict(X_test)
print("Accuracy Score:",accuracy_score(y_test,y_pred))

Accuracy Score: 0.8277268093781855


In [61]:
rf_500=RandomForestClassifier(n_estimators=500)
rf_500.fit(X_train,y_train)
y_pred=rf_500.predict(X_test)
print("Accuracy Score:",accuracy_score(y_test,y_pred))

Accuracy Score: 0.8307849133537207


In [62]:
rf_600=RandomForestClassifier(n_estimators=500)
rf_600.fit(X_train,y_train)
y_pred=rf_600.predict(X_test)
print("Accuracy Score:",accuracy_score(y_test,y_pred))

Accuracy Score: 0.8307849133537207
