# Creating a classification model to predict whether a person makes over $50k a year



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [4]:
df.shape

(32560, 15)


# Renaming coloumns

In [5]:
df.rename(columns={'39':'age', ' State-gov':'workclass', ' 77516':'fnlwgt', ' Bachelors':'education', ' 13':'education_num', ' Never-married':'marital_status',
       ' Adm-clerical':'occupation', ' Not-in-family':'relationship', ' White':'race', ' Male':'sex', ' 2174':'capital_gain', ' 0':'capital_loss',
       ' 40':'hours_per_week', ' United-States':'native_country',' <=50K':'income',},inplace=True)

In [6]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

In [7]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


# Remove handle null values (if any)

In [8]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [9]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

In [10]:
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32560.0,32560.0,32560.0,32560.0,32560.0,32560.0
mean,38.581634,189781.8,10.08059,1077.615172,87.306511,40.437469
std,13.640642,105549.8,2.572709,7385.402999,402.966116,12.347618
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117831.5,9.0,0.0,0.0,40.0
50%,37.0,178363.0,10.0,0.0,0.0,40.0
75%,48.0,237054.5,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


# Dropping Education- Education No. is enough, Final Weight- Highly Discrete Data so not useful

In [11]:
df = df.drop(['education', 'fnlwgt'], axis = 1)
df.head(1)

Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [12]:
cat_cols = [i for i in df.columns if df[i].dtypes=='object']
print(cat_cols)

['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income']


# label encoding

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
lb = LabelEncoder()
for i in cat_cols:
    df[i] = lb.fit_transform(df[i])

In [16]:
df.dtypes

age               int64
workclass         int32
education_num     int64
marital_status    int32
occupation        int32
relationship      int32
race              int32
sex               int32
capital_gain      int64
capital_loss      int64
hours_per_week    int64
native_country    int32
income            int32
dtype: object

In [17]:
df.head()

Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,50,6,13,2,4,0,4,1,0,0,13,39,0
1,38,4,9,0,6,1,4,1,0,0,40,39,0
2,53,4,7,2,6,0,2,1,0,0,40,39,0
3,28,4,13,2,10,5,2,0,0,0,40,5,0
4,37,4,14,2,4,5,4,0,0,0,40,39,0


# Splitting data into training and test data.

In [18]:
x = df.drop(['income'], axis = 1)
y = df['income']

# Applying the following models on the training dataset and generating the predicted value for the test dataset
a. Decision Tree
b. Random Forest Classifier
c. Logistic Regression
d. KNN Classifier
e. SVC Classifier (with linear kernel)

# Calculating  Confusion matrix and classification report

# Precision, Recall, F1-score and Accuracy for each model based on values from confusion_matrix and classification_report


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2) 

In [20]:
def gen_cls_metrics(ytest,ypred):
    print("Accuracy score",accuracy_score(ytest,ypred))
    cm=confusion_matrix(ytest,ypred)
    print('Confusion Matrix:')
    print(cm)
    print('classsification Report:')
    print(classification_report(ytest,ypred))
   
def train_test_score(model):
    print('Training score',model.score(x_train,y_train))
    print('Testing score',model.score(x_test,y_test))

# a.Decision Tree classifier

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [23]:
print(x_train.shape)
print(y_train.shape)

(26048, 12)
(26048,)


In [24]:
m1=DecisionTreeClassifier(criterion='gini',max_depth=10)
m1.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=10)

In [25]:
train_test_score(m1)

Training score 0.8710457616707616
Testing score 0.8542690417690417


In [26]:
ypred_m1=m1.predict(x_test)
print(ypred_m1)

[0 0 0 ... 1 0 0]


In [27]:
print('Metrics for DecisionTreeClasssifier:')
gen_cls_metrics(y_test,ypred_m1)

Metrics for DecisionTreeClasssifier:
Accuracy score 0.8542690417690417
Confusion Matrix:
[[4666  270]
 [ 679  897]]
classsification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      4936
           1       0.77      0.57      0.65      1576

    accuracy                           0.85      6512
   macro avg       0.82      0.76      0.78      6512
weighted avg       0.85      0.85      0.85      6512



# b)Random Forest Classsifier

In [21]:
m2 = RandomForestClassifier(n_estimators=70,criterion='gini',max_depth=10)
m2.fit(x_train,y_train)

RandomForestClassifier(max_depth=10, n_estimators=70)

In [22]:
train_test_score(m2)

Training score 0.8701627764127764
Testing score 0.860411547911548


In [23]:
ypredm2=m2.predict(x_test)
print(ypredm2)

[0 0 0 ... 0 0 0]


In [24]:
print('Random Forest Classsifier Metrics:')
gen_cls_metrics(y_test,ypredm2)

Random Forest Classsifier Metrics:
Accuracy score 0.860411547911548
Confusion Matrix:
[[4695  241]
 [ 668  908]]
classsification Report:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      4936
           1       0.79      0.58      0.67      1576

    accuracy                           0.86      6512
   macro avg       0.83      0.76      0.79      6512
weighted avg       0.85      0.86      0.85      6512



# c) Logistic Regfression

In [25]:
m4=LogisticRegression(max_iter=5000)
m4.fit(x_train,y_train)

LogisticRegression(max_iter=5000)

In [26]:
train_test_score(m4)

Training score 0.8245546683046683
Testing score 0.8283169533169533


In [27]:
ypredm4=m4.predict(x_test)
print(ypredm4)

[0 0 0 ... 0 0 0]


In [28]:
print("Logistic Regresssion metrics:")
gen_cls_metrics(y_test,ypredm4)

Logistic Regresssion metrics:
Accuracy score 0.8283169533169533
Confusion Matrix:
[[4633  303]
 [ 815  761]]
classsification Report:
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      4936
           1       0.72      0.48      0.58      1576

    accuracy                           0.83      6512
   macro avg       0.78      0.71      0.73      6512
weighted avg       0.82      0.83      0.82      6512



# d)KNN

In [29]:
m3=KNeighborsClassifier(n_neighbors=35)
m3.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=35)

In [30]:
train_test_score(m3)

Training score 0.8532708845208845
Testing score 0.8433660933660934


In [31]:
ypredm3=m3.predict(x_test)
print(ypredm3)

[0 0 0 ... 0 0 0]


In [32]:
print("KNN classifier Metrics:")
gen_cls_metrics(y_test,ypredm3)

KNN classifier Metrics:
Accuracy score 0.8433660933660934
Confusion Matrix:
[[4577  359]
 [ 661  915]]
classsification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      4936
           1       0.72      0.58      0.64      1576

    accuracy                           0.84      6512
   macro avg       0.80      0.75      0.77      6512
weighted avg       0.84      0.84      0.84      6512



# e)SVM classifier

In [None]:
m5 = SVC(kernel='linear',C=0.1)
m5.fit(x_train,y_train)

In [None]:
train_test_score(m5)

In [None]:
ypredm5=m5.predict(X_test)
print(ypredm5)

In [None]:
print('SVM classsifier metrics:')
gen_cls_metrics(y_test,ypredm5)

# CONCLUSION

Accuracy in different models is as follows: 
    Decision Tree Classsifier = 0.851965601965602
    Random Forest Classsifier = 0.8570331695331695
    Logistic Regression =0.8267813267813268
    KNN classsifier =0.8433660933660934
    SVM Classifier =not executed

# From the above accuracy scores , we can say that " Random Forest Classsifier " model is best for classsification.
