### Import relevant libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report

### Read, load, explore and preprocess the data

In [3]:
df = pd.read_csv("income_evaluation.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.shape

(32561, 15)

In [5]:
df.dtypes

age                 int64
 workclass         object
 fnlwgt             int64
 education         object
 education-num      int64
 marital-status    object
 occupation        object
 relationship      object
 race              object
 sex               object
 capital-gain       int64
 capital-loss       int64
 hours-per-week     int64
 native-country    object
 income            object
dtype: object

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlwgt          32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [8]:
df.isnull().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [9]:
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [44]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [77]:
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

### Separate Predictor and Target

In [78]:
y=df[[" income"]]
X= df.drop(" income", axis=1)

In [79]:
print(X.shape)
print(y.shape)

(32561, 14)
(32561, 1)


In [80]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [81]:
y.head()

Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K


In [82]:
y.isnull().sum()

 income    0
dtype: int64

In [83]:
y.isna().sum()

 income    0
dtype: int64

One hot encoding for target

In [84]:
#One hot encoder
ohe = OneHotEncoder(drop= "first", sparse = False)

In [85]:
ohe.fit(y)



In [86]:
y_ohe= pd.DataFrame(ohe.transform(y).astype(int))

In [87]:
y_ohe.tail()

Unnamed: 0,0
32556,0
32557,1
32558,0
32559,0
32560,1


Making Predictor dataframe suitable

In [88]:
X.dtypes

age                 int64
 workclass         object
 fnlwgt             int64
 education         object
 education-num      int64
 marital-status    object
 occupation        object
 relationship      object
 race              object
 sex               object
 capital-gain       int64
 capital-loss       int64
 hours-per-week     int64
 native-country    object
dtype: object

In [89]:
X_numeric = X.select_dtypes(include=[np.number])
X_cat = X.select_dtypes(exclude=[np.number])    

In [90]:
num_si = SimpleImputer(strategy='mean')
num_si.fit(X_numeric)

In [91]:
X_imp_num = pd.DataFrame(num_si.transform(X_numeric), columns = X_numeric.columns)
X_imp_num.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,39.0,77516.0,13.0,2174.0,0.0,40.0
1,50.0,83311.0,13.0,0.0,0.0,13.0
2,38.0,215646.0,9.0,0.0,0.0,40.0
3,53.0,234721.0,7.0,0.0,0.0,40.0
4,28.0,338409.0,13.0,0.0,0.0,40.0


In [92]:
cat_si = SimpleImputer(strategy='most_frequent')
cat_si.fit(X_cat)

In [93]:
X_imp_cat = pd.DataFrame(cat_si.transform(X_cat), columns = X_cat.columns)
X_imp_cat.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [94]:
scaler = StandardScaler()
scaler.fit(X_imp_num,)

In [95]:
X_sca_imp_num = pd.DataFrame(scaler.transform(X_imp_num), columns = X_imp_num.columns)

In [96]:
X_sca_imp_num.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153
2,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429


In [108]:
X_imp_cat.columns

Index([' workclass', ' education', ' marital-status', ' occupation',
       ' relationship', ' race', ' sex', ' native-country'],
      dtype='object')

In [109]:
for col in X_imp_cat.columns:
    X_imp_cat= pd.get_dummies(X_imp_cat,col, drop_first=True)

In [111]:
X_imp_cat.head()

Unnamed: 0,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,workclass_ 11th,workclass_ 12th,...,workclass_ Portugal,workclass_ Puerto-Rico,workclass_ Scotland,workclass_ South,workclass_ Taiwan,workclass_ Thailand,workclass_ Trinadad&Tobago,workclass_ United-States,workclass_ Vietnam,workclass_ Yugoslavia
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [112]:
X_final = pd.concat([X_sca_imp_num, X_imp_cat], axis = 1)
X_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 100 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   age                                     32561 non-null  float64
 1    fnlwgt                                 32561 non-null  float64
 2    education-num                          32561 non-null  float64
 3    capital-gain                           32561 non-null  float64
 4    capital-loss                           32561 non-null  float64
 5    hours-per-week                         32561 non-null  float64
 6    workclass_ Federal-gov                 32561 non-null  uint8  
 7    workclass_ Local-gov                   32561 non-null  uint8  
 8    workclass_ Never-worked                32561 non-null  uint8  
 9    workclass_ Private                     32561 non-null  uint8  
 10   workclass_ Self-emp-inc                32561 non-null  u

### Split the data

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_ohe, test_size = 0.2)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(26048, 100)
(6513, 100)
(26048, 1)
(6513, 1)


### Lgositic Regression

In [116]:
lr=LogisticRegression()

In [117]:
lr.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [119]:
y_pred=lr.predict(X_test)

In [120]:
prediction=pd.DataFrame()
prediction["y actual"]=y_test
prediction["y_pred"]=y_pred
prediction.head()

Unnamed: 0,y actual,y_pred
4257,1,1
6739,0,1
2342,0,0
12066,1,0
22124,0,0


In [155]:
cr= classification_report(y_test,y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.88      0.93      0.91      4950
           1       0.74      0.59      0.66      1563

    accuracy                           0.85      6513
   macro avg       0.81      0.76      0.78      6513
weighted avg       0.84      0.85      0.85      6513



### Applying grid search

In [134]:
grid = {
    "penalty" : ["elasticnet"],
    "solver": ['saga'],
    "max_iter":[150,200],
    "l1_ratio":[0.1,0.5]
    #Add more parameters as per requirement
}

In [135]:
gcv_lr_model= GridSearchCV(estimator=lr, param_grid=grid,cv=5,scoring = 'accuracy', verbose=1 )

In [136]:
gcv_lr_model

In [137]:
gcv_lr_model.fit(X_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [138]:
cv_results = pd.DataFrame(gcv_lr_model.cv_results_)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_l1_ratio,param_max_iter,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,11.967356,0.31255,0.013547,0.004057,0.1,150,elasticnet,saga,"{'l1_ratio': 0.1, 'max_iter': 150, 'penalty': ...",0.84952,0.846449,0.852591,0.854867,0.856402,0.851966,0.003603,1
1,16.317725,0.515248,0.011905,0.003216,0.1,200,elasticnet,saga,"{'l1_ratio': 0.1, 'max_iter': 200, 'penalty': ...",0.84952,0.846449,0.852591,0.854675,0.856402,0.851927,0.003573,3
2,11.086548,0.190871,0.008439,0.007428,0.5,150,elasticnet,saga,"{'l1_ratio': 0.5, 'max_iter': 150, 'penalty': ...",0.849712,0.846065,0.852975,0.854675,0.85621,0.851927,0.00364,4
3,15.051773,0.071594,0.017925,0.011598,0.5,200,elasticnet,saga,"{'l1_ratio': 0.5, 'max_iter': 200, 'penalty': ...",0.849712,0.846065,0.852975,0.854867,0.85621,0.851966,0.00367,2


In [139]:
cv_results[cv_results["rank_test_score"]<5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_l1_ratio,param_max_iter,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,11.967356,0.31255,0.013547,0.004057,0.1,150,elasticnet,saga,"{'l1_ratio': 0.1, 'max_iter': 150, 'penalty': ...",0.84952,0.846449,0.852591,0.854867,0.856402,0.851966,0.003603,1
1,16.317725,0.515248,0.011905,0.003216,0.1,200,elasticnet,saga,"{'l1_ratio': 0.1, 'max_iter': 200, 'penalty': ...",0.84952,0.846449,0.852591,0.854675,0.856402,0.851927,0.003573,3
2,11.086548,0.190871,0.008439,0.007428,0.5,150,elasticnet,saga,"{'l1_ratio': 0.5, 'max_iter': 150, 'penalty': ...",0.849712,0.846065,0.852975,0.854675,0.85621,0.851927,0.00364,4
3,15.051773,0.071594,0.017925,0.011598,0.5,200,elasticnet,saga,"{'l1_ratio': 0.5, 'max_iter': 200, 'penalty': ...",0.849712,0.846065,0.852975,0.854867,0.85621,0.851966,0.00367,2


In [140]:
gcv_lr_model.best_estimator_

In [142]:
lr_new= LogisticRegression(penalty="elasticnet", l1_ratio=0.1,max_iter=150, solver="saga")
lr_new.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [144]:
y_pred_new=lr_new.predict(X_test)
prediction_new=pd.DataFrame()
prediction_new["Y_actual"]=y_test
prediction_new["Y_predicted"]=y_pred_new

In [148]:
cm=confusion_matrix(y_test,y_pred_new)
cm

array([[4619,  331],
       [ 638,  925]], dtype=int64)

In [154]:
cr= classification_report(y_test,y_pred_new)
print(cr)

              precision    recall  f1-score   support

           0       0.88      0.93      0.91      4950
           1       0.74      0.59      0.66      1563

    accuracy                           0.85      6513
   macro avg       0.81      0.76      0.78      6513
weighted avg       0.84      0.85      0.85      6513



#### Observations:

From the classification report we see that the accuracy is approx 85%.

The same results were obtained without grid search also.

More tuning to the parameters need to be done for better accuracy

### Random Forest

In [146]:
rf=RandomForestClassifier()

In [164]:
grid={"max_depth":[3,5],
     "bootstrap":[True,False],
     "n_estimators":[10,50]}

In [166]:
gcv_rf_model=GridSearchCV(estimator=rf, param_grid=grid, cv=3, verbose=1)

In [167]:
gcv_rf_model.fit(X_train,y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

In [168]:
gcv_rf_model.best_estimator_

In [170]:
cv_results= pd.DataFrame(gcv_rf_model.cv_results_)
cv_results[cv_results["rank_test_score"]<5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.200796,0.006752,0.023304,0.005232,True,5,10,"{'bootstrap': True, 'max_depth': 5, 'n_estimat...",0.83508,0.832201,0.82504,0.830774,0.004221,3
3,0.845663,0.011347,0.087753,0.008257,True,5,50,"{'bootstrap': True, 'max_depth': 5, 'n_estimat...",0.834274,0.840263,0.84416,0.839566,0.004066,1
6,0.231057,0.007478,0.022955,0.005818,False,5,10,"{'bootstrap': False, 'max_depth': 5, 'n_estima...",0.837499,0.817344,0.832297,0.829047,0.008543,4
7,0.920963,0.029182,0.085582,0.002509,False,5,50,"{'bootstrap': False, 'max_depth': 5, 'n_estima...",0.836577,0.833353,0.837941,0.835957,0.001924,2


In [172]:
#using the best model parameterd

rf_new=RandomForestClassifier(max_depth=5, n_estimators=50)

In [174]:
rf_new.fit(X_train,y_train)

  rf_new.fit(X_train,y_train)


In [175]:
y_pred=rf_new.predict(X_test)

In [176]:
cr= classification_report(y_test,y_pred)

In [177]:
print(cr)

              precision    recall  f1-score   support

           0       0.84      0.98      0.90      4950
           1       0.85      0.42      0.56      1563

    accuracy                           0.84      6513
   macro avg       0.85      0.70      0.73      6513
weighted avg       0.84      0.84      0.82      6513



#### Observations

Accuracy of 84 % obtained. Further tuning of hyper parameters such as max depth, leaf nodes, n_estimators etc may lead to improvement in accuracy