In [116]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [117]:
df=pd.read_csv("Data/census_data.csv")
df.head()

Unnamed: 0,Age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,makes over
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [118]:
def whitespace_remover(dataframe):
   
    # iterating over the columns
    for i in dataframe.columns:
         
        # checking datatype of each columns
        if dataframe[i].dtype == 'object':
             
            # applying strip function on column
            dataframe[i] = dataframe[i].map(str.strip)
        else:
             
            # if condition is False then it will do nothing.
            pass
 
# applying whitespace_remover function on dataframe
whitespace_remover(df)

In [119]:
df.duplicated().sum()

24

In [120]:
df[df.duplicated()]

Unnamed: 0,Age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,makes over
4880,25,Private,308144,Bachelors,13,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,Mexico,<=50K
5103,90,Private,52386,Some-college,10,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,0,0,35,United-States,<=50K
9170,21,Private,250051,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,10,United-States,<=50K
11630,20,Private,107658,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,10,United-States,<=50K
13083,25,Private,195994,1st-4th,2,Never-married,Priv-house-serv,Not-in-family,White,Female,0,0,40,Guatemala,<=50K
15058,21,Private,243368,Preschool,1,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,50,Mexico,<=50K
17039,46,Private,173243,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
18554,30,Private,144593,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,?,<=50K
18697,19,Private,97261,HS-grad,9,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,40,United-States,<=50K
21317,19,Private,138153,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,10,United-States,<=50K


In [121]:
df.drop_duplicates(keep='first',inplace=True)

In [122]:
df.shape


(32536, 15)

In [123]:
for col in df.columns:
        print(col,df[col].unique())

Age [50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 39 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68
 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86
 87]
workclass ['Self-emp-not-inc' 'Private' 'State-gov' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']
fnlwgt [ 83311 215646 234721 ...  34066  84661 257302]
education ['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']
education-num [13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8]
marital-status ['Married-civ-spouse' 'Divorced' 'Married-spouse-absent' 'Never-married'
 'Separated' 'Married-AF-spouse' 'Widowed']
occupation ['Exec-managerial' 'Handlers-cleaners' 'Prof-specialty' 'Other-service'
 'Adm-clerical' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-

In [124]:
df["native-country"].unique()

array(['United-States', 'Cuba', 'Jamaica', 'India', '?', 'Mexico',
       'South', 'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany',
       'Iran', 'Philippines', 'Italy', 'Poland', 'Columbia', 'Cambodia',
       'Thailand', 'Ecuador', 'Laos', 'Taiwan', 'Haiti', 'Portugal',
       'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala',
       'China', 'Japan', 'Yugoslavia', 'Peru',
       'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago',
       'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary',
       'Holand-Netherlands'], dtype=object)

In [125]:
makes_over_map = {'<=50K':0,'>50K':1}

df['makes over'] = df['makes over'].map(makes_over_map)

In [126]:
df.replace('?',np.nan,inplace=True)

In [127]:
df.drop(labels=['fnlwgt','capital-loss'],axis=1,inplace=True)

In [128]:
df

Unnamed: 0,Age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,hours-per-week,native-country,makes over
0,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,13,United-States,0
1,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,40,United-States,0
2,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,40,United-States,0
3,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,40,Cuba,0
4,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,40,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,38,United-States,0
32556,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,40,United-States,1
32557,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,40,United-States,0
32558,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,20,United-States,0


In [129]:
#Independent and dependent features
X = df.drop(labels=['makes over'],axis=1)
y= df[['makes over']]

In [130]:
y

Unnamed: 0,makes over
0,0
1,0
2,0
3,0
4,0
...,...
32555,0
32556,1
32557,0
32558,0


In [131]:
X

Unnamed: 0,Age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,hours-per-week,native-country
0,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,13,United-States
1,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,40,United-States
2,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,40,United-States
3,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,40,Cuba
4,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...
32555,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,38,United-States
32556,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,40,United-States
32557,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,40,United-States
32558,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,20,United-States


In [132]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [133]:
categorical_cols

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')

In [134]:
numerical_cols

Index(['Age', 'education-num', 'capital-gain', 'hours-per-week'], dtype='object')

In [135]:
num_pipeline = Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
    ]
)

## Categorical Pipeline

cat_pipeline = Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(sparse_output=False)),
    ('scaler',StandardScaler())
    ]
)

preprossor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])



In [136]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [137]:
X_train = pd.DataFrame(preprossor.fit_transform(X_train),columns=preprossor.get_feature_names_out())

In [138]:
X_test = pd.DataFrame(preprossor.transform(X_test),columns=preprossor.get_feature_names_out())

In [139]:
X_train.shape

(22775, 103)

In [140]:
X_train.head()

Unnamed: 0,num_pipeline__Age,num_pipeline__education-num,num_pipeline__capital-gain,num_pipeline__hours-per-week,cat_pipeline__workclass_Federal-gov,cat_pipeline__workclass_Local-gov,cat_pipeline__workclass_Never-worked,cat_pipeline__workclass_Private,cat_pipeline__workclass_Self-emp-inc,cat_pipeline__workclass_Self-emp-not-inc,...,cat_pipeline__native-country_Portugal,cat_pipeline__native-country_Puerto-Rico,cat_pipeline__native-country_Scotland,cat_pipeline__native-country_South,cat_pipeline__native-country_Taiwan,cat_pipeline__native-country_Thailand,cat_pipeline__native-country_Trinadad&Tobago,cat_pipeline__native-country_United-States,cat_pipeline__native-country_Vietnam,cat_pipeline__native-country_Yugoslavia
0,-1.365004,-0.031996,-0.146308,-0.043786,-0.17583,-0.265718,-0.014818,0.576691,-0.190182,-0.291174,...,-0.033807,-0.055921,-0.020959,-0.045957,-0.039789,-0.024801,-0.025672,0.305229,-0.047374,-0.020959
1,-0.412388,-0.421682,-0.146308,0.036968,-0.17583,-0.265718,-0.014818,0.576691,-0.190182,-0.291174,...,-0.033807,-0.055921,-0.020959,-0.045957,-0.039789,-0.024801,-0.025672,-3.276224,-0.047374,-0.020959
2,-0.558944,1.137062,-0.146308,-0.043786,-0.17583,-0.265718,-0.014818,0.576691,-0.190182,-0.291174,...,-0.033807,-0.055921,-0.020959,-0.045957,-0.039789,-0.024801,-0.025672,0.305229,-0.047374,-0.020959
3,-1.365004,-1.201054,-0.06707,-1.658869,-0.17583,-0.265718,-0.014818,0.576691,-0.190182,-0.291174,...,-0.033807,-0.055921,-0.020959,-0.045957,-0.039789,-0.024801,-0.025672,0.305229,-0.047374,-0.020959
4,1.785956,-0.421682,0.143698,-0.043786,-0.17583,-0.265718,-0.014818,0.576691,-0.190182,-0.291174,...,-0.033807,-0.055921,-0.020959,-0.045957,-0.039789,-0.024801,-0.025672,0.305229,-0.047374,-0.020959


In [141]:
X_test.head()

Unnamed: 0,num_pipeline__Age,num_pipeline__education-num,num_pipeline__capital-gain,num_pipeline__hours-per-week,cat_pipeline__workclass_Federal-gov,cat_pipeline__workclass_Local-gov,cat_pipeline__workclass_Never-worked,cat_pipeline__workclass_Private,cat_pipeline__workclass_Self-emp-inc,cat_pipeline__workclass_Self-emp-not-inc,...,cat_pipeline__native-country_Portugal,cat_pipeline__native-country_Puerto-Rico,cat_pipeline__native-country_Scotland,cat_pipeline__native-country_South,cat_pipeline__native-country_Taiwan,cat_pipeline__native-country_Thailand,cat_pipeline__native-country_Trinadad&Tobago,cat_pipeline__native-country_United-States,cat_pipeline__native-country_Vietnam,cat_pipeline__native-country_Yugoslavia
0,-0.998613,-1.980426,-0.146308,0.359984,-0.17583,-0.265718,-0.014818,0.576691,-0.190182,-0.291174,...,-0.033807,-0.055921,-0.020959,-0.045957,-0.039789,-0.024801,-0.025672,0.305229,-0.047374,-0.020959
1,1.126453,-0.421682,-0.146308,-0.043786,5.687313,-0.265718,-0.014818,-1.73403,-0.190182,-0.291174,...,-0.033807,-0.055921,-0.020959,-0.045957,-0.039789,-0.024801,-0.025672,0.305229,-0.047374,-0.020959
2,0.46695,-1.980426,-0.146308,-0.043786,-0.17583,3.763392,-0.014818,-1.73403,-0.190182,-0.291174,...,-0.033807,-0.055921,-0.020959,-0.045957,-0.039789,-0.024801,-0.025672,0.305229,-0.047374,-0.020959
3,-0.558944,-0.031996,-0.146308,-0.043786,-0.17583,-0.265718,-0.014818,0.576691,-0.190182,-0.291174,...,-0.033807,-0.055921,-0.020959,-0.045957,-0.039789,-0.024801,-0.025672,0.305229,-0.047374,-0.020959
4,2.738572,1.526748,-0.146308,-0.851328,-0.17583,-0.265718,-0.014818,-1.73403,-0.190182,3.434377,...,-0.033807,-0.055921,-0.020959,-0.045957,-0.039789,-0.024801,-0.025672,0.305229,-0.047374,-0.020959


In [142]:
parameter = {'penalty':['l1', 'l2', 'elasticnet', 'None'],'C':[1,2,0.1,0.001,0.05,0.5],'max_iter':[100,200,300]}

In [143]:
classifier = LogisticRegression()

In [144]:

from sklearn.model_selection import GridSearchCV
classifier_regressor=GridSearchCV(classifier,param_grid=parameter,scoring='accuracy',cv=5)

In [145]:
classifier.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [146]:
classifier_regressor.fit(X_train,y_train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [147]:
print(classifier_regressor.best_params_)

{'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}


In [148]:
print(classifier_regressor.best_score_)

0.8492206366630077


In [149]:
y_pred = classifier_regressor.predict(X_test)

In [150]:
accuracy_score(y_test,y_pred)

0.8487859850425161

In [151]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90      7455
           1       0.72      0.59      0.65      2306

    accuracy                           0.85      9761
   macro avg       0.80      0.76      0.78      9761
weighted avg       0.84      0.85      0.84      9761



In [152]:
confusion_matrix(y_test,y_pred)

array([[6923,  532],
       [ 944, 1362]], dtype=int64)