In [1]:
import pandas as pd
import numpy as np
df=pd.read_csv('F:\DATA SCIENCE\Projects\Adult Census project\\notebooks\Data\\adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
df=df.drop(labels=['race','occupation','education','country','relationship'],axis=1)

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,sex,capital-gain,capital-loss,hours-per-week,salary
0,39,State-gov,77516,13,Never-married,Male,2174,0,40,<=50K
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Male,0,0,13,<=50K
2,38,Private,215646,9,Divorced,Male,0,0,40,<=50K
3,53,Private,234721,7,Married-civ-spouse,Male,0,0,40,<=50K
4,28,Private,338409,13,Married-civ-spouse,Female,0,0,40,<=50K


In [4]:
df['salary'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [5]:
categorical_columns=['workclass', 'marital-status','sex','salary']
for feature in categorical_columns:
    df[feature]=df[feature].str.replace(" ","")
for col in ['workclass']:
    df[col].replace('?',df[col].mode()[0],inplace=True)

def workclasses(data):
    if data['workclass'] in ['Federal-gov','Local-gov','State-gov']:
        return 'govt'
    elif data['workclass'] in ['Self-emp-not-inc','Self-emp-inc']:
        return 'self_employed'
    elif data['workclass']=='Private':
        return 'Private'
    else:
        return 'without_pay'
    
def mar_status(status):
    unmarried=['Never-married','Divorced','Separated',
       'Widowed']
    if status in unmarried:
        return 'unmarried'
    else :
        return 'married'
    
df['marital-status']=df['marital-status'].apply(mar_status)

df['employment_info']=df.apply(workclasses,axis=1)

df['salary']=df['salary'].map({'<=50K':0,'>50K':1})

## Independant and dependant features


In [6]:
X=df.drop(labels=['salary','workclass'],axis=1)
Y=df[['salary']]
X.head(25)

Unnamed: 0,age,fnlwgt,education-num,marital-status,sex,capital-gain,capital-loss,hours-per-week,employment_info
0,39,77516,13,unmarried,Male,2174,0,40,govt
1,50,83311,13,married,Male,0,0,13,self_employed
2,38,215646,9,unmarried,Male,0,0,40,Private
3,53,234721,7,married,Male,0,0,40,Private
4,28,338409,13,married,Female,0,0,40,Private
5,37,284582,14,married,Female,0,0,40,Private
6,49,160187,5,married,Female,0,0,16,Private
7,52,209642,9,married,Male,0,0,45,self_employed
8,31,45781,14,unmarried,Female,14084,0,50,Private
9,42,159449,13,married,Male,5178,0,40,Private


Seperating the numerical and categorical columns for encoding and standardisation

In [7]:
categorical_cols=X.select_dtypes(include='object').columns
numerical_columns=X.select_dtypes(exclude='object').columns

In [8]:
print(categorical_cols)
print(numerical_columns)

Index(['marital-status', 'sex', 'employment_info'], dtype='object')
Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')


from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
for features in categorical_cols:
    X[features]=encoder.fit_transform(X[features])

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [10]:
marital_status_map=['unmarried','married']
sex_map=['Female','Male']
employement_map=['without_pay','Private','govt','self_employed']

In [11]:
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoder',OrdinalEncoder(categories=[marital_status_map,sex_map,employement_map])),
        ('scaler',StandardScaler())
    ]
)

preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_columns),
    ('cat_pipeline',cat_pipeline,categorical_cols)
]
)


In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.33,random_state=42)

In [13]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [14]:
X_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__fnlwgt,num_pipeline__education-num,num_pipeline__capital-gain,num_pipeline__capital-loss,num_pipeline__hours-per-week,cat_pipeline__marital-status,cat_pipeline__sex,cat_pipeline__employment_info
0,0.91247,-0.627549,1.515545,-0.146699,-0.221712,0.780288,1.051617,0.702032,-0.530902
1,0.546034,-0.390675,-0.422711,-0.146699,-0.221712,-0.033203,1.051617,0.702032,-0.530902
2,-0.992997,0.733596,1.127894,-0.146699,-0.221712,1.187034,-0.950917,-1.424437,-0.530902
3,-0.846422,-1.443666,0.740243,-0.146699,-0.221712,-0.033203,-0.950917,0.702032,-0.530902
4,0.619321,-0.712335,1.515545,0.941566,-0.221712,0.373542,1.051617,0.702032,0.945028


In [16]:
y_train.head()

Unnamed: 0,salary
27585,1
18010,0
22472,0
4585,0
11191,1


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score


In [20]:
def evaluate_model(true,predicted):
    conf_matrix=confusion_matrix(true,predicted)
    f1=f1_score(true,predicted)
    accuracy=accuracy_score(true,predicted)
    return conf_matrix,f1,accuracy


In [23]:
models={
    'LogisticRegression':LogisticRegression(),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
    'RandomForest':RandomForestClassifier(),
    'AdaBoost':AdaBoostClassifier(),
    'GradientBoost':GradientBoostingClassifier(),
    'KNN':KNeighborsClassifier(),
    'SVMClassifier':SVC()
}
trained_model_list=[]
model_list=[]
accuracy_list=[]

for i in range(len(models)):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    # make predictions
    y_pred=model.predict(X_test)
    conf_mat,f1,acc=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Training Performance")
    print('confusion matrix: ',conf_mat)
    print('Accuracy: ',acc)
    print('f1-score: ',f1)

    accuracy_list.append(acc)
    print('='*35)
    print('\n')



  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


LogisticRegression
Model Training Performance
confusion matrix:  [[7634  562]
 [1161 1389]]
Accuracy:  0.8396612693095105
f1-score:  0.6171961786269717


DecisionTreeClassifier
Model Training Performance
confusion matrix:  [[7091 1105]
 [1014 1536]]
Accuracy:  0.8028103480364787
f1-score:  0.591793488730495


RandomForest
Model Training Performance
confusion matrix:  [[7547  649]
 [1018 1532]]
Accuracy:  0.8448725107016565
f1-score:  0.6476432043965336




  y = column_or_1d(y, warn=True)


AdaBoost
Model Training Performance
confusion matrix:  [[7667  529]
 [1023 1527]]
Accuracy:  0.8555741671319561
f1-score:  0.6630481980026054




  y = column_or_1d(y, warn=True)


GradientBoost
Model Training Performance
confusion matrix:  [[7809  387]
 [1077 1473]]
Accuracy:  0.8637632607481853
f1-score:  0.6680272108843537




  return self._fit(X, y)


KNN
Model Training Performance
confusion matrix:  [[7457  739]
 [1113 1437]]
Accuracy:  0.8276568025311744
f1-score:  0.6081252644942869




  y = column_or_1d(y, warn=True)


SVMClassifier
Model Training Performance
confusion matrix:  [[7754  442]
 [1176 1374]]
Accuracy:  0.8494323469197841
f1-score:  0.6294090700870363




In [24]:
accuracy_list

[0.8396612693095105,
 0.8028103480364787,
 0.8448725107016565,
 0.8555741671319561,
 0.8637632607481853,
 0.8276568025311744,
 0.8494323469197841]