In [13]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,precision_score,accuracy_score

In [2]:
df = pd.read_csv("adult.csv")

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df = df.replace(" ?",np.nan)

In [5]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df.workclass = imputer.fit_transform(df['workclass'].values.reshape(-1,1))[:,0]
df.occupation = imputer.fit_transform(df['occupation'].values.reshape(-1,1))[:,0]
df.country = imputer.fit_transform(df['country'].values.reshape(-1,1))[:,0]
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
df["salary"] = df["salary"].replace({" <=50K":0," >50K":1})

In [7]:
df = df.drop(["fnlwgt"],axis=1)
df = df.drop(["education"],axis=1)

In [9]:
X = df.drop(columns=['salary'],axis=1)
y = df['salary']

In [10]:
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
ohe_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", ohe_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [11]:
X = preprocessor.fit_transform(X)

In [12]:
X.shape

(32561, 88)

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((26048, 88), (6513, 88))

In [23]:
models = {
    "Logistic Regression":LogisticRegression(max_iter=300),
    "DecisionTree Classifier":DecisionTreeClassifier(),
    "RandomForestClassifier":RandomForestClassifier(),
    "AdaBoostClassifier":AdaBoostClassifier(),
    "GradientBoostingClassifier":GradientBoostingClassifier(),
    "SVC":SVC()
    
}
model_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    accuracy_train_data = accuracy_score(y_train,y_train_pred)
    accuracy_test_data = accuracy_score(y_test,y_test_pred)
    
    confusion_matrix_train = confusion_matrix(y_train,y_train_pred)
    confusion_matrix_test = confusion_matrix(y_test,y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    print('Model performance for Training set')
    print("Accuracy: {}".format(accuracy_train_data))
    print("ConfusionMatrix: {}".format(confusion_matrix_train))
    
    print('----------------------------------')
    
    print('Model performance for Training set')
    print("Accuracy: {}".format(accuracy_test_data))
    print("ConfusionMatrix: {}".format(confusion_matrix_test))
    
    print('='*35)
    print('\n')
    

Logistic Regression
Model performance for Training set
Accuracy: 0.8511977886977887
ConfusionMatrix: [[18418  1360]
 [ 2516  3754]]
----------------------------------
Model performance for Training set
Accuracy: 0.8558268079226163
ConfusionMatrix: [[4610  332]
 [ 607  964]]


DecisionTree Classifier
Model performance for Training set
Accuracy: 0.9785780098280098
ConfusionMatrix: [[19702    76]
 [  482  5788]]
----------------------------------
Model performance for Training set
Accuracy: 0.8231229847996315
ConfusionMatrix: [[4359  583]
 [ 569 1002]]


RandomForestClassifier
Model performance for Training set
Accuracy: 0.9785780098280098
ConfusionMatrix: [[19584   194]
 [  364  5906]]
----------------------------------
Model performance for Training set
Accuracy: 0.8524489482573315
ConfusionMatrix: [[4559  383]
 [ 578  993]]


AdaBoostClassifier
Model performance for Training set
Accuracy: 0.859490171990172
ConfusionMatrix: [[18573  1205]
 [ 2455  3815]]
--------------------------------