In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import pickle

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

In [3]:
name = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

In [4]:
dataset = pd.read_csv(url, names = name)

In [5]:
dataset

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [6]:
X = dataset[['age', 'education-num']]
y = dataset['income']

# Split data

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression
model_lgr = LogisticRegression(C=0.01, n_jobs = -1)
print(model_lgr)

LogisticRegression(C=0.01, n_jobs=-1)


In [9]:
model_lgr.fit(X_train,y_train)

LogisticRegression(C=0.01, n_jobs=-1)

In [10]:
model_lgr.coef_

array([[0.04370922, 0.3586883 ]])

In [11]:
model_lgr.intercept_

array([-6.71569476])

In [12]:
ypred_train = model_lgr.predict(X_train)
print(ypred_train[:10])
ypred_train_proba = model_lgr.predict_proba(X_train)
print(ypred_train_proba[:10])

ypred_test = model_lgr.predict(X_test)
print(ypred_test[:10])
ypred_proba_test = model_lgr.predict_proba(X_test)
print(ypred_proba_test[:5,:])

[' <=50K' ' <=50K' ' <=50K' ' >50K' ' >50K' ' <=50K' ' <=50K' ' <=50K'
 ' <=50K' ' <=50K']
[[0.79343368 0.20656632]
 [0.90948404 0.09051596]
 [0.90734763 0.09265237]
 [0.37142768 0.62857232]
 [0.42371266 0.57628734]
 [0.72850685 0.27149315]
 [0.8449359  0.1550641 ]
 [0.96429125 0.03570875]
 [0.98065821 0.01934179]
 [0.89316634 0.10683366]]
[' <=50K' ' <=50K' ' <=50K' ' <=50K' ' <=50K' ' >50K' ' <=50K' ' <=50K'
 ' <=50K' ' <=50K']
[[0.87530077 0.12469923]
 [0.82062465 0.17937535]
 [0.68679536 0.31320464]
 [0.6773173  0.3226827 ]
 [0.86544194 0.13455806]]


In [13]:
# Accuracy on train
print(accuracy_score(y_train, ypred_train))
# Accuracy on test
print(accuracy_score(y_test, ypred_test))

0.7819000819000819
0.7822134872865741


In [14]:
print(model_lgr.score(X_train, y_train))
print(model_lgr.score(X_test, y_test))

0.7819000819000819
0.7822134872865741


In [15]:
print(confusion_matrix(y_test, ypred_test))

[[5836  378]
 [1395  532]]


In [16]:
target_names = ['smaller_that_50', 'more_that_50']
print(classification_report(y_test, ypred_test, target_names=target_names))

                 precision    recall  f1-score   support

smaller_that_50       0.81      0.94      0.87      6214
   more_that_50       0.58      0.28      0.38      1927

       accuracy                           0.78      8141
      macro avg       0.70      0.61      0.62      8141
   weighted avg       0.75      0.78      0.75      8141



# Hyperparameters Grid Search in LR model

In [17]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [.001, .01, 1, 10]}
grid = GridSearchCV(model_lgr, param_grid, cv=5, n_jobs=-1)

In [18]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(C=0.01, n_jobs=-1), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 1, 10]})

In [19]:
grid.best_score_

0.7817362817362816

In [20]:
grid.best_params_

{'C': 0.01}

In [21]:
grid_model = grid.best_estimator_
print(grid_model)

LogisticRegression(C=0.01, n_jobs=-1)


In [22]:
print(grid_model.score(X_test,y_test))
print(grid_model.score(X_train,y_train))

0.7822134872865741
0.7819000819000819


# K-fold Cross-Validation

In [23]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_lgr, X_train, y_train, cv=5)
scores

array([0.77886978, 0.78316953, 0.78112203, 0.77846028, 0.78705979])

In [24]:
print("Mean cv accuracy: %0.4f" % scores.mean())

Mean cv accuracy: 0.7817


# Save and Load LGR model

In [25]:
Pkl_Filename = "lgr_model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(model_lgr, file)

In [26]:
with open(Pkl_Filename, 'rb') as file:  
    Pickled_LR_Model = pickle.load(file)
Pickled_LR_Model

LogisticRegression(C=0.01, n_jobs=-1)