In [1]:
import pandas as pd
import numpy as np
# Data Loading
dataset = pd.read_csv('adult.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [2]:
# Data Pre-processing
dataset = dataset.fillna(np.nan)
# Drop the data I don't want to use
dataset.drop(labels=["workclass","fnlwgt", "education","occupation","relationship","race","native.country"], axis = 1, inplace = True)
# Reformat Column We Are Predicting: 0 means less than 50K. 1 means greater than 50K.
dataset['income']=dataset['income'].map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1})
# Convert Sex value to 0 and 1
dataset["sex"] = dataset["sex"].map({"Male": 0, "Female":1})
# Create Married Column - Binary Yes(1) or No(0)
dataset["marital.status"] = dataset["marital.status"].replace(['Never-married','Divorced','Separated','Widowed'], 'Single')
dataset["marital.status"] = dataset["marital.status"].replace(['Married-civ-spouse','Married-spouse-absent','Married-AF-spouse'], 'Married')
dataset["marital.status"] = dataset["marital.status"].map({"Married":1, "Single":0})
dataset["marital.status"] = dataset["marital.status"].astype(int)
array = dataset.values
X = array[:,0:7]
Y = array[:,7]
print(X.shape)
print(Y.shape)

(32561, 7)
(32561,)


In [5]:
# Data Splitting
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X,Y,train_size=0.7,random_state=2021,stratify=Y)
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(22792, 7)
(22792,)
(9769, 7)
(9769,)


In [7]:
# logistic regression
# D1F
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=2021).fit(train_x,train_y)
print('prediction class')
print(lr.predict([test_x[2021]]))
print('prediciton probability')
print(lr.predict_proba([test_x[2021]]))

prediction class
[0]
prediciton probability
[[0.75335648 0.24664352]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# D2F
print("The training accuracy is ", lr.score(train_x, train_y))
print("The test accuracy is ", lr.score(test_x, test_y))

The training accuracy is  0.8216479466479466
The test accuracy is  0.8199406285187839


In [10]:
from sklearn.metrics import precision_score, recall_score, f1_score
train_pred = lr.predict(train_x)
print("The precision on train set is ", precision_score(train_y, train_pred))
print("The recall on train set is ", recall_score(train_y, train_pred))
print("The f1 score on train set is ", f1_score(train_y, train_pred))
test_pred = lr.predict(test_x)
print("The precision on test set is ", precision_score(test_y, test_pred))
print("The recall on test set is ", recall_score(test_y, test_pred))
print("The f1 score on test set is ", f1_score(test_y, test_pred))

The precision on train set is  0.7348284960422163
The recall on train set is  0.40590271451994897
The f1 score on train set is  0.5229433165121464
The precision on test set is  0.7329143754909663
The recall on test set is  0.39668367346938777
The f1 score on test set is  0.5147586206896552


In [12]:
# Support Vector Machine
# D1F
from sklearn.svm import SVC
svm = SVC(random_state=2021,probability=True).fit(train_x,train_y)
print('prediction class')
print(svm.predict([test_x[2021]]))
print('prediciton probability')
print(svm.predict_proba([test_x[2021]]))

prediction class
[0]
prediciton probability
[[0.81288558 0.18711442]]


In [13]:
# D2F
print("The training accuracy is ", svm.score(train_x, train_y))
print("The test accuracy is ", svm.score(test_x, test_y))

The training accuracy is  0.8016409266409267
The test accuracy is  0.8039717473641109


In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score
train_pred = svm.predict(train_x)
print("The precision on train set is ", precision_score(train_y, train_pred))
print("The recall on train set is ", recall_score(train_y, train_pred))
print("The f1 score on train set is ", f1_score(train_y, train_pred))
test_pred = svm.predict(test_x)
print("The precision on test set is ", precision_score(test_y, test_pred))
print("The recall on test set is ", recall_score(test_y, test_pred))
print("The f1 score on test set is ", f1_score(test_y, test_pred))

The precision on train set is  0.7333654773384763
The recall on train set is  0.2770996538531609
The f1 score on train set is  0.40222134073780247
The precision on test set is  0.7567567567567568
The recall on test set is  0.27380952380952384
The f1 score on test set is  0.40212300967842646
