In [5]:
# import libraries
from numpy import mean
from numpy import std
from numpy import hstack
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

# reading csv files
df =  pd.read_csv('adult.data', sep=",", header=None, skipinitialspace=True)
df2 = pd.read_csv('adult.test', sep=",", header=None, skipinitialspace=True)

# Join the data and test files together
df = pd.concat([df, df2])

# Shuffle the rows
df = df.sample(frac=1).reset_index(drop=True)

# Replace all of ? with None
df = df.replace(['?'], [None])
# Drop all rows with None in them
df = df.dropna(axis=0)

# Check no None values remain
df.isnull().sum()

#Adding column headers to our data 
df.columns = ["Age", "Workclass", "Fnlwgt", "Education", "Education-num", "Marital-status", "Occupation", "Relationship", "Race", "Sex", "Capital-gain", "Capital-loss", "Hours-per-week", "Native-country", "Income"]
# Workclass, Fnlwgt, Race and Native-country are not worth using.
# Education = Education num, so drop Education
df = df.drop(columns=['Workclass', 'Race', 'Fnlwgt', 'Native-country', 'Education'])


In [6]:
#Numerically encoding occupation: Occupaiton is grouped into white collar and blue collar 
occupation_mapping_dict = {
    "Tech-support" : 0,
    "Craft-repair" : 1, 
    "Other-service" : 1, #Wasn't sure about blue or white collar for other services 
    "Sales" : 0, 
    "Exec-managerial" : 0, 
    "Prof-specialty" : 0, 
    "Handlers-cleaners" : 1, 
    "Machine-op-inspct" : 1, 
    "Adm-clerical" : 0, 
    "Farming-fishing" : 1, 
    "Transport-moving" : 1, 
    "Priv-house-serv" : 1, 
    "Protective-serv" : 1, 
    "Armed-Forces" : 1
    }

df["Occupation"] = df["Occupation"].map(occupation_mapping_dict)


#Numerically encoding the sex variable 
sex_mapping_dict = {
    "Male" : 0,
    "Female" : 1
    }

df["Sex"] = df["Sex"].map(sex_mapping_dict)


#Encoding income variable
income_mapping_dict = {
    "<=50K" : 0,
    ">50K" : 1, 
    "<=50K." : 0, 
    ">50K." : 1
    }

df["Income"] = df["Income"].map(income_mapping_dict)


# FOR MODELS
# Group ages into discrete bins for models
bins = [10,20,30,40,50,60,70,80,90]
names = ['0', '1', '2', '3', '4', '5', '6', '7']
df['Age'] = pd.cut(df['Age'], bins, labels = names)



df

Unnamed: 0,Age,Education-num,Marital-status,Occupation,Relationship,Sex,Capital-gain,Capital-loss,Hours-per-week,Income
0,5,13,Married-civ-spouse,0,Husband,0,0,0,50,1
1,2,10,Never-married,1,Own-child,0,0,0,40,0
2,0,6,Never-married,1,Unmarried,0,0,0,30,0
3,1,12,Married-civ-spouse,0,Husband,0,0,0,40,1
4,4,9,Never-married,0,Not-in-family,0,0,0,40,0
...,...,...,...,...,...,...,...,...,...,...
48837,3,13,Never-married,0,Not-in-family,0,0,0,40,0
48838,4,14,Widowed,0,Not-in-family,1,0,0,40,0
48839,4,9,Married-civ-spouse,1,Husband,0,0,0,40,1
48840,2,10,Never-married,0,Unmarried,1,0,0,35,0


In [7]:
#Will now employ one-hot encoding for :  Marital Status and Relationship ; no order in their values
df = pd.get_dummies(df, columns = ['Relationship', 'Marital-status'])

In [8]:
# Split X and y
X = df.iloc[:, [0,1,2, 3, 4, 5, 6,8,9,10,11,12,13,14,15,16,17,18,19,20]]
y = df.iloc[:, [7]]

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size=0.20)

In [9]:
#Importing necessary modules
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
#Defining our model 
model = LogisticRegression()

#Initialising scaler
scaler = StandardScaler()

#For logistic regresion, need to scale our data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

#Need to transform our y data as
y_train = y_train.ravel()
Y_test = y_test.ravel()

#Define all of our hyperparameters 
solvers = ['newton-cg', 'liblinear', 'lbfgs','newton-cholesky', 'sag', 'saga']
penalty = ['l1', 'l2', 'elasticnet']
c_values = [1000, 10, 1, 0.1, 0.001, 0.001]

#Defining our search space 
space = dict(solver = solvers, penalty = penalty, C = c_values)

#Defining our cross validation
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

#Initialising our grid search
grid_search = GridSearchCV(estimator = model, param_grid = space, n_jobs = -1,cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train_scaled, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

1980 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

---------------------

Best: 0.843335 using {'C': 1000, 'penalty': 'l2', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 1000, 'penalty': 'l1', 'solver': 'newton-cg'}
0.843326 (0.006208) with: {'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 1000, 'penalty': 'l1', 'solver': 'lbfgs'}
0.000000 (0.000000) with: {'C': 1000, 'penalty': 'l1', 'solver': 'newton-cholesky'}
0.000000 (0.000000) with: {'C': 1000, 'penalty': 'l1', 'solver': 'sag'}
0.843317 (0.006202) with: {'C': 1000, 'penalty': 'l1', 'solver': 'saga'}
0.843335 (0.006204) with: {'C': 1000, 'penalty': 'l2', 'solver': 'newton-cg'}
0.843335 (0.006204) with: {'C': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
0.843335 (0.006208) with: {'C': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}
0.000000 (0.000000) with: {'C': 1000, 'penalty': 'l2', 'solver': 'newton-cholesky'}
0.843335 (0.006204) with: {'C': 1000, 'penalty': 'l2', 'solver': 'sag'}
0.843317 (0.006202) with: {'C': 1000, 'penalty': 'l2', 'solver': 'saga'}
0.000000 

In [10]:
#Testing results for optimal hyperparameter combination, with scaled vs unscaled data 

model = LogisticRegression(solver = 'saga', penalty='l1', C = 0.1)

model.fit(X_train_scaled, y_train)

train_acc_scaled = model.score(X_train_scaled, y_train)

print('Accuracy for scaled date: ', train_acc_scaled)

model.fit(X_train, y_train)

train_acc = model.score(X_train, y_train)

print('Accuracy for unscaled data: ', train_acc)

Accuracy for scaled date:  0.8435746468750864
Accuracy for unscaled data:  0.7791967272023661


  X = check_array(X, **check_params)


In [11]:
#Importing necessary libraries 
from sklearn.decomposition import PCA

#Want to use datasets of different dimensions 

model = LogisticRegression(solver = 'saga', penalty='l1', C = 0.1)

model.fit(X_train_scaled, y_train)

original_acc = model.score(X_train_scaled, y_train)

print(X_train_scaled.shape)

#From above, we get that dimension of X_train_scaled is 36177x20

#Want to use PCA to reduce to 15, 10 and 5 and see if that has any effect 
pca = PCA(n_components=15)
X_train_15 = pca.fit_transform(X_train_scaled)
model.fit(X_train_15, y_train)
acc_15 = model.score(X_train_15, y_train)

pca = PCA(n_components=10)
X_train_10 = pca.fit_transform(X_train_scaled)
model.fit(X_train_10, y_train)
acc_10 = model.score(X_train_10, y_train)

pca = PCA(n_components=5)
X_train_5 = pca.fit_transform(X_train_scaled)
model.fit(X_train_5, y_train)
acc_5 = model.score(X_train_5, y_train)


print('Original accuracy: ', original_acc)
print('Accuracy for data reduced to 15 dimensions: ', acc_15)
print('Accuracy for data reduced to 10 dimensions: ', acc_10)
print('Accuracy for data reduced to 5 dimensions: ', acc_5)

(36177, 20)
Original accuracy:  0.8435746468750864
Accuracy for data reduced to 15 dimensions:  0.8432982281560107
Accuracy for data reduced to 10 dimensions:  0.8318821350581861
Accuracy for data reduced to 5 dimensions:  0.83138458136385


In [None]:
#From above, can see that the dimensionality reduction doesn't have much of an effect on overall accuracy
#From results above, can see that errors are generated for many of the cases. This means that for that particular 'solver', convergence isn't acheived
#Optimal set of hyperparameters is generated 