In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import MinMaxScaler

In [2]:
All_data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", names = ['age','workclass','fnlwgt','education','education.num','marital.status','occupation','relationship','race','sex','capital.gain','capital.loss','hours.per.week','native.country','income'],skipinitialspace=True,na_values='?')


All_data = All_data.replace(np.NaN,"?")
All_data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
All_data

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [4]:
All_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
All_data.corr()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
age,1.0,-0.076646,0.036527,0.077674,0.057775,0.068756
fnlwgt,-0.076646,1.0,-0.043195,0.000432,-0.010252,-0.018768
education.num,0.036527,-0.043195,1.0,0.12263,0.079923,0.148123
capital.gain,0.077674,0.000432,0.12263,1.0,-0.031615,0.078409
capital.loss,0.057775,-0.010252,0.079923,-0.031615,1.0,0.054256
hours.per.week,0.068756,-0.018768,0.148123,0.078409,0.054256,1.0


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# drop education and fnlwgt columns
data_features = All_data.drop(["education","fnlwgt"],axis=1)
print(data_features.head(5))


   age         workclass  education.num      marital.status  \
0   39         State-gov             13       Never-married   
1   50  Self-emp-not-inc             13  Married-civ-spouse   
2   38           Private              9            Divorced   
3   53           Private              7  Married-civ-spouse   
4   28           Private             13  Married-civ-spouse   

          occupation   relationship   race     sex  capital.gain  \
0       Adm-clerical  Not-in-family  White    Male          2174   
1    Exec-managerial        Husband  White    Male             0   
2  Handlers-cleaners  Not-in-family  White    Male             0   
3  Handlers-cleaners        Husband  Black    Male             0   
4     Prof-specialty           Wife  Black  Female             0   

   capital.loss  hours.per.week native.country income  
0             0              40  United-States  <=50K  
1             0              13  United-States  <=50K  
2             0              40  United-Stat

In [7]:
# count the number of missing values for each column
for attribute,count in zip(data_features.columns,(data_features.values.astype(str) == '?').sum(axis = 0)):
    if count > 0:
        print(attribute + ' has ' + str(count)+' missing value' )

workclass has 1836 missing value
occupation has 1843 missing value
native.country has 583 missing value


In [8]:
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# substitute unknown values of "workclass" column
# by using three different algorithms (Decision Tree, Logistic Regression, Random Forest) 
# if more than one algorithm predicted same value for the unknown, then the unknown will be substituted by the predicted value
# otherwise, the unknown value is substituted by most frequent value

data_features_encoded = pd.get_dummies(data_features, prefix=['marital.status', 'occupation','relationship','race','sex','native.country','income'], columns=['marital.status', 'occupation','relationship','race','sex','native.country','income'])

scaler = MinMaxScaler()
data_features_encoded[['age','education.num','capital.gain','capital.loss', 'hours.per.week']] = scaler.fit_transform(data_features_encoded[['age','education.num','capital.gain','capital.loss', 'hours.per.week']])

test_data = data_features_encoded[(data_features_encoded.workclass.values == '?')].copy()
test_label = test_data.workclass

train_data = data_features_encoded[(data_features_encoded.workclass.values != '?')].copy()
train_label = train_data.workclass

train_data.drop(columns = ['workclass'], inplace = True)
test_data.drop(columns = ['workclass'], inplace = True)


log_reg = LogisticRegression()
log_reg.fit(train_data, train_label)
log_reg_pred = log_reg.predict(test_data)


clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_data, train_label)
clf_pred = clf.predict(test_data)

r_forest = RandomForestClassifier(n_estimators=10)
r_forest.fit(train_data, train_label)
r_forest_pred = r_forest.predict(test_data)

majority_class = data_features.workclass.value_counts().index[0]

pred_df =  pd.DataFrame({'Rnadom Forest': r_forest_pred, 'Decision Tree' : clf_pred, 'Logistic Regression' : log_reg_pred})
overall_pred = pred_df.apply(lambda x: x.value_counts().index[0] if x.value_counts()[0] > 1 else majority_class, axis = 1)
data_features.loc[(data_features.workclass.values == '?'),'workclass'] = overall_pred.values

print(data_features.workclass.value_counts())
print(data_features.workclass.unique())

  return self.partial_fit(X, y)


Private             23961
Self-emp-not-inc     2599
Local-gov            2099
State-gov            1298
Self-emp-inc         1119
Federal-gov           961
Never-worked          510
Without-pay            14
Name: workclass, dtype: int64
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov'
 'Self-emp-inc' 'Never-worked' 'Without-pay']


In [9]:
data_features.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [10]:
# substitute unknown values of "occupation" column
# by using three different algorithms (Decision Tree, Logistic Regression, Random Forest) 
# if more than one algorithm predicted same value for the unknown, then the unknown will be substituted by the predicted value
# otherwise, the unknown value is substituted by most frequent value

data_features_encoded = pd.get_dummies(data_features, prefix=['workclass','marital.status','relationship','race','sex','native.country','income'], columns=['workclass','marital.status','relationship','race','sex','native.country','income'])

scaler = MinMaxScaler()
data_features_encoded[['age','education.num','capital.gain','capital.loss', 'hours.per.week']] = scaler.fit_transform(data_features_encoded[['age','education.num','capital.gain','capital.loss', 'hours.per.week']])

test_data = data_features_encoded[(data_features_encoded.occupation.values == '?')].copy()
test_label = test_data.occupation

train_data = data_features_encoded[(data_features_encoded.occupation.values != '?')].copy()
train_label = train_data.occupation

train_data.drop(columns = ['occupation'], inplace = True)
test_data.drop(columns = ['occupation'], inplace = True)


log_reg = LogisticRegression()
log_reg.fit(train_data, train_label)
log_reg_pred = log_reg.predict(test_data)


clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_data, train_label)
clf_pred = clf.predict(test_data)

r_forest = RandomForestClassifier(n_estimators=10)
r_forest.fit(train_data, train_label)
r_forest_pred = r_forest.predict(test_data)

majority_class = data_features.occupation.value_counts().index[0]

pred_df =  pd.DataFrame({'Rnadom Forest': r_forest_pred, 'Decision Tree' : clf_pred, 'Logistic Regression' : log_reg_pred})
overall_pred = pred_df.apply(lambda x: x.value_counts().index[0] if x.value_counts()[0] > 1 else majority_class, axis = 1)
data_features.loc[(data_features.occupation.values == '?'),'occupation'] = overall_pred.values

print(data_features.occupation.value_counts())
print(data_features.occupation.unique())

  return self.partial_fit(X, y)


Prof-specialty       4752
Craft-repair         4320
Exec-managerial      4141
Adm-clerical         4113
Sales                3727
Other-service        3586
Machine-op-inspct    2049
Transport-moving     1623
Handlers-cleaners    1399
Farming-fishing      1094
Tech-support          939
Protective-serv       660
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' 'Protective-serv'
 'Armed-Forces' 'Priv-house-serv']


In [11]:
# substitute unknown values of "native.country" column
# by using three different algorithms (Decision Tree, Logistic Regression, Random Forest) 
# if more than one algorithm predicted same value for the unknown, then the unknown will be substituted by the predicted value
# otherwise, the unknown value is substituted by most frequent value

data_features_encoded = pd.get_dummies(data_features, prefix=['workclass','marital.status','occupation','relationship','race','sex','income'], columns=['workclass','marital.status','occupation','relationship','race','sex','income'])

scaler = MinMaxScaler()
data_features_encoded[['age','education.num','capital.gain','capital.loss', 'hours.per.week']] = scaler.fit_transform(data_features_encoded[['age','education.num','capital.gain','capital.loss', 'hours.per.week']])


test_data = data_features_encoded[(data_features_encoded['native.country'].values == '?')].copy()
test_label = test_data['native.country']

train_data = data_features_encoded[(data_features_encoded['native.country'].values != '?')].copy()
train_label = train_data['native.country']

train_data.drop(columns = ['native.country'], inplace = True)
test_data.drop(columns = ['native.country'], inplace = True)


log_reg = LogisticRegression()
log_reg.fit(train_data, train_label)
log_reg_pred = log_reg.predict(test_data)


clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_data, train_label)
clf_pred = clf.predict(test_data)

r_forest = RandomForestClassifier(n_estimators=10)
r_forest.fit(train_data, train_label)
r_forest_pred = r_forest.predict(test_data)

majority_class = data_features['native.country'].value_counts().index[0]

pred_df =  pd.DataFrame({'Rnadom Forest': r_forest_pred, 'Decision Tree' : clf_pred, 'Logistic Regression' : log_reg_pred})
overall_pred = pred_df.apply(lambda x: x.value_counts().index[0] if x.value_counts()[0] > 1 else majority_class, axis = 1)
data_features.loc[(data_features['native.country'].values == '?'),'native.country'] = overall_pred.values

print(data_features['native.country'].value_counts())
print(data_features['native.country'].unique())

  return self.partial_fit(X, y)


United-States                 29707
Mexico                          649
Philippines                     220
Germany                         138
Canada                          121
Puerto-Rico                     115
El-Salvador                     106
India                           100
Cuba                             95
England                          91
South                            85
Jamaica                          81
China                            78
Italy                            74
Dominican-Republic               70
Vietnam                          69
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           52
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
France                           29
Greece                      

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Using ColumnTransformer for encoding and scaling the data
ct = ColumnTransformer([("scaling", StandardScaler(), ['age','education.num','capital.gain','capital.loss', 'hours.per.week']),("onehot", OneHotEncoder(sparse=False),['workclass','marital.status', 'occupation','relationship','race','sex','native.country'])])

In [13]:
# drop income column
data_features = data_features.drop(["income"], axis=1)

print(data_features.head()) 

# scaling and encoding the data
ct.fit(data_features)
data_features_trans = ct.transform(data_features)

# splitting the data
X_train, X_test, y_train, y_test = train_test_split(data_features_trans,All_data.income, test_size=0.3, random_state=1)


   age         workclass  education.num      marital.status  \
0   39         State-gov             13       Never-married   
1   50  Self-emp-not-inc             13  Married-civ-spouse   
2   38           Private              9            Divorced   
3   53           Private              7  Married-civ-spouse   
4   28           Private             13  Married-civ-spouse   

          occupation   relationship   race     sex  capital.gain  \
0       Adm-clerical  Not-in-family  White    Male          2174   
1    Exec-managerial        Husband  White    Male             0   
2  Handlers-cleaners  Not-in-family  White    Male             0   
3  Handlers-cleaners        Husband  Black    Male             0   
4     Prof-specialty           Wife  Black  Female             0   

   capital.loss  hours.per.week native.country  
0             0              40  United-States  
1             0              13  United-States  
2             0              40  United-States  
3             0 

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  res = transformer.transform(X)


In [14]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(22792, 88)
(22792,)
(9769, 88)
(9769,)


In [15]:
# classifying using Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)


print("Test score: {:f}".format(logreg.score(X_test, y_test)))




Test score: 0.854540


In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer

# Grid Search and Cross Validation with Logistic Regression
text_pipe_logit = make_pipeline(LogisticRegression(n_jobs=1,
                                                   random_state=7))

param_grid_logit = {'logisticregression__C': np.logspace(-10, 10, 40)}
grid_logit = GridSearchCV(text_pipe_logit, 
                          param_grid_logit, 
                          cv=5, n_jobs=-1)

grid_logit.fit(X_train, y_train)




GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=1,
          penalty='l2', random_state=7, solver='warn', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'logisticregression__C': array([1.00000e-10, 3.25702e-10, 1.06082e-09, 3.45511e-09, 1.12534e-08,
       3.66524e-08, 1.19378e-07, 3.88816e-07, 1.26638e-06, 4.12463e-06,
       1.34340e-05, 4.37548e-05, 1.42510e-04, 4.64159e-04, 1.51178e-03,
       4.92388e-03, 1.60372e-02, 5.22335e-02, 1... 8.37678e+06, 2.72833e+07,
       8.88624e+07, 2.89427e+08, 9.42668e+08, 3.07029e+09, 1.00000e+10])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [17]:
# Best parameters for Logistic Regression 
grid_logit.best_params_, grid_logit.best_score_

({'logisticregression__C': 0.5541020330009481}, 0.849903474903475)

In [18]:
# Highest accuracy for testing data using Logistic Regression 
grid_logit.score(X_test, y_test)

0.8544375063977889

In [19]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# classifying using Decision Tree
tree = DecisionTreeClassifier(max_depth=5, random_state=17)
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)
accuracy_score(y_test, tree_pred)

0.8562800696079435

In [20]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Grid Search and Cross Validation with Decision Tree
tree_params = {'max_depth': range(1,12),'max_features': range(4,89)}

tree_grid = GridSearchCV(tree, tree_params,cv=5, n_jobs=-1, verbose=True)

tree_grid.fit(X_train, y_train)

Fitting 5 folds for each of 935 candidates, totalling 4675 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   48.7s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 4675 out of 4675 | elapsed:  6.5min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': range(1, 12), 'max_features': range(4, 89)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True)

In [21]:
# Best parameters for Decision Tree 
tree_grid.best_params_

{'max_depth': 9, 'max_features': 61}

In [22]:
tree_grid.best_score_ 

0.8562653562653563

In [23]:
# Highest accuracy for testing data using Decision Tree
accuracy_score(y_test, tree_grid.predict(X_test))

0.8619101238611936

In [24]:
# classifying using K Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
accuracy_score(y_test, knn_pred)

0.8518783908281298

In [25]:
from sklearn.pipeline import Pipeline

# Grid Search and Cross Validation with K Nearest Neighbors
knn_pipe = Pipeline([('knn', KNeighborsClassifier(n_jobs=-1))])
knn_params = {'knn__n_neighbors': range(1, 15)}

knn_grid = GridSearchCV(knn_pipe, knn_params,cv=5, n_jobs=-1, verbose=True)

knn_grid.fit(X_train, y_train)

# Best parameters for K Nearest Neighbors 
knn_grid.best_params_, knn_grid.best_score_

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed: 20.3min finished


({'knn__n_neighbors': 14}, 0.842049842049842)

In [26]:
# Highest accuracy for testing data using K Nearest Neighbors 
accuracy_score(y_test, knn_grid.predict(X_test))

0.8526973078104207

In [27]:
from sklearn import svm

# classifying using Support Vector Machine with "rbf" kernel
svm_clf_linear = svm.SVC(kernel = 'rbf')
svm_clf_linear.fit(X_train, y_train)
svm_clf_linear_pred = svm_clf_linear.predict(X_test)
accuracy_score(y_test, svm_clf_linear_pred)



0.8585320913092436

In [28]:
# classifying using Support Vector Machine with "linear" kernel
svm_clf_linear = svm.SVC(kernel = 'linear')
svm_clf_linear.fit(X_train, y_train)
svm_clf_linear_pred = svm_clf_linear.predict(X_test)
accuracy_score(y_test, svm_clf_linear_pred)

0.8557682464940116

In [29]:
# Grid Search and Cross Validation with Support Vector Machine
k=['rbf', 'linear','poly','sigmoid']
c= range(1,100)
g=np.arange(1e-4,1e-2,0.0001)
g=g.tolist()
param_grid=dict(kernel=k, C=c, gamma=g)

svr=svm.SVC()
grid = GridSearchCV(svr, param_grid, cv=5,scoring='accuracy')
grid.fit(X_train, y_train) 

KeyboardInterrupt: 

In [None]:
grid.best_score_ 

In [None]:
# Highest accuracy for testing data using Support Vector Machine
accuracy_score(y_test, grid.predict(X_test))