In [0]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn import metrics
    
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB


from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import label_binarize
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))


In [0]:
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num','marital-status', 'occupation', 'relationship', 'race', 'gender','capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income']

train = pd.read_csv('adult_data.txt', sep=",\s", header=None, names = column_names, engine = 'python')
test = pd.read_csv('adult_test.txt', sep=",\s", header=None, names = column_names, engine = 'python')
test['income'].replace(regex=True,inplace=True,to_replace=r'\.',value=r'')


adult = pd.concat([test,train])
adult.reset_index(inplace = True, drop = True)

In [7]:
# Setting all the categorical columns to type category
for col in set(adult.columns) - set(adult.describe().columns):
    adult[col] = adult[col].astype('category')
    
print(adult.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
age                48842 non-null int64
workclass          48842 non-null category
fnlwgt             48842 non-null int64
education          48842 non-null category
educational-num    48842 non-null int64
marital-status     48842 non-null category
occupation         48842 non-null category
relationship       48842 non-null category
race               48842 non-null category
gender             48842 non-null category
capital-gain       48842 non-null int64
capital-loss       48842 non-null int64
hours-per-week     48842 non-null int64
native-country     48842 non-null category
income             48842 non-null category
dtypes: category(9), int64(6)
memory usage: 2.7 MB
None


In [0]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [8]:
adult_data = adult.drop(columns = ['income'])
adult_label = adult.income


adult_cat_1hot = pd.get_dummies(adult_data.select_dtypes('category'))
adult_non_cat = adult_data.select_dtypes(exclude = 'category')

adult_data_1hot = pd.concat([adult_non_cat, adult_cat_1hot], axis=1, join='inner')

train_data, test_data, train_label, test_label = train_test_split(adult_data_1hot, adult_label, train_size  = 5000)

from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  

scaler.fit(train_data)  
train_data = scaler.transform(train_data)  

test_data = scaler.transform(test_data)

  return self.partial_fit(X, y)
  app.launch_new_instance()


In [0]:
C = [10e-7, 10e-6, 10e-5, 10e-4, 10e-3, 0.01, 0.1, 1, 10, 100, 1000]
# Linear kernel

def lin_svm(C):
  a=dict()

  for C_val in C:  
    svm_clf_linear = svm.SVC(kernel = 'linear', C=C_val)
    svm_clf_linear.fit(train_data, train_label)
    svm_clf_linear_pred = svm_clf_linear.predict(test_data)
    svm_clf_linear_score=svm_clf_linear.score(test_data, test_label)
    tn, fp, fn, tp = metrics.confusion_matrix(test_label, svm_clf_linear_pred).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*recall*precision)/(recall+precision)
    a[C_val]=(svm_clf_linear_score, f1)
  return pd.DataFrame.from_dict(a, orient='index', columns=['Accuracy', 'F1']).mean(axis=0)

lin_svm(C)

  del sys.path[0]
  del sys.path[0]


Accuracy    0.820649
F1          0.555178
dtype: float64

In [0]:
C = [10e-7, 10e-6, 10e-5, 10e-4, 10e-3, 0.01, 0.1, 1, 10, 100, 1000]

# Poly kernel degree 2

def poly2_svm(C):
  a=dict()
  for C_val in C:  
    svm_clf_poly2 = svm.SVC(kernel = 'poly', C=C_val, degree = 2)
    svm_clf_poly2.fit(train_data, train_label)
    svm_clf_poly2_pred = svm_clf_poly2.predict(test_data)
    svm_clf_poly2_score=svm_clf_poly2.score(test_data, test_label)
    tn, fp, fn, tp = metrics.confusion_matrix(test_label, svm_clf_poly2_pred).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*recall*precision)/(recall+precision)
    a[C_val]=(svm_clf_poly2_score, f1)
  return pd.DataFrame.from_dict(a, orient='index', columns=['Accuracy', 'F1']).mean(axis=0)

poly2_svm(C)

  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


Accuracy    0.784506
F1          0.413289
dtype: float64

SVM

In [0]:
C = [10e-7, 10e-6, 10e-5, 10e-4, 10e-3, 0.01, 0.1, 1, 10, 100, 1000]


# Poly kernel degree 3
def poly3_svm(C):
  a=dict()
  for C_val in C:  
    svm_clf_poly3 = svm.SVC(kernel = 'poly', C=C_val, degree = 3)
    svm_clf_poly3.fit(train_data, train_label)
    svm_clf_poly3_pred = svm_clf_poly3.predict(test_data)
    svm_clf_poly3_score=svm_clf_poly3.score(test_data, test_label)
    tn, fp, fn, tp = metrics.confusion_matrix(test_label, svm_clf_poly3_pred).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*recall*precision)/(recall+precision)
    a[C_val]=(svm_clf_poly3_score, f1)
  return pd.DataFrame.from_dict(a, orient='index', columns=['Accuracy', 'F1']).mean(axis=0)

poly3_svm(C)

  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


Accuracy    0.782200
F1          0.334014
dtype: float64

In [0]:
C = [10e-7, 10e-6, 10e-5, 10e-4, 10e-3, 0.01, 0.1, 1, 10, 100, 1000]
gammas = [0.001,0.005,0.01,0.05,0.1,0.5,1,2]  


# rbf kernel
def rbf_svm(C, gammas):
  a=dict()
  for C_val in C:  
    for gam in gammas:
      svm_clf_rbf = svm.SVC(kernel = 'rbf', C = C_val, gamma=gam)
      svm_clf_rbf.fit(train_data, train_label)
      svm_clf_rbf_pred = svm_clf_rbf.predict(test_data)
      svm_clf_rbf_score=svm_clf_rbf.score(test_data, test_label)
      tn, fp, fn, tp = metrics.confusion_matrix(test_label, svm_clf_rbf_pred).ravel()
      precision = (tp)/(tp+fp)
      recall = (tp)/(tp+fn)
      f1 = (2*recall*precision)/(recall+precision)
      a[str(C_val)+','+str(gam)]=(svm_clf_rbf_score, f1)
  return pd.DataFrame.from_dict(a, orient='index', columns=['Accuracy', 'F1']).mean(axis=0)

rbf_svm(C, gammas)
    

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel

Accuracy    0.780271
F1          0.442492
dtype: float64

In [0]:
hidd_units = [1,2,4,8,32,128]
momentums = [0,0.2,0.5,0.9]


def ANN (hidd_units, momentums):
  a= dict()
  for i, hid in enumerate(hidd_units):
    for j,mom in enumerate(momentums):
      ann = MLPClassifier(solver = 'sgd', hidden_layer_sizes=(hid, ), momentum = mom)
      ann.fit(train_data, train_label)                         
      ann_pred = ann.predict(test_data)
      ann_pred_prob = ann.predict_proba(test_data)
      ann_score=ann.score(test_data, test_label)
      tn, fp, fn, tp = metrics.confusion_matrix(test_label, ann_pred).ravel()
      precision = (tp)/(tp+fp)
      recall = (tp)/(tp+fn)
      f1 = (2*recall*precision)/(recall+precision)
      a[str(hid)+','+str(mom)]=(ann_score, f1)
  return pd.DataFrame.from_dict(a, orient='index', columns=['Accuracy', 'F1']).mean(axis=0)

ANN (hidd_units, momentums)

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


Accuracy    0.818805
F1          0.562039
dtype: float64

In [0]:
Cs = [10e-8, 10e-7, 10e-6, 10e-5, 10e-4, 10e-3, 0.01, 0.1, 1, 10, 100, 1000, 10000]

def logreg(C):
  a = dict()

  for  C_val in C: 
    clr = LogisticRegression(C=C_val).fit(train_data, train_label)
    clr_pred = clr.predict(test_data)
    clr_pred_prob=clr.predict_proba(test_data)
    clr_score=clr.score(test_data, test_label)
    tn, fp, fn, tp = metrics.confusion_matrix(test_label, clr_pred).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*recall*precision)/(recall+precision)
    a[C_val]=(clr_score, f1)
  return pd.DataFrame.from_dict(a, orient='index', columns=['Accuracy', 'F1']).mean(axis=0)

logreg(Cs)




Accuracy    0.813484
F1          0.648397
dtype: float64

In [16]:
K = np.linspace(1, 5000, 26).astype(int)

def KNN(K):
  a=dict()
  for k_val in K: 
    knn_clf = KNeighborsClassifier(n_neighbors=k_val).fit(train_data, train_label)
    knn_clf_pred = knn_clf.predict(test_data)
    knn_clf_pred_prob=knn_clf.predict_proba(test_data)
    knn_clf_score=knn_clf.score(test_data, test_label)
    tn, fp, fn, tp = metrics.confusion_matrix(test_label, knn_clf_pred).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*recall*precision)/(recall+precision)
    a[k_val]=(knn_clf_score, f1)
  return pd.DataFrame.from_dict(a, orient='index', columns=['Accuracy', 'F1']).mean(axis=0)

KNN(K)

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is ad

Accuracy    0.767873
F1          0.284860
dtype: float64

In [0]:
def nb1():
  nb1_clf = GaussianNB().fit(train_data, train_label)
  nb1_clf_pred = nb1_clf.predict(test_data)
  nb1_clf_pred_prob=nb1_clf.predict_proba(test_data)
  nb1_clf_score=nb1_clf.score(test_data, test_label)
  tn, fp, fn, tp = metrics.confusion_matrix(test_label, nb1_clf_pred).ravel()
  precision = (tp)/(tp+fp)
  recall = (tp)/(tp+fn)
  f1 = (2*recall*precision)/(recall+precision)
  return (nb1_clf_score, f1)

nb1()

(0.3748232288672962, 0.4287768584707083)

In [0]:
def nb2():
  nb2_clf = BernoulliNB().fit(train_data, train_label)
  nb2_clf_pred = nb2_clf.predict(test_data)
  nb2_clf_pred_prob=nb2_clf.predict_proba(test_data)
  nb2_clf_score=nb2_clf.score(test_data, test_label)
  tn, fp, fn, tp = metrics.confusion_matrix(test_label, nb2_clf_pred).ravel()
  precision = (tp)/(tp+fp)
  recall = (tp)/(tp+fn)
  f1 = (2*recall*precision)/(recall+precision)
  return (nb2_clf_score, f1)

nb2()

(0.7810546964098353, 0.6326304106548278)

In [0]:
num_features = [1,2,4,6,8,12,16,20]

def RF(nun_features):
  a=dict()
  for n in num_features:
    rand_forest = RandomForestClassifier(n_estimators=1024, max_features = n).fit(train_data, train_label)
    rand_forest_pred = rand_forest.predict(test_data)
    rand_forest_pred_prob=rand_forest.predict_proba(test_data)
    rand_forest_score=rand_forest.score(test_data, test_label)
    tn, fp, fn, tp = metrics.confusion_matrix(test_label, rand_forest_pred).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*recall*precision)/(recall+precision)
    a[n]=(rand_forest_score, f1)
  return pd.DataFrame.from_dict(a, orient='index', columns=['Accuracy', 'F1']).mean(axis=0)

RF(num_features)

Accuracy    0.846189
F1          0.652351
dtype: float64

In [11]:
def DT_gini():
  a=dict()
  dt_gini = tree.DecisionTreeClassifier(criterion = 'gini')
  dt_gini = dt_gini.fit(train_data, train_label)
  dt_gini_pred = dt_gini.predict(test_data)
  dt_gini_score=dt_gini.score(test_data, test_label)
  tn, fp, fn, tp = metrics.confusion_matrix(test_label, dt_gini_pred).ravel()
  precision = (tp)/(tp+fp)
  recall = (tp)/(tp+fn)
  f1 = (2*recall*precision)/(recall+precision)
  return (dt_gini_score, f1)

DT_gini()

(0.8048446694950048, 0.6054597436133912)

In [12]:
def DT_entropy():
  a=dict()
  dt_entropy = tree.DecisionTreeClassifier(criterion = 'entropy')
  dt_entropy = dt_entropy.fit(train_data, train_label)
  dt_entropy_pred = dt_entropy.predict(test_data)
  dt_entropy_score=dt_entropy.score(test_data, test_label)
  tn, fp, fn, tp = metrics.confusion_matrix(test_label, dt_entropy_pred).ravel()
  precision = (tp)/(tp+fp)
  recall = (tp)/(tp+fn)
  f1 = (2*recall*precision)/(recall+precision)
  return (dt_entropy_score, f1)

DT_entropy()

(0.8124173167282515, 0.6072963422786744)

In [14]:
criterion = ['gini', 'entropy']

def Bag_dt(crit):
  a=dict()
  for i, cr in enumerate(crit):
    bg_dt = BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(criterion = cr), n_estimators=100)
    bg_dt = bg_dt.fit(train_data, train_label)
    bg_dt_pred = bg_dt.predict(test_data)
    bg_dt_score=bg_dt.score(test_data, test_label)
    tn, fp, fn, tp = metrics.confusion_matrix(test_label, bg_dt_pred).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*recall*precision)/(recall+precision)
    a[i]=(bg_dt_score, f1)
  return pd.DataFrame.from_dict(a, orient='index', columns=['Accuracy', 'F1']).mean(axis=0)

Bag_dt(criterion)


Accuracy    0.848501
F1          0.659609
dtype: float64

In [15]:
criterion = ['gini', 'entropy']

def ADA_boost(crit):
  a=dict()
  for i, cr in enumerate(crit):
    ada = AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(criterion = cr), n_estimators=100)
    ada = ada.fit(train_data, train_label)
    ada_pred = ada.predict(test_data)
    ada_score=ada.score(test_data, test_label)
    tn, fp, fn, tp = metrics.confusion_matrix(test_label, ada_pred).ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*recall*precision)/(recall+precision)
    a[i]=(ada_score, f1)
  return pd.DataFrame.from_dict(a, orient='index', columns=['Accuracy', 'F1']).mean(axis=0)

ADA_boost(criterion)



Accuracy    0.808745
F1          0.606701
dtype: float64