In [10]:
%matplotlib inline

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split,GridSearchCV

from sklearn.svm import LinearSVC,SVC,OneClassSVM
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score,f1_score,classification_report, make_scorer

In [12]:
income_data = pd.read_csv("adult.data",header=None,sep=", ",engine = "python")

In [13]:
income_data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income_class']


In [14]:
income_attributes = income_data.drop(columns = ["income_class"])
income_classes = income_data.income_class

In [15]:
income_attributes = pd.get_dummies(income_attributes)

In [16]:
scaler = MinMaxScaler()
income_attributes_scaled = scaler.fit_transform(income_attributes)

In [17]:
income_attributes_train,income_attributes_test, income_classes_train,income_classes_test = train_test_split(
    income_attributes_scaled,income_classes,test_size= 0.20,stratify = income_classes)

In [18]:
income_attributes_train.shape, income_attributes_test.shape,income_classes_train.shape, income_classes_test.shape

((26048, 108), (6513, 108), (26048,), (6513,))

In [19]:
classifier =LinearSVC(
    C=1e6,
    max_iter=10000
)

In [20]:
classifier.fit(income_attributes_train,income_classes_train)



LinearSVC(C=1000000.0, max_iter=10000)

In [21]:
classifier.coef_

array([[ 7.59146749e-01,  3.20417853e-01,  1.25532133e+00,
         1.03579839e+01,  8.29082156e-01,  7.79218005e-01,
         1.02827400e-01, -8.60208950e-02,  3.57905320e-01,
        -3.19531462e-01, -3.62672712e-01,  1.14199935e-01,
        -2.81654855e-01,  1.99981555e-01, -1.09251257e+00,
        -1.05640073e-02, -2.81948387e-01,  4.47975637e-02,
         2.23009679e-01,  3.79440530e-01, -2.10588358e-01,
         4.49421852e-01, -3.68623152e-01,  8.00457331e-01,
        -1.29207697e-01,  3.26816584e-01,  8.89735222e-02,
         6.26853700e-01, -3.47594513e+00,  8.47847594e-02,
         8.48429265e-02, -4.39995418e-01,  1.69113077e-01,
         8.44966402e-02, -6.51779579e-01, -2.37244716e-01,
        -3.69526331e-01,  7.74580415e-02, -2.16704061e-01,
         2.43534014e-02, -3.78399912e-01, -1.26289679e-01,
         7.64177364e-02,  1.88541459e-02, -2.43937144e-01,
        -2.11182612e-01, -6.13676983e-02, -1.00662728e+00,
         1.81495767e-01,  3.29992229e-01, -2.36778615e-0

In [22]:
linear_grid_search = GridSearchCV(
    estimator=LinearSVC(max_iter=1000),
    param_grid= {
    "C":[0.01,0.1,1,10,100],
    "loss":["hinge","squared_hinge"]
}, 
scoring= make_scorer(f1_score,pos_label = ">50K"))

In [23]:
linear_grid_search.fit(income_attributes_train,income_classes_train)



GridSearchCV(estimator=LinearSVC(),
             param_grid={'C': [0.01, 0.1, 1, 10, 100],
                         'loss': ['hinge', 'squared_hinge']},
             scoring=make_scorer(f1_score, pos_label=>50K))

In [24]:
linear_grid_search.best_params_

{'C': 100, 'loss': 'hinge'}

In [25]:
linear_grid_search.cv_results_

{'mean_fit_time': array([0.12382703, 0.12322125, 0.10721617, 0.1685719 , 0.19398465,
        0.610816  , 0.66623168, 2.3444818 , 2.8963182 , 2.89678245]),
 'std_fit_time': array([0.01757235, 0.01791987, 0.0079396 , 0.00556322, 0.01198272,
        0.0607006 , 0.03458458, 0.11678158, 0.28845832, 0.07693671]),
 'mean_score_time': array([0.07680364, 0.06333933, 0.05840263, 0.06439395, 0.05394769,
        0.05702481, 0.05089002, 0.05427518, 0.05743537, 0.05822539]),
 'std_score_time': array([0.01640542, 0.00744745, 0.00625129, 0.01223588, 0.00928012,
        0.01405204, 0.00514774, 0.00582716, 0.01011257, 0.008753  ]),
 'param_C': masked_array(data=[0.01, 0.01, 0.1, 0.1, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_loss': masked_array(data=['hinge', 'squared_hinge', 'hinge', 'squared_hinge',
                    'hinge', 'squared_hinge', 'hing

In [26]:
train_pred = linear_grid_search.best_estimator_.predict(income_attributes_train)

In [27]:
test_pred = linear_grid_search.best_estimator_.predict(income_attributes_test)

In [28]:
f1_score(income_classes_test,test_pred, pos_label = ">50K")

0.6492774057102573

In [29]:
print(classification_report(income_classes_train,train_pred))

              precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90     19775
        >50K       0.71      0.59      0.64      6273

    accuracy                           0.84     26048
   macro avg       0.79      0.76      0.77     26048
weighted avg       0.84      0.84      0.84     26048



In [30]:
print(classification_report(income_classes_test,test_pred))

              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90      4945
        >50K       0.73      0.59      0.65      1568

    accuracy                           0.85      6513
   macro avg       0.80      0.76      0.78      6513
weighted avg       0.84      0.85      0.84      6513



In [31]:
svc = SVC(
    kernel="poly",degree= 2, max_iter=1000,
)

In [32]:
svc.fit(income_attributes_train,income_classes_train)



SVC(degree=2, kernel='poly', max_iter=1000)

In [33]:
svc.decision_function(income_attributes_train[:10])

array([ 0.03017883, -0.22328199,  0.30471297, -0.47608535, -0.30897285,
        0.21370687,  0.10778928, -0.20271596,  0.0386054 ,  0.16317667])

In [34]:
svc.predict(income_attributes_train[:10])

array(['>50K', '<=50K', '>50K', '<=50K', '<=50K', '>50K', '>50K', '<=50K',
       '>50K', '>50K'], dtype=object)

In [35]:
train_predictions= svc.predict(income_attributes_train)

In [36]:
test_predictions= svc.predict(income_attributes_test)

In [37]:
print(classification_report(income_classes_train,train_predictions))

              precision    recall  f1-score   support

       <=50K       0.89      0.71      0.79     19775
        >50K       0.44      0.72      0.55      6273

    accuracy                           0.71     26048
   macro avg       0.66      0.72      0.67     26048
weighted avg       0.78      0.71      0.73     26048



In [38]:
print(classification_report(income_classes_test,test_predictions))

              precision    recall  f1-score   support

       <=50K       0.89      0.71      0.79      4945
        >50K       0.44      0.71      0.54      1568

    accuracy                           0.71      6513
   macro avg       0.66      0.71      0.67      6513
weighted avg       0.78      0.71      0.73      6513



In [39]:
gaussian_svc = SVC(
    kernel="rbf",
    C = 100,
    gamma= 0.1
)
gaussian_svc.fit(income_attributes_train,income_classes_train)
train_predictions= gaussian_svc.predict(income_attributes_train)
test_predictions= gaussian_svc.predict(income_attributes_test)

In [40]:
print(classification_report(income_classes_train,train_predictions))

              precision    recall  f1-score   support

       <=50K       0.91      0.95      0.93     19775
        >50K       0.81      0.70      0.75      6273

    accuracy                           0.89     26048
   macro avg       0.86      0.83      0.84     26048
weighted avg       0.89      0.89      0.89     26048



In [41]:
print(classification_report(income_classes_test,test_predictions))

              precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90      4945
        >50K       0.71      0.60      0.65      1568

    accuracy                           0.84      6513
   macro avg       0.79      0.76      0.78      6513
weighted avg       0.84      0.84      0.84      6513



In [42]:
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(income_attributes_train,income_classes_train)

KNeighborsClassifier(n_neighbors=20)

In [43]:
knn.predict(income_attributes_train[:10])

array(['>50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K', '<=50K',
       '<=50K', '<=50K', '>50K'], dtype=object)

In [44]:
f1_score(income_classes_test,knn.predict(income_attributes_test),pos_label= ">50K")

0.6263498920086393

In [45]:
one_class_svm = OneClassSVM(nu = 0.1 )

In [46]:
one_class_svm.fit(income_attributes_train)

OneClassSVM(nu=0.1)