In [47]:
%matplotlib inline

In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split,GridSearchCV

from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score,f1_score,classification_report, make_scorer

In [49]:
income_data = pd.read_csv("adult.data",header=None,sep=", ",engine = "python")

In [50]:
income_data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income_class']


In [51]:
income_attributes = income_data.drop(columns = ["income_class"])
income_classes = income_data.income_class

In [52]:
income_attributes = pd.get_dummies(income_attributes)

In [53]:
scaler = MinMaxScaler()
income_attributes_scaled = scaler.fit_transform(income_attributes)

In [54]:
income_attributes_train,income_attributes_test, income_classes_train,income_classes_test = train_test_split(
    income_attributes_scaled,income_classes,test_size= 0.20,stratify = income_classes)

In [55]:
income_attributes_train.shape, income_attributes_test.shape,income_classes_train.shape, income_classes_test.shape

((26048, 108), (6513, 108), (26048,), (6513,))

In [56]:
classifier =LinearSVC(
    C=1e6,
    max_iter=10000
)

In [57]:
classifier.fit(income_attributes_train,income_classes_train)



LinearSVC(C=1000000.0, max_iter=10000)

In [58]:
classifier.coef_

array([[ 6.96134132e-01,  2.93209398e-01,  1.20772819e+00,
         1.06677648e+01,  1.31854158e+00,  1.08760113e+00,
        -8.77769930e-02,  1.82070603e-01, -1.38853371e-01,
        -1.41906041e-01,  1.47495066e-01, -3.79763723e-02,
        -3.74350929e-02, -2.80872212e-01, -9.35622806e-01,
        -3.31446994e-01, -3.25789842e-01, -1.81828507e-03,
         3.36517040e-01,  3.43795603e-01,  3.56823687e-01,
         1.68353048e-01, -1.86625610e-01,  1.82934966e-02,
        -1.71536407e-01,  2.48266045e-01,  7.01580036e-02,
         4.58359759e-01, -3.40519483e+00,  5.67674557e-01,
         5.23293507e-01, -6.91660956e-01,  3.42385444e-01,
         3.53882082e-01, -9.82275907e-02, -6.45662441e-01,
        -2.86256124e-01, -3.05337634e-01, -2.29683034e-01,
         4.17405152e-01, -3.04278369e-01,  4.30273100e-01,
        -1.67985068e-01, -6.98960584e-01, -1.33995644e-01,
         3.32781224e-02, -3.93513903e-01, -1.11406827e+00,
         3.21458515e-01, -1.23043212e-01,  5.62431369e-0

In [61]:
linear_grid_search = GridSearchCV(
    estimator=LinearSVC(max_iter=1000),
    param_grid= {
    "C":[0.01,0.1,1,10,100],
    "loss":["hinge","squared_hinge"]
}, 
scoring= make_scorer(f1_score,pos_label = ">50K"))

In [62]:
linear_grid_search.fit(income_attributes_train,income_classes_train)



GridSearchCV(estimator=LinearSVC(),
             param_grid={'C': [0.01, 0.1, 1, 10, 100],
                         'loss': ['hinge', 'squared_hinge']},
             scoring=make_scorer(f1_score, pos_label=>50K))

In [63]:
linear_grid_search.best_params_

{'C': 10, 'loss': 'squared_hinge'}

In [64]:
linear_grid_search.cv_results_

{'mean_fit_time': array([0.0835732 , 0.10112658, 0.12069263, 0.17241335, 0.20907578,
        0.60783582, 0.67011786, 1.86493578, 2.27679138, 3.02188406]),
 'std_fit_time': array([0.00954166, 0.01305878, 0.01214585, 0.00535353, 0.01605017,
        0.07081902, 0.02308996, 0.15024146, 0.13349107, 0.32543972]),
 'mean_score_time': array([0.03852758, 0.03766727, 0.05520077, 0.05736833, 0.05848083,
        0.05722814, 0.05183859, 0.04086599, 0.03998704, 0.04438491]),
 'std_score_time': array([0.01206299, 0.00498442, 0.00515389, 0.00896251, 0.00507355,
        0.00581326, 0.00874862, 0.00870936, 0.00870625, 0.01048651]),
 'param_C': masked_array(data=[0.01, 0.01, 0.1, 0.1, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_loss': masked_array(data=['hinge', 'squared_hinge', 'hinge', 'squared_hinge',
                    'hinge', 'squared_hinge', 'hing

In [72]:
train_pred = linear_grid_search.best_estimator_.predict(income_attributes_train)

In [73]:
test_pred = linear_grid_search.best_estimator_.predict(income_attributes_test)

In [74]:
f1_score(income_classes_test,test_pred, pos_label = ">50K")

0.6537785588752197

In [75]:
print(classification_report(income_classes_train,train_pred))

              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91     19775
        >50K       0.74      0.60      0.67      6273

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.79     26048
weighted avg       0.85      0.85      0.85     26048



In [76]:
print(classification_report(income_classes_test,test_pred))

              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90      4945
        >50K       0.73      0.59      0.65      1568

    accuracy                           0.85      6513
   macro avg       0.80      0.76      0.78      6513
weighted avg       0.84      0.85      0.84      6513



In [80]:
svc = SVC(
    kernel="poly",degree= 2, max_iter=1000,
)

In [81]:
svc.fit(income_attributes_train,income_classes_train)



SVC(degree=2, kernel='poly', max_iter=1000)

In [85]:
svc.decision_function(income_attributes_train[:10])

array([ 0.15085713,  0.56057649, -0.52628323, -0.16584765, -0.53738853,
       -0.19909835,  0.30277818,  0.47250739, -0.53929232,  0.17807932])

In [86]:
svc.predict(income_attributes_train[:10])

array(['>50K', '>50K', '<=50K', '<=50K', '<=50K', '<=50K', '>50K', '>50K',
       '<=50K', '>50K'], dtype=object)

In [87]:
train_predictions= svc.predict(income_attributes_train)

In [88]:
test_predictions= svc.predict(income_attributes_test)

In [89]:
print(classification_report(income_classes_train,train_predictions))

              precision    recall  f1-score   support

       <=50K       0.91      0.66      0.77     19775
        >50K       0.42      0.79      0.55      6273

    accuracy                           0.69     26048
   macro avg       0.67      0.72      0.66     26048
weighted avg       0.79      0.69      0.71     26048



In [90]:
print(classification_report(income_classes_test,test_predictions))

              precision    recall  f1-score   support

       <=50K       0.91      0.66      0.76      4945
        >50K       0.42      0.78      0.55      1568

    accuracy                           0.69      6513
   macro avg       0.66      0.72      0.65      6513
weighted avg       0.79      0.69      0.71      6513



In [91]:
gaussian_svc = SVC(
    kernel="rbf",
    C = 100,
    gamma= 0.1
)
gaussian_svc.fit(income_attributes_train,income_classes_train)
train_predictions= gaussian_svc.predict(income_attributes_train)
test_predictions= gaussian_svc.predict(income_attributes_test)

In [92]:
print(classification_report(income_classes_train,train_predictions))

              precision    recall  f1-score   support

       <=50K       0.91      0.95      0.93     19775
        >50K       0.81      0.72      0.76      6273

    accuracy                           0.89     26048
   macro avg       0.86      0.83      0.85     26048
weighted avg       0.89      0.89      0.89     26048



In [93]:
print(classification_report(income_classes_test,test_predictions))

              precision    recall  f1-score   support

       <=50K       0.88      0.91      0.90      4945
        >50K       0.69      0.61      0.65      1568

    accuracy                           0.84      6513
   macro avg       0.79      0.76      0.77      6513
weighted avg       0.84      0.84      0.84      6513



In [99]:
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(income_attributes_train,income_classes_train)

KNeighborsClassifier(n_neighbors=20)

In [100]:
knn.predict(income_attributes_train[:10])

In [102]:
f1_score(income_classes_test,knn.predict(income_attributes_test),pos_label= ">50K")

0.6093418259023355