In [1]:
%matplotlib inline

In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC, LinearSVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer

# Support Vector Machines Demo

In [3]:
income_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', sep=", ", engine="python", header=None),
income_data.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "material-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income_class"]
income_data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,material-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
income_attributes = income_data.drop(columns="income_class")
income_classes = income_data.income_class

In [5]:
income_attributes = pd.get_dummies(income_attributes)
scalar = MinMaxScaler()
income_attributes = scalar.fit_transform(income_attributes)


In [6]:
income_attributes_train, income_attributes_test, income_classes_train, income_classes_test = train_test_split(
    income_attributes,
    income_classes,
    train_size=0.8, 
    test_size=0.2, 
    random_state=42, 
    stratify=income_classes
)
income_attributes_train.shape, income_attributes_test.shape, income_classes_train.shape, income_classes_test.shape 

((26048, 108), (6513, 108), (26048,), (6513,))

### Linear Support Vector Classifier

In [9]:
linear_classifier = LinearSVC(C=1e6, max_iter=100000000)
linear_classifier.fit(income_attributes_train, income_classes_train)



In [10]:
linear_classifier.coef_

array([[ 0.70865632,  0.31175299,  1.04663042, 10.28434569,  0.96392782,
         1.16951666,  0.25516773,  0.28257123,  0.09737284, -0.39044027,
        -0.42899436,  0.11896425,  0.01195889, -0.28579883, -0.8417905 ,
        -0.18300023,  0.60105403, -0.16613483,  0.58555533,  0.35854768,
        -0.11938331, -0.03897059, -0.1252781 ,  0.10474865,  0.24349642,
         0.18876753, -0.1182322 ,  0.14559004, -3.24505801,  0.34539427,
         0.2419143 , -0.45268877,  0.39246674,  0.21672904, -0.44672441,
        -0.30251046, -0.44454799, -0.14371318, -0.13527255, -0.0699829 ,
        -0.44631254, -0.11718328,  0.19404344, -0.43993744, -0.31368805,
         0.09200671, -0.12404812, -0.94576969,  0.30439012,  0.17520968,
         0.25631414,  0.31508953,  0.07415191, -0.3912642 , -0.12305889,
        -0.34291543, -0.74956909,  0.09898597,  0.3268326 , -0.28982274,
        -0.33637065, -0.21577506,  0.02955288, -0.36857347, -0.52082096,
        -0.66016807, -0.04002653,  0.6886715 ,  0.2

### Grid Search CV

In [16]:
linear_grid_search = GridSearchCV(
    LinearSVC(max_iter = 100000000),
    param_grid = {
        "C": [0.01, 0.1, 1, 10, 100],
        "loss": ["hinge", "squerd_error"]
    },
    scoring = make_scorer(f1_score, pos_label=">50K")
)

linear_grid_search.fit(income_attributes_train, income_classes_train)

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Boyko Boev\SoftUni-Machine-Learning-September-2022\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Boyko Boev\SoftUni-Machine-Learning-September-2022\.venv\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Boyko Boev\SoftUni-Machine-Learning-September-2022\.venv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Boyko Boev\SoftUni-Machine-Learning-Septe

In [18]:
linear_grid_search.best_estimator_

In [19]:
linear_grid_search.best_params_

{'C': 10, 'loss': 'hinge'}

In [20]:
linear_grid_search.cv_results_

{'mean_fit_time': array([5.77161312e-02, 7.73735046e-03, 1.02680206e-01, 5.83643913e-03,
        4.03197479e-01, 6.70862198e-03, 1.79284773e+00, 6.99162483e-03,
        1.58643176e+01, 9.98978615e-03]),
 'std_fit_time': array([6.56613931e-03, 7.82144525e-03, 1.99670832e-02, 3.03408338e-03,
        1.72133356e-01, 8.06179447e-04, 1.91520642e-01, 5.01784625e-04,
        3.37356974e+00, 8.39389061e-03]),
 'mean_score_time': array([0.02510543, 0.        , 0.02730565, 0.        , 0.02904515,
        0.        , 0.02823625, 0.        , 0.02223635, 0.        ]),
 'std_score_time': array([0.00748433, 0.        , 0.0068157 , 0.        , 0.00239988,
        0.        , 0.00124121, 0.        , 0.00579228, 0.        ]),
 'param_C': masked_array(data=[0.01, 0.01, 0.1, 0.1, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_loss': masked_array(data=['hinge'

In [26]:
train_predictions = linear_grid_search.best_estimator_.predict(income_attributes_train)
print(classification_report(income_classes_train, train_predictions))

              precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91     19775
        >50K       0.75      0.58      0.65      6273

    accuracy                           0.85     26048
   macro avg       0.81      0.76      0.78     26048
weighted avg       0.84      0.85      0.84     26048



In [27]:
test_predictions = linear_grid_search.best_estimator_.predict(income_attributes_test)
print(classification_report(income_classes_test, test_predictions))

              precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91      4945
        >50K       0.76      0.60      0.67      1568

    accuracy                           0.86      6513
   macro avg       0.82      0.77      0.79      6513
weighted avg       0.85      0.86      0.85      6513



### Support Vector Classifier

In [35]:
svc = SVC(kernel="poly", degree=2, max_iter=10000, C=1)
svc.fit(income_attributes_train, income_classes_train)



In [36]:
svc.decision_function(income_attributes_train[:10])

array([ 0.94913622, -0.86488077, -1.16708423, -1.23713722, -1.14645301,
        0.74583841, -0.5487911 , -0.19149083,  2.12116332, -1.74647849])

In [37]:
svc.predict(income_attributes_train[:10])

array(['>50K', '<=50K', '<=50K', '<=50K', '<=50K', '>50K', '<=50K',
       '<=50K', '>50K', '<=50K'], dtype=object)

In [38]:
train_predictions = svc.predict(income_attributes_train)
print(classification_report(income_classes_train, train_predictions))

              precision    recall  f1-score   support

       <=50K       0.87      0.94      0.90     19775
        >50K       0.74      0.55      0.63      6273

    accuracy                           0.85     26048
   macro avg       0.81      0.74      0.77     26048
weighted avg       0.84      0.85      0.84     26048



In [39]:
test_predictions = svc.predict(income_attributes_test)
print(classification_report(income_classes_test, test_predictions))

              precision    recall  f1-score   support

       <=50K       0.87      0.94      0.90      4945
        >50K       0.74      0.56      0.64      1568

    accuracy                           0.85      6513
   macro avg       0.81      0.75      0.77      6513
weighted avg       0.84      0.85      0.84      6513



In [40]:
gaussian_svc = SVC(kernel="rbf", gamma=0.1, C=100)
gaussian_svc.fit(income_attributes_train, income_classes_train)

In [41]:
train_predictions = gaussian_svc.predict(income_attributes_train)
print(classification_report(income_classes_train, train_predictions))

              precision    recall  f1-score   support

       <=50K       0.91      0.95      0.93     19775
        >50K       0.82      0.70      0.76      6273

    accuracy                           0.89     26048
   macro avg       0.86      0.83      0.84     26048
weighted avg       0.89      0.89      0.89     26048



In [42]:
test_predictions = gaussian_svc.predict(income_attributes_test)
print(classification_report(income_classes_test, test_predictions))

              precision    recall  f1-score   support

       <=50K       0.88      0.91      0.90      4945
        >50K       0.69      0.60      0.64      1568

    accuracy                           0.84      6513
   macro avg       0.78      0.76      0.77      6513
weighted avg       0.83      0.84      0.84      6513



### K Neighbors Classifier

In [45]:
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(income_attributes_train, income_classes_train)

In [46]:
train_predictions = knn.predict(income_attributes_train)
print(classification_report(income_classes_train, train_predictions))

              precision    recall  f1-score   support

       <=50K       0.87      0.93      0.90     19775
        >50K       0.73      0.57      0.64      6273

    accuracy                           0.85     26048
   macro avg       0.80      0.75      0.77     26048
weighted avg       0.84      0.85      0.84     26048



In [47]:
test_predictions = knn.predict(income_attributes_test)
print(classification_report(income_classes_test, test_predictions))

              precision    recall  f1-score   support

       <=50K       0.87      0.92      0.90      4945
        >50K       0.70      0.55      0.62      1568

    accuracy                           0.84      6513
   macro avg       0.78      0.74      0.76      6513
weighted avg       0.83      0.84      0.83      6513



### One-Class SVM

In [49]:
one_class_svm = OneClassSVM(nu = 0.1)
one_class_svm.fit(income_attributes_train)

In [51]:
train_predictions = one_class_svm.predict(income_attributes_train)
train_predictions

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)