In [1]:
# DESCRIPTION

# Load the data from “college.csv” that has attributes collected about private and public colleges for a 
# particular year. Predict the private/public status of the colleges from other attributes. Use LabelEncoder 
# to encode the target variable to numerical form. Split the data such that 20% of the data is set aside for 
# testing. Fit a linear SVM from scikit-learn and observe the accuracy. [Hint: Use Linear SVC] Preprocess the 
# data using StandardScalar and fit the same model again. Observe the change in accuracy.  Use scikit-learn’s 
# gridsearch to select the best hyper-parameter for a non-linear SVM. Identify the model with the best score and 
# its parameters. [Hint: Refer to model_selection module of Scikit learn]
 

# Objective: Employ SVM from scikit-learn for binary classification and measure the impact of preprocessing data 
# and hyper-parameter search using grid search.

In [2]:
# IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# IMPORT DATASET
df = pd.read_csv("Lesson 6-Supervised Learning-Classification/College.csv")

In [4]:
# DATA EXPLORATION
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Private      777 non-null    object 
 1   Apps         777 non-null    int64  
 2   Accept       777 non-null    int64  
 3   Enroll       777 non-null    int64  
 4   Top10perc    777 non-null    int64  
 5   Top25perc    777 non-null    int64  
 6   F.Undergrad  777 non-null    int64  
 7   P.Undergrad  777 non-null    int64  
 8   Outstate     777 non-null    int64  
 9   Room.Board   777 non-null    int64  
 10  Books        777 non-null    int64  
 11  Personal     777 non-null    int64  
 12  PhD          777 non-null    int64  
 13  Terminal     777 non-null    int64  
 14  S.F.Ratio    777 non-null    float64
 15  perc.alumni  777 non-null    int64  
 16  Expend       777 non-null    int64  
 17  Grad.Rate    777 non-null    int64  
dtypes: float64(1), int64(16), object(1)
memory usage: 

In [5]:
df.describe()

Unnamed: 0,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
count,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0,777.0
mean,3001.638353,2018.804376,779.972973,27.558559,55.796654,3699.907336,855.298584,10440.669241,4357.526384,549.380952,1340.642214,72.660232,79.702703,14.089704,22.743887,9660.171171,65.46332
std,3870.201484,2451.113971,929.17619,17.640364,19.804778,4850.420531,1522.431887,4023.016484,1096.696416,165.10536,677.071454,16.328155,14.722359,3.958349,12.391801,5221.76844,17.17771
min,81.0,72.0,35.0,1.0,9.0,139.0,1.0,2340.0,1780.0,96.0,250.0,8.0,24.0,2.5,0.0,3186.0,10.0
25%,776.0,604.0,242.0,15.0,41.0,992.0,95.0,7320.0,3597.0,470.0,850.0,62.0,71.0,11.5,13.0,6751.0,53.0
50%,1558.0,1110.0,434.0,23.0,54.0,1707.0,353.0,9990.0,4200.0,500.0,1200.0,75.0,82.0,13.6,21.0,8377.0,65.0
75%,3624.0,2424.0,902.0,35.0,69.0,4005.0,967.0,12925.0,5050.0,600.0,1700.0,85.0,92.0,16.5,31.0,10830.0,78.0
max,48094.0,26330.0,6392.0,96.0,100.0,31643.0,21836.0,21700.0,8124.0,2340.0,6800.0,103.0,100.0,39.8,64.0,56233.0,118.0


In [6]:
df.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [7]:
# DATA WRANGLING
df.isna().all()

Private        False
Apps           False
Accept         False
Enroll         False
Top10perc      False
Top25perc      False
F.Undergrad    False
P.Undergrad    False
Outstate       False
Room.Board     False
Books          False
Personal       False
PhD            False
Terminal       False
S.F.Ratio      False
perc.alumni    False
Expend         False
Grad.Rate      False
dtype: bool

In [8]:
# SEPARATE TARGET 
target = df['Private']
X = df.drop('Private', axis = 1)
X.head()

Unnamed: 0,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [9]:
# ENCODE TARGET
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(target)
label = encoder.transform(target)
target = label

In [10]:
# SPLIT THE DATA
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, target, test_size = 0.2)

In [11]:
# INIT OUR MODEL
from sklearn.svm import SVC
model = SVC()
model.fit(x_train, y_train)

SVC()

In [12]:
# PREDICTION TIME!
y_pred = model.predict(x_test)

In [21]:
# check accuracy
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9487179487179487
[[ 36   5]
 [  3 112]]
              precision    recall  f1-score   support

           0       0.92      0.88      0.90        41
           1       0.96      0.97      0.97       115

    accuracy                           0.95       156
   macro avg       0.94      0.93      0.93       156
weighted avg       0.95      0.95      0.95       156



In [14]:
# APPLYING STANDARD SCALER
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_test)
x_test_scl = scaler.transform(x_test)

In [15]:
# PREDICTION USING SCALED DATA
y_pred2 = model.predict(x_test_scl)

In [23]:
# CHECK ACCURACY FOR SCALED DATA
print(accuracy_score(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))

0.26282051282051283
[[ 41   0]
 [115   0]]
              precision    recall  f1-score   support

           0       0.26      1.00      0.42        41
           1       0.00      0.00      0.00       115

    accuracy                           0.26       156
   macro avg       0.13      0.50      0.21       156
weighted avg       0.07      0.26      0.11       156



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# APPLY GRIDSEARCH FOR DIMENSION INCREASE
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0, 0.1, 0.01, 0.001]} 

In [26]:
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 2)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=1; total time=   0.0s
[CV] END .....................................C=0.1, gamma=0; total time=   0.0s
[CV] END .....................................C=0.1, gamma=0; total time=   0.0s
[CV] END .....................................C=0.1, gamma=0; total time=   0.0s
[CV] END .....................................C=0.1, gamma=0; total time=   0.0s
[CV] END .....................................C=0.1, gamma=0; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END ...................................C=0

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100],
                         'gamma': [1, 0, 0.1, 0.01, 0.001]},
             verbose=2)

In [27]:
# GRID PREDICTIONS
grid_pred = grid.predict(x_test)

In [31]:
# GRID ACCURACY
print(accuracy_score(y_test, grid_pred))
print(confusion_matrix(y_test, grid_pred))
print(classification_report(y_test, grid_pred))

0.7371794871794872
[[  0  41]
 [  0 115]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        41
           1       0.74      1.00      0.85       115

    accuracy                           0.74       156
   macro avg       0.37      0.50      0.42       156
weighted avg       0.54      0.74      0.63       156



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
# APPLY KERNEL TRICK
# 1. POLY
model_poly = SVC(kernel='poly')
model_poly.fit(x_train, y_train)


SVC(kernel='poly')

In [40]:
# PREDICTION TIME
y_pred_poly = model_poly.predict(x_test)

In [42]:
# ACCURACY USING POLY KERNEL TRICK
print(accuracy_score(y_test, y_pred_poly))
print(confusion_matrix(y_test, y_pred_poly))
print(classification_report(y_test, y_pred_poly))

0.9551282051282052
[[ 37   4]
 [  3 112]]
              precision    recall  f1-score   support

           0       0.93      0.90      0.91        41
           1       0.97      0.97      0.97       115

    accuracy                           0.96       156
   macro avg       0.95      0.94      0.94       156
weighted avg       0.95      0.96      0.95       156



In [49]:
# 2. RBF
model_rbf = SVC(kernel='rbf')
model_rbf.fit(x_train, y_train)

SVC()

In [51]:
# PREDICTION TIME
y_pred_rbf = model_rbf.predict(x_test)

In [52]:
# ACCURACY USING RBF
print(accuracy_score(y_test, y_pred_rbf))
print(confusion_matrix(y_test, y_pred_rbf))
print(classification_report(y_test ,y_pred_rbf))

0.9487179487179487
[[ 36   5]
 [  3 112]]
              precision    recall  f1-score   support

           0       0.92      0.88      0.90        41
           1       0.96      0.97      0.97       115

    accuracy                           0.95       156
   macro avg       0.94      0.93      0.93       156
weighted avg       0.95      0.95      0.95       156



In [53]:
# 3. SIGMOID
model_sig = SVC(kernel='sigmoid')
model_sig.fit(x_train, y_train)

SVC(kernel='sigmoid')

In [55]:
# PREDICTION TIME
y_pred_sig = model_sig.predict(x_test)

In [56]:
# ACCURACY USING SIGMOID
print(accuracy_score(y_test, y_pred_sig))
print(confusion_matrix(y_test, y_pred_sig))
print(classification_report(y_test, y_pred_sig))

0.75
[[19 22]
 [17 98]]
              precision    recall  f1-score   support

           0       0.53      0.46      0.49        41
           1       0.82      0.85      0.83       115

    accuracy                           0.75       156
   macro avg       0.67      0.66      0.66       156
weighted avg       0.74      0.75      0.74       156

