In [1]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
from stringkernels.kernels import polynomial_string_kernel
from stringkernels.kernels import string_kernel
import scipy


In [2]:
# Load the dataset from CleanedData.csv; the path is relative to the notebook's
# working directory.
df = pd.read_csv('../Data/CleanedData.csv')

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Token-count vectorizer limited to at most 5000 features (max_features=5000).
cv= CountVectorizer(max_features=5000)

In [4]:
# Fit the vectorizer on the whole 'transformed text' column and turn the result
# into a dense array. This runs before the train/test split, so the vocabulary
# is learned from every row, including rows that later land in the test set.
term_counts = cv.fit_transform(df['transformed text'])
x = term_counts.toarray()
# Targets come from the 'subject' column.
y = df['subject'].values

In [5]:
# Display the dimensions of the dense feature matrix (rows, columns).
x.shape

(43125, 5000)

In [6]:
# Hold out 20% of the rows as a test set; random_state=42 fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


# String Kernel

In [7]:
def edit_distance(str1, str2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``str1`` into ``str2``.

    Parameters
    ----------
    str1, str2 : sequence
        Any two sized, iterable sequences whose elements can be compared
        with ``==`` (for example ``str`` or ``list``).

    Returns
    -------
    int
        The edit distance; ``0`` when both sequences are equal, and the
        length of the other sequence when one of them is empty.

    Notes
    -----
    Only the previous row of the dynamic-programming table is ever read, so
    two rows are kept instead of the full ``(m + 1) x (n + 1)`` table. The
    result is unchanged; memory drops from O(m * n) to O(n).
    """
    n = len(str2)

    # Row 0: turning an empty prefix of str1 into str2[:j] takes j insertions.
    previous = list(range(n + 1))

    for i, left in enumerate(str1, start=1):
        # Column 0: turning str1[:i] into an empty string takes i deletions.
        current = [i] + [0] * n
        for j, right in enumerate(str2, start=1):
            cost = 0 if left == right else 1
            current[j] = min(
                previous[j] + 1,         # Deletion
                current[j - 1] + 1,      # Insertion
                previous[j - 1] + cost,  # Substitution (or match when cost == 0)
            )
        previous = current

    return previous[n]


In [8]:
def string_kernel(X1, X2):
    """Build a similarity (Gram) matrix from normalised edit distances.

    Entry ``[i, j]`` is ``1 - edit_distance(X1[i], X2[j]) / max(len(X1[i]),
    len(X2[j]))``, so it lies in ``[0, 1]`` and equals ``1`` for identical
    samples. The previous version stored the raw edit *distance*; a kernel
    handed to ``svm.SVC`` must measure similarity, so a distance matrix
    ranked identical samples lowest and the most different ones highest.

    Parameters
    ----------
    X1, X2 : array-like with a ``shape`` attribute
        Sample collections; ``X1[i]`` and ``X2[j]`` must be sized sequences
        accepted by ``edit_distance``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(X1.shape[0], X2.shape[0])``.

    Notes
    -----
    * This definition shadows the ``string_kernel`` imported from
      ``stringkernels.kernels`` at the top of the notebook.
    * NOTE(review): an edit-distance similarity is not guaranteed to be
      positive semi-definite; confirm this is acceptable for the SVM solver.
    * The cost is one full edit-distance computation per sample pair.
    """
    n_rows, n_cols = X1.shape[0], X2.shape[0]
    kernel_matrix = np.zeros((n_rows, n_cols))

    # When both arguments are the same object the matrix is symmetric, so each
    # pair is computed once and mirrored across the diagonal.
    symmetric = X1 is X2

    for i in range(n_rows):
        first_col = i if symmetric else 0
        for j in range(first_col, n_cols):
            longest = max(len(X1[i]), len(X2[j]))
            if longest == 0:
                # Two empty samples are identical.
                similarity = 1.0
            else:
                similarity = 1.0 - edit_distance(X1[i], X2[j]) / longest
            kernel_matrix[i, j] = similarity
            if symmetric:
                kernel_matrix[j, i] = similarity
    return kernel_matrix

In [None]:

# Fit an SVC that uses the custom string_kernel callable; only the fit is timed.
start = time.time()
svm_classifier = svm.SVC(kernel=string_kernel)
svm_classifier.fit(x_train, y_train)
end = time.time()
elapsed_ms = (end - start) * 10**3
print("The time of execution of above program is :", elapsed_ms, "ms")

# Score the fitted model on the held-out split.
y_pred = svm_classifier.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# GridSearchCV

In [6]:
# Estimators to search over, keyed by a short model name. Each entry pairs the
# estimator instance ('model') with its parameter grid ('params').
svm_grid = {
    'C': [0.2],
    'kernel': [string_kernel],
}
model_params = {
    'svm': {'model': svm.SVC(gamma='auto'), 'params': svm_grid},
}

In [None]:
# Run a 2-fold grid search for every configured model and record the best
# score and parameters found for each; the whole loop is timed.
scores = []
start = time.time()
for model_name, mp in model_params.items():
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=2,
        verbose=4,
        return_train_score=False,
    )
    clf.fit(x_train, y_train)
    best_result = {
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    }
    scores.append(best_result)

end = time.time()
elapsed_ms = (end - start) * 10**3
print("The time of execution of above program is :", elapsed_ms, "ms")

Fitting 2 folds for each of 2 candidates, totalling 4 fits


# Hyperparameter Tuning Using Hyperopt

In [8]:
from hyperopt import fmin, tpe, hp
from sklearn.model_selection import cross_val_score

In [9]:
# Search space handed to Hyperopt. C and gamma are both drawn log-uniformly
# between 0.001 and 10; the kernel choice has a single option, 'rbf'.
log_lower, log_upper = np.log(0.001), np.log(10)
space = {
    'C': hp.loguniform('C', log_lower, log_upper),  # Regularization parameter
    'kernel': hp.choice('kernel', ['rbf']),
    'gamma': hp.loguniform('gamma', log_lower, log_upper),
}

In [10]:
# Objective for Hyperopt: the negated mean cross-validated accuracy, so that
# minimising the objective maximises accuracy. Timing starts here and covers
# the whole search.
start = time.time()
def objective(params):
    """Return minus the mean 2-fold CV accuracy of an SVC built from ``params``."""
    candidate = svm.SVC(**params, decision_function_shape='ovo')  # 'ovo' = one-vs-one
    fold_scores = cross_val_score(
        candidate, x_train, y_train, cv=2, scoring='accuracy', verbose=4
    )
    return -fold_scores.mean()  # Hyperopt minimizes, so the score is negated

# Let the TPE algorithm evaluate 5 configurations drawn from `space`.
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=5)
end = time.time()
elapsed_mins = (end - start) / 60
print("The time of execution of above program is :", elapsed_mins, "mins")
print("Best Hyperparameters:")
print(best)

[CV] END ................................ score: (test=0.723) total time=16.3min
[CV] END ................................ score: (test=0.723) total time=17.0min
[CV] END ................................ score: (test=0.408) total time=28.3min
[CV] END ................................ score: (test=0.408) total time=27.1min
[CV] END ................................ score: (test=0.408) total time=33.8min
[CV] END ................................ score: (test=0.408) total time=34.0min
[CV] END ................................ score: (test=0.370) total time=57.7min
[CV] END ................................ score: (test=0.367) total time=56.7min
[CV] END ................................ score: (test=0.408) total time=34.2min
[CV] END ................................ score: (test=0.408) total time=35.4min
100%|██████| 5/5 [5:40:30<00:00, 4086.17s/trial, best loss: -0.7229565217391305]
The time of execution of above program is : 340.51402212778726 mins
Best Hyperparameters:
{'C': 0.09624152050

In [12]:
# Refit an RBF-kernel SVC with hard-coded C and gamma values (C matches the
# value printed by the Hyperopt search) and time the fit only.
start = time.time()
svm_classifier = svm.SVC(
    kernel='rbf',
    gamma=0.0016902116206715884,
    C=0.09624152050519157,
)
svm_classifier.fit(x_train, y_train)
end = time.time()
elapsed_ms = (end - start) * 10**3
print("The time of execution of above program is :", elapsed_ms, "ms")

# Score the fitted model on the held-out split.
y_pred = svm_classifier.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

The time of execution of above program is : 1632622.5192546844 ms
Accuracy: 0.7479420289855072
