In [1]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score,confusion_matrix,classification_report

import time
import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
from stringkernels.kernels import polynomial_string_kernel
from stringkernels.kernels import string_kernel
import scipy
import random

In [2]:
df = pd.read_csv('../Data/CleanedData.csv')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)

In [4]:
X= tfidf.fit_transform(df['transformed text']).toarray()
y= df['subject'].values

In [5]:
X.shape

(43125, 5000)

In [6]:
from sklearn.preprocessing import LabelEncoder

# Example: Assuming 'y_train' contains string labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Then, you can use 'y_train_encoded' as labels in your classifier.


In [7]:
X.shape

(43125, 5000)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:

import time
from sklearn import svm
from sklearn.metrics import accuracy_score

# Assuming you have already defined your string_kernel function
# and loaded your data into x_train and y_train

start = time.time()
svm_classifier = svm.SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)
end = time.time()

print("The time of execution of the above program is:", (end - start) * 1000, "ms")

# Print the support vectors and their coefficients
support_vectors = svm_classifier.support_vectors_
dual_coefficients = svm_classifier.dual_coef_

print("Number of support vectors:", len(support_vectors))

for i, (sv, coef) in enumerate(zip(support_vectors, dual_coefficients[0])):
    print(f"Support Vector {i + 1}:")
    print("Vector:", sv)
    print("Coefficient (Weight):", coef)

y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average = 'weighted')
print("F1 Score:", f1)
print("Accuracy:", accuracy)


The time of execution of the above program is: 971044.6088314056 ms
Number of support vectors: 17982
Support Vector 1:
Vector: [0. 0. 0. ... 0. 0. 0.]
Coefficient (Weight): 0.0
Support Vector 2:
Vector: [0. 0. 0. ... 0. 0. 0.]
Coefficient (Weight): 0.48558464376689664
Support Vector 3:
Vector: [0. 0. 0. ... 0. 0. 0.]
Coefficient (Weight): 1.0
Support Vector 4:
Vector: [0. 0. 0. ... 0. 0. 0.]
Coefficient (Weight): 0.0
Support Vector 5:
Vector: [0. 0. 0. ... 0. 0. 0.]
Coefficient (Weight): 1.0
Support Vector 6:
Vector: [0. 0. 0. ... 0. 0. 0.]
Coefficient (Weight): 0.8464757935208291
Support Vector 7:
Vector: [0. 0. 0. ... 0. 0. 0.]
Coefficient (Weight): 1.0
Support Vector 8:
Vector: [0. 0. 0. ... 0. 0. 0.]
Coefficient (Weight): 0.0
Support Vector 9:
Vector: [0. 0. 0. ... 0. 0. 0.]
Coefficient (Weight): 0.32385693959454137
Support Vector 10:
Vector: [0. 0. 0. ... 0. 0. 0.]
Coefficient (Weight): 0.0
Support Vector 11:
Vector: [0. 0. 0. ... 0. 0. 0.]
Coefficient (Weight): 0.0
Support Vector

# With Over Sampled Data

In [9]:

import time
from sklearn import svm
from sklearn.metrics import accuracy_score

# Assuming you have already defined your string_kernel function
# and loaded your data into x_train and y_train

start = time.time()
svm_classifier = svm.SVC(kernel='linear')
svm_classifier.fit(X_resampled, y_resampled)
end = time.time()

print("The time of execution of the above program is:", (end - start) * 1000, "ms")

# Print the support vectors and their coefficients
support_vectors = svm_classifier.support_vectors_
dual_coefficients = svm_classifier.dual_coef_

print("Number of support vectors:", len(support_vectors))

for i, (sv, coef) in enumerate(zip(support_vectors, dual_coefficients[0])):
    print(f"Support Vector {i + 1}:")
    print("Vector:", sv)
    print("Coefficient (Weight):", coef)

y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average = 'weighted')
print("F1 Score of Oversampled Data:", f1)
print("Accuracy:", accuracy)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



F1 Score of Oversampled Data: 0.7643872000230547
Accuracy: 0.7414492753623189


# GridSearchCV

In [6]:
model_params= {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [0.2],
            'kernel': [string_kernel]
        }
    }
}

In [None]:
scores= []
start = time.time()
for model_name, mp in model_params.items():
    clf= GridSearchCV(mp['model'], mp['params'], cv=2, verbose = 4, return_train_score=False)
    clf.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    

end = time.time()
print("The time of execution of above program is :",
      (end-start) * 10**3, "ms")

Fitting 2 folds for each of 2 candidates, totalling 4 fits


# HyperParameter Tuning Using HyperOpt

In [10]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVC



In [11]:
# Define the hyperparameter search space
space = {
    'C': hp.loguniform('C', np.log(0.001), np.log(10)),  # Regularization parameter
    'kernel': hp.choice('kernel', ['rbf', 'linear','sigmoid']),
    'gamma': hp.loguniform('gamma', np.log(0.001), np.log(10))
}

In [None]:
# Define the objective function to maximize (e.g., cross-validated accuracy)
start = time.time()
def objective(params):
    # Define hyperparameters
    C = params['C']
    kernel = params['kernel']
    gamma = params['gamma']
    
    # Create an SVM classifier with the specified hyperparameters
    clf = SVC(C=C, kernel=kernel, gamma=gamma, random_state=42)
    
    # Train the classifier
    clf.fit(X_resampled, y_resampled)
    
    # Make predictions on the test set
    y_pred = clf.predict(X_test)
    
    # Calculate F1 score
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    return {'loss': -f1, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,  # Number of optimization iterations
            trials=trials,
            rstate=np.random.default_rng(42))

end = time.time()
print("The time of execution of above program is :",
      (end-start) * 10**3, "ms")
# Get the best hyperparameters and the corresponding loss
best_hyperparams = space_eval(space, best)
best_loss = -trials.best_trial['result']['loss']

# Print the best hyperparameters and corresponding loss
print("Best Hyperparameters:")
print(best_hyperparams)
print("Best Weighted F1 Score (Loss):", best_loss)

  4%|  | 2/50 [2:41:59<63:07:47, 4734.74s/trial, best loss: -0.7664095764981206]

In [12]:
start = time.time()
svm_classifier = svm.SVC(kernel='linear',gamma = 0.002784873298856794, C = 0.5666503019561641)
svm_classifier.fit(X_resampled, y_resampled)
end = time.time()
print("The time of execution of model is :",
      (end-start) * (10**3/1000)/60, "mins")
y_pred = svm_classifier.predict(X_test)
f1 = f1_score(y_test, y_pred, average = 'weighted')
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(f'F1 Score: {f1:.4f}')
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

The time of execution of model is : 68.89560632308324 mins
Accuracy: 0.7426086956521739
F1 Score: 0.7661
Confusion Matrix:
 [[ 111   15    1  120   36    4]
 [  45 1670    4  112   44    1]
 [   7    7  143    7    1    0]
 [ 197   72    1  488   70    7]
 [ 338  105    2  776 2105  149]
 [   5    1    1    5   87 1888]]
Classification Report:
               precision    recall  f1-score   support

           0       0.16      0.39      0.22       287
           1       0.89      0.89      0.89      1876
           2       0.94      0.87      0.90       165
           3       0.32      0.58      0.42       835
           4       0.90      0.61      0.72      3475
           5       0.92      0.95      0.94      1987

    accuracy                           0.74      8625
   macro avg       0.69      0.71      0.68      8625
weighted avg       0.82      0.74      0.77      8625



# Using Word2Vec as Vectoriser

In [3]:
df = pd.read_csv('word2vecData.csv')

In [4]:
import ast

# Convert the string representation to a list
df['vec'] = df['vec'].apply(ast.literal_eval)

# Verify the data type of the column
print(type(df['vec'].iloc[0]))

<class 'list'>


In [5]:
X = df['vec'].to_list()
y = df['subject'].to_list()

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)

In [7]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report

svm_classifier = SVC(kernel='linear',gamma = 0.002784873298856794, C = 0.5666503019561641)
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)

# Calculate weighted F1 score
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("Weighted F1 Score:", weighted_f1)

Weighted F1 Score: 0.6627216557612167
