In [5]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score,confusion_matrix,classification_report

import time
import pandas as pd
from sklearn.model_selection import GridSearchCV
import numpy as np
from stringkernels.kernels import polynomial_string_kernel
from stringkernels.kernels import string_kernel
import scipy
import random
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('../Data/CleanedEcommerce.csv')

In [8]:
df = df.dropna()

In [3]:
text = 'desc'
label = 'label'
num_classes = df[label].nunique()

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)

In [9]:
label_encoder = LabelEncoder()

df[label] = label_encoder.fit_transform(df[label])

In [10]:
X= tfidf.fit_transform(df[text])
y= df[label].values

In [11]:
X.shape

(27801, 5000)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.4, random_state=42)




In [13]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [14]:

import time
from sklearn import svm
from sklearn.metrics import accuracy_score

start = time.time()
svm_classifier = svm.SVC(kernel='rbf')
svm_classifier.fit(X_resampled, y_resampled)
end = time.time()

print("The time of execution of the above program is:", (end - start)/60, "Mins")



y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average = 'weighted')
class_report = classification_report(y_test, y_pred)
print("F1 Score:", f1)
print("Accuracy:", accuracy)
print("Classification Report :/n", class_report)


The time of execution of the above program is: 2.78784849246343 Mins
F1 Score: 0.9590981003067038
Accuracy: 0.9591011235955056
Classification Report :/n               precision    recall  f1-score   support

           0       0.96      0.94      0.95       489
           1       0.99      0.98      0.99       449
           2       0.94      0.94      0.94       410
           3       0.95      0.97      0.96       877

    accuracy                           0.96      2225
   macro avg       0.96      0.96      0.96      2225
weighted avg       0.96      0.96      0.96      2225



# HyperParameter Tuning Using HyperOpt

In [15]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVC



In [16]:
# Define the hyperparameter search space
space = {
    'C': hp.loguniform('C', np.log(0.001), np.log(10)),  # Regularization parameter
    'kernel': hp.choice('kernel', ['rbf', 'linear','sigmoid']),
    'gamma': hp.loguniform('gamma', np.log(0.001), np.log(10))
}

In [17]:
# Define the objective function to maximize (e.g., cross-validated accuracy)
start = time.time()
def objective(params):
    # Define hyperparameters
    C = params['C']
    kernel = params['kernel']
    gamma = params['gamma']
    
    # Create an SVM classifier with the specified hyperparameters
    clf = SVC(C=C, kernel=kernel, gamma=gamma, random_state=42)
    
    # Train the classifier
    clf.fit(X_resampled, y_resampled)
    
    # Make predictions on the test set
    y_pred = clf.predict(X_val)
    
    # Calculate F1 score
    f1 = f1_score(y_val, y_pred, average='weighted')
    
    if f1<=0.95:
        return {'loss': -f1, 'status': STATUS_OK}
    else:
        return {'loss': 1.00, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10,  # Number of optimization iterations
            trials=trials,
            rstate=np.random.default_rng(42))

end = time.time()
print("The time of execution of above program is :",
      (end-start)/60, "Mins")
# Get the best hyperparameters and the corresponding loss
best_hyperparams = space_eval(space, best)
best_loss = -trials.best_trial['result']['loss']

# Print the best hyperparameters and corresponding loss
print("Best Hyperparameters:")
print(best_hyperparams)
print("Best Weighted F1 Score (Loss):", best_loss)

100%|███████| 10/10 [53:00<00:00, 318.04s/trial, best loss: -0.9495629481636366]
The time of execution of above program is : 53.006715297698975 Mins
Best Hyperparameters:
{'C': 3.4332584837247335, 'gamma': 0.3023263429485403, 'kernel': 'rbf'}
Best Weighted F1 Score (Loss): 0.9495629481636366


In [18]:
start = time.time()
svm_classifier = svm.SVC(
    kernel='rbf',
    gamma =  0.3023263429485403, 
    C = 3.4332584837247335
)
svm_classifier.fit(X_resampled, y_resampled)
end = time.time()
print("The time of execution of model is :",
      (end-start) * (10**3/1000)/60, "mins")
y_pred = svm_classifier.predict(X_test)
f1 = f1_score(y_test, y_pred, average = 'weighted')
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(f'F1 Score: {f1:.4f}')
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

The time of execution of model is : 1.0421004891395569 mins
Accuracy: 0.9591011235955056
F1 Score: 0.9591
Confusion Matrix:
 [[461   1   9  18]
 [  3 441   1   4]
 [  5   1 385  19]
 [ 12   5  13 847]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95       489
           1       0.98      0.98      0.98       449
           2       0.94      0.94      0.94       410
           3       0.95      0.97      0.96       877

    accuracy                           0.96      2225
   macro avg       0.96      0.96      0.96      2225
weighted avg       0.96      0.96      0.96      2225

