In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from collections import defaultdict

In [2]:
df = pd.read_csv('df_without_outliers.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)

In [3]:
df_new = df.copy()

In [4]:
from sklearn.model_selection import train_test_split, cross_val_score 

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

## Popularity Class

In [5]:
def classify_popularity(popularity):
    if popularity <= 40:
        return 'low'
    elif 40 < popularity <= 70:
        return 'medium'
    else:
        return 'high'

df_new['popularity_class'] = df_new['popularity'].apply(classify_popularity)

In [6]:
df_new['popularity_class'].value_counts()

popularity_class
low       57182
medium    29159
high       3000
Name: count, dtype: int64

In [7]:
df_new.drop(columns=['popularity'], inplace=True)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [9]:
X = df_new.drop(columns=['popularity_class']).values
y = np.array(df_new['popularity_class'])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)

In [11]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Linear SVC

In [13]:
from sklearn.svm import LinearSVC

In [None]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1,10],  # Inverse regularization strength
    'loss': ['hinge', 'squared_hinge'], # Loss function
    'tol': [ 0.01, 0.1, 1],
    'penalty':['l1', 'l2'],
    'class_weight': ['dict','balanced']}
# create a LinearSVC object
lsvc = LinearSVC(random_state=100)  # set random state for reproducibility
# perform grid search with cross-validation
grid_search = GridSearchCV(estimator=lsvc, param_grid=param_grid, cv=5)
# fit the grid search to the training data
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")


KeyboardInterrupt



In [14]:
clf = LinearSVC(random_state=42, C=1.023292992280754, class_weight= 'balanced', penalty='l2', loss = 'squared_hinge', tol=1)
clf.fit(X_train, y_train)



In [15]:
y_pred = clf.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.6371301719956721
F1-score [0.28258488 0.75774297 0.46657639]
              precision    recall  f1-score   support

        high       0.19      0.57      0.28       900
         low       0.77      0.74      0.76     17155
      medium       0.51      0.43      0.47      8748

    accuracy                           0.64     26803
   macro avg       0.49      0.58      0.50     26803
weighted avg       0.67      0.64      0.65     26803



In [None]:
from sklearn.calibration import CalibratedClassifierCV

# create the calibrated classifier
clf_calibrated = CalibratedClassifierCV(clf, cv=5, method='sigmoid')

clf_calibrated.fit(X_train, y_train)
y_proba = clf_calibrated.predict_proba(X_test)

# calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")
print("ROC AUC Score (Calibrated):", roc_auc)



ROC AUC Score (Calibrated): 0.7938729759868123




### Nonlinear SVM

In [18]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [88]:
param_grid_svc = {
  'C': [ 0.01, 0.1, 1],  # Wider search range
  'gamma': ['scale'],
  'tol': [0.01, 0.1, 1] 
}

In [None]:
# create a SVC object
svc = SVC(random_state=10)  # set random state for reproducibility

# do grid search find the best parameters
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid_svc, cv=5)

# fit the result of the grid search
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [19]:
# kernel = sigmoid
clf_sigmoid = SVC(random_state=42, C=1.02, class_weight= 'balanced', tol=1, gamma = 'scale', kernel = 'sigmoid' )
clf_sigmoid.fit(X_train, y_train)

In [20]:
# kernel = sigmoid
y_pred_sigmoid = clf_sigmoid.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred_sigmoid))
print('F1-score %s' % f1_score(y_test, y_pred_sigmoid, average=None))
print(classification_report(y_test, y_pred_sigmoid))

Accuracy 0.4440174607320076
F1-score [0.16590617 0.5796314  0.37012513]
              precision    recall  f1-score   support

        high       0.09      0.79      0.17       900
         low       0.78      0.46      0.58     17155
      medium       0.36      0.38      0.37      8748

    accuracy                           0.44     26803
   macro avg       0.41      0.54      0.37     26803
weighted avg       0.62      0.44      0.50     26803



In [None]:
# kernel = sigmoid

clf_calibrated_sigmoid = CalibratedClassifierCV(clf_sigmoid, cv=5)
clf_calibrated_sigmoid.fit(X_train, y_train)

y_proba_sigmoid = clf_calibrated_sigmoid.predict_proba(X_test)

# calculate ROC AUC score
roc_auc_sigmoid = roc_auc_score(y_test, y_proba_sigmoid, multi_class="ovr", average="macro")
print("ROC AUC Score (Calibrated):", roc_auc_sigmoid)

ROC AUC Score (Calibrated): 0.7229663387183809


In [22]:
# kernel = poly
clf_poly = SVC(random_state=42, C=1.02, class_weight= 'balanced', tol=1, gamma = 'scale', kernel = 'poly' )
clf_poly.fit(X_train, y_train)

In [102]:
# kernel = poly
y_pred_poly = clf_poly.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred_poly))
print('F1-score %s' % f1_score(y_test, y_pred_poly, average=None))
print(classification_report(y_test, y_pred_poly))

Accuracy 0.6582472111330822
F1-score [0.29287911 0.76999773 0.55506274]
              precision    recall  f1-score   support

        high       0.19      0.69      0.29       900
         low       0.87      0.69      0.77     17155
      medium       0.53      0.59      0.56      8748

    accuracy                           0.66     26803
   macro avg       0.53      0.66      0.54     26803
weighted avg       0.73      0.66      0.68     26803



In [25]:
# kernel = poly
clf_calibrated_poly = CalibratedClassifierCV(clf_poly, cv=5)
clf_calibrated_poly.fit(X_train, y_train)

# Now you can use predict_proba on the calibrated classifier
y_proba_poly = clf_calibrated_poly.predict_proba(X_test)

# Calculate ROC AUC score
roc_auc_poly = roc_auc_score(y_test, y_proba_poly, multi_class="ovr", average="macro")
print("ROC AUC Score (Calibrated):", roc_auc_poly)

ROC AUC Score (Calibrated): 0.824111551864235


In [24]:
# kernel = rbf
clf_rbf = SVC(random_state=42, C=1.02, class_weight= 'balanced', tol=1, gamma = 'scale', kernel = 'rbf' )
clf_rbf.fit(X_train, y_train)

In [104]:
# kernel = rbf
y_pred_rbf = clf_rbf.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred_rbf))
print('F1-score %s' % f1_score(y_test, y_pred_rbf, average=None))
print(classification_report(y_test, y_pred_rbf))

Accuracy 0.6795134872961982
F1-score [0.32233742 0.78619329 0.58821666]
              precision    recall  f1-score   support

        high       0.20      0.76      0.32       900
         low       0.90      0.70      0.79     17155
      medium       0.55      0.64      0.59      8748

    accuracy                           0.68     26803
   macro avg       0.55      0.70      0.57     26803
weighted avg       0.76      0.68      0.71     26803



In [None]:
# kernel = rbf
clf_calibrated_rbf = CalibratedClassifierCV(clf_rbf, cv=5)
clf_calibrated_rbf.fit(X_train, y_train)

y_proba_rbf = clf_calibrated_rbf.predict_proba(X_test)

# calculate ROC AUC score
roc_auc_rbf = roc_auc_score(y_test, y_proba_rbf, multi_class="ovr", average="macro")
print("ROC AUC Score (Calibrated):", roc_auc_rbf)

ROC AUC Score (Calibrated): 0.8518570154050243


In [28]:
# kernel = rbf, gamma = auto
clf_rbf_auto = SVC(random_state=42, C=2, class_weight= 'balanced', tol=1, gamma = 'auto', kernel = 'rbf' )
clf_rbf_auto.fit(X_train, y_train)

In [110]:
# kernel = rbf, gamma = auto
y_pred_rbf_auto = clf_rbf_auto.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred_rbf_auto))
print('F1-score %s' % f1_score(y_test, y_pred_rbf_auto, average=None))
print(classification_report(y_test, y_pred_rbf_auto))

Accuracy 0.6934671491997164
F1-score [0.33661315 0.79626265 0.59883721]
              precision    recall  f1-score   support

        high       0.22      0.72      0.34       900
         low       0.90      0.72      0.80     17155
      medium       0.56      0.65      0.60      8748

    accuracy                           0.69     26803
   macro avg       0.56      0.70      0.58     26803
weighted avg       0.76      0.69      0.72     26803



In [29]:
# kernel = rbf
clf_calibrated_rbf_auto = CalibratedClassifierCV(clf_rbf_auto, cv=5)
clf_calibrated_rbf_auto.fit(X_train, y_train)

# Now you can use predict_proba on the calibrated classifier
y_proba_rbf_auto = clf_calibrated_rbf_auto.predict_proba(X_test)

# Calculate ROC AUC score
roc_auc_rbf_auto = roc_auc_score(y_test, y_proba_rbf_auto, multi_class="ovr", average="macro")
print("ROC AUC Score (Calibrated):", roc_auc_rbf_auto)

ROC AUC Score (Calibrated): 0.8537105000336004


# Mode Class

In [30]:
df_new_2= df.copy()

In [31]:
df_new_2['mode'].value_counts()

mode
1    56906
0    32435
Name: count, dtype: int64

In [32]:
X_mode = df_new_2.drop(columns=['mode']).values
y_mode = np.array(df_new_2['mode'])

In [33]:
X_train_mode, X_test_mode, y_train_mode, y_test_mode = train_test_split(
    X_mode, y_mode, test_size=0.3, random_state=100)

In [34]:
scl = StandardScaler()
X_train_mode = scl.fit_transform(X_train_mode)
X_test_mode = scl.transform(X_test_mode)

## Linear SVM

In [None]:
from sklearn.svm import LinearSVC

param_grid = {
  'C': [0.01, 0.1, 1, 10],  # Regularization parameter (higher C = lower regularization)
  'loss': ['hinge', 'squared_hinge'],  # Loss functions for linear SVC
  'tol': [0.001, 0.01, 0.1],  # Tolerance for stopping optimization (smaller = stricter)
  'penalty': ['l1', 'l2'],  # Type of regularization (L1 or L2)
  'class_weight': ['dict', 'balanced']  # Class weights for imbalanced datasets
}

# create a LinearSVC object
lsvc = LinearSVC(random_state=100)  # Set random state for reproducibility

# do grid search to find the best parameters
grid_search = GridSearchCV(estimator=lsvc, param_grid=param_grid, cv=5)
grid_search.fit(X_train_mode, y_train_mode)

# get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

360 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "E:\Anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Anaconda3\Lib\site-packages\sklearn\svm\_classes.py", line 261, in fit
    self._validate_params()
  File "E:\Anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "E:\Anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidP

Best Parameters: {'C': 10, 'class_weight': 'balanced', 'loss': 'squared_hinge', 'penalty': 'l2', 'tol': 0.001}
Best Score: 0.6034252107594931




In [35]:
clf_mode = LinearSVC(random_state=42, C=10, class_weight= 'balanced', loss = 'squared_hinge', penalty = 'l2', tol = 0.001)
clf_mode.fit(X_train_mode, y_train_mode)



In [134]:
y_pred_mode = clf_mode.predict(X_test_mode)

print('Accuracy %s' % accuracy_score(y_test_mode, y_pred_mode))
print('F1-score %s' % f1_score(y_test_mode, y_pred_mode, average=None))
print(classification_report(y_test_mode, y_pred_mode))

Accuracy 0.592657538335261
F1-score [0.48572774 0.66277489]
              precision    recall  f1-score   support

           0       0.45      0.53      0.49      9745
           1       0.70      0.63      0.66     17058

    accuracy                           0.59     26803
   macro avg       0.57      0.58      0.57     26803
weighted avg       0.61      0.59      0.60     26803



In [None]:
clf_mode_calibrated = CalibratedClassifierCV(clf_mode)
clf_mode_calibrated.fit(X_train_mode, y_train_mode)
y_pred_proba_mode = clf_mode_calibrated.predict_proba(X_test_mode)[:, 1]

# calculate ROC AUC score
roc_value_mode = roc_auc_score(y_test_mode, y_pred_proba_mode)
print('ROC AUC:', roc_value_mode)



ROC AUC: 0.6418028106924729




## Non Linear SVM

In [135]:
param_grid_svc = {
  'C': [ 0.01, 0.1, 1],  # Wider search range
  'gamma': ['scale'],
  'tol': [0.01, 0.1, 1] 
}

In [None]:
# create a SVC object
svc = SVC(random_state=10)  # set random state for reproducibility

# do grid search to find the best parameters
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid_svc, cv=5)

# fit with the best parameters
grid_search.fit(X_train_mode, y_train_mode)

In [None]:
# get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

Best Parameters: {'C': 1, 'gamma': 'scale', 'tol': 1}
Best Score: 0.6691292697004292


In [39]:
# Kernel = sigmoid
clf_mode_sigmod = SVC(random_state=42, C=1, tol = 1, gamma = 'scale', class_weight= 'balanced', kernel = 'sigmoid' )
clf_mode_sigmod.fit(X_train_mode, y_train_mode)

In [141]:
# kernel = sigmoid
y_pred_mode_sigmoid = clf_mode_sigmod.predict(X_test_mode)

print('Accuracy %s' % accuracy_score(y_test_mode, y_pred_mode_sigmoid))
print('F1-score %s' % f1_score(y_test_mode, y_pred_mode_sigmoid, average=None))
print(classification_report(y_test_mode, y_pred_mode_sigmoid))

Accuracy 0.5377383128754244
F1-score [0.45681719 0.59767502]
              precision    recall  f1-score   support

           0       0.40      0.53      0.46      9745
           1       0.67      0.54      0.60     17058

    accuracy                           0.54     26803
   macro avg       0.53      0.54      0.53     26803
weighted avg       0.57      0.54      0.55     26803



In [40]:
clf_mode_calibrated_sigmoid = CalibratedClassifierCV(clf_mode_sigmod)
clf_mode_calibrated_sigmoid.fit(X_train_mode, y_train_mode)
y_pred_proba_mode_sigmoid = clf_mode_calibrated_sigmoid.predict_proba(X_test_mode)[:, 1]

# Calculate ROC AUC score
roc_value_mode_sigmoid = roc_auc_score(y_test_mode, y_pred_proba_mode_sigmoid)
print('ROC AUC:', roc_value_mode_sigmoid)

ROC AUC: 0.5531414476345786


In [41]:
# Kernel = poly
clf_mode_poly = SVC(random_state=42, C=1, tol = 1, gamma = 'scale', class_weight= 'balanced', kernel = 'poly' )
clf_mode_poly.fit(X_train_mode, y_train_mode)

In [143]:
# kernel = poly
y_pred_mode_poly = clf_mode_poly.predict(X_test_mode)

print('Accuracy %s' % accuracy_score(y_test_mode, y_pred_mode_poly))
print('F1-score %s' % f1_score(y_test_mode, y_pred_mode_poly, average=None))
print(classification_report(y_test_mode, y_pred_mode_poly))

Accuracy 0.6325784427116368
F1-score [0.54684336 0.69103344]
              precision    recall  f1-score   support

           0       0.50      0.61      0.55      9745
           1       0.74      0.65      0.69     17058

    accuracy                           0.63     26803
   macro avg       0.62      0.63      0.62     26803
weighted avg       0.65      0.63      0.64     26803



In [None]:
clf_mode_calibrated_poly = CalibratedClassifierCV(clf_mode_poly)
clf_mode_calibrated_poly.fit(X_train_mode, y_train_mode)
y_pred_proba_mode_poly = clf_mode_calibrated_poly.predict_proba(X_test_mode)[:, 1]

# calculate ROC AUC score
roc_value_mode_poly = roc_auc_score(y_test_mode, y_pred_proba_mode_poly)
print('ROC AUC:', roc_value_mode_poly)

ROC AUC: 0.6794010126077564


In [43]:
# Kernel = rbf
clf_mode_rbf = SVC(random_state=42, C=1, tol = 1, gamma = 'scale', kernel = 'rbf' )
clf_mode_rbf.fit(X_train_mode, y_train_mode)

In [147]:
# kernel = rbf
y_pred_mode_rbf = clf_mode_rbf.predict(X_test_mode)

print('Accuracy %s' % accuracy_score(y_test_mode, y_pred_mode_rbf))
print('F1-score %s' % f1_score(y_test_mode, y_pred_mode_rbf, average=None))
print(classification_report(y_test_mode, y_pred_mode_rbf))

Accuracy 0.6718277804723352
F1-score [0.3501773  0.78048415]
              precision    recall  f1-score   support

           0       0.63      0.24      0.35      9745
           1       0.68      0.92      0.78     17058

    accuracy                           0.67     26803
   macro avg       0.65      0.58      0.57     26803
weighted avg       0.66      0.67      0.62     26803



In [44]:
clf_mode_calibrated_rbf = CalibratedClassifierCV(clf_mode_rbf)
clf_mode_calibrated_rbf.fit(X_train_mode, y_train_mode)
y_pred_proba_mode_rbf = clf_mode_calibrated_rbf.predict_proba(X_test_mode)[:, 1]

# Calculate ROC AUC score
roc_value_mode_rbf = roc_auc_score(y_test_mode, y_pred_proba_mode_rbf)
print('ROC AUC:', roc_value_mode_rbf)

ROC AUC: 0.680843548233501
