In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
df = pd.read_csv('df_without_outliers.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)

In [4]:
df_new = df.copy()

In [5]:
from sklearn.model_selection import train_test_split, cross_val_score 

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

## Popularity Class

## Random Forest

In [6]:
def classify_popularity(popularity):
    if popularity <= 40:
        return 'low'
    elif 40 < popularity <= 70:
        return 'medium'
    else:
        return 'high'

df_new['popularity_class'] = df_new['popularity'].apply(classify_popularity)

In [7]:
df_new.drop(columns=['popularity'], inplace=True)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [9]:
X = df_new.drop(columns=['popularity_class']).values
y = np.array(df_new['popularity_class'])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)

In [11]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# classifier definition (assuming RandomForestClassifier)
clf = RandomForestClassifier(random_state=0)

# define parameters for grid search
param_grid = {
  'n_estimators': [10, 100, 200],
  'criterion': ["gini", "entropy", "log_loss"],
}

# do grid search to find the best parameters
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)

In [None]:
grid_search.fit(X_train, y_train)

# get the best parameters
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

Best Parameters: {'criterion': 'entropy', 'n_estimators': 200}
Best Score: 0.7885286765794269


In [16]:
clf = RandomForestClassifier(random_state=42, n_estimators=200, criterion= 'entropy', max_depth = 30, min_samples_split = 2, min_samples_leaf = 1)
clf.fit(X_train, y_train)

In [17]:
y_pred = clf.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.7915904936014625
F1-score [0.22160149 0.86269371 0.68349782]
              precision    recall  f1-score   support

        high       0.68      0.13      0.22       900
         low       0.84      0.88      0.86     17155
      medium       0.69      0.68      0.68      8748

    accuracy                           0.79     26803
   macro avg       0.74      0.57      0.59     26803
weighted avg       0.79      0.79      0.78     26803



In [18]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, clf.predict_proba(X_test), multi_class="ovr", average="macro"))

0.9012329564748885


## Bagging

In [19]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

In [21]:
clf_bagging = BaggingClassifier(estimator=None, n_estimators=200, random_state=0)
clf_bagging.fit(X_train, y_train)

y_pred_bagging = clf_bagging.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred_bagging))
print('F1-score %s' % f1_score(y_test, y_pred_bagging, average=None))
print(classification_report(y_test, y_pred_bagging))

Accuracy 0.8028951982986979
F1-score [0.29259897 0.87066697 0.70125858]
              precision    recall  f1-score   support

        high       0.65      0.19      0.29       900
         low       0.85      0.89      0.87     17155
      medium       0.70      0.70      0.70      8748

    accuracy                           0.80     26803
   macro avg       0.74      0.59      0.62     26803
weighted avg       0.80      0.80      0.80     26803



In [22]:
print(roc_auc_score(y_test, clf_bagging.predict_proba(X_test), multi_class="ovr", average="macro"))

0.9081588067104706


In [23]:
# Bagging - SVC
clf_bagging_svc = BaggingClassifier(estimator=SVC(C=1, tol=1), n_estimators=100, random_state=0)
clf_bagging_svc.fit(X_train, y_train)

In [24]:
# Bagging - SVC
y_pred_bagging_svc = clf_bagging_svc.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred_bagging_svc))
print('F1-score %s' % f1_score(y_test, y_pred_bagging_svc, average=None))
print(classification_report(y_test, y_pred_bagging_svc))

Accuracy 0.7619296347423796
F1-score [0.15774099 0.83841736 0.64870593]
              precision    recall  f1-score   support

        high       0.64      0.09      0.16       900
         low       0.83      0.85      0.84     17155
      medium       0.63      0.66      0.65      8748

    accuracy                           0.76     26803
   macro avg       0.70      0.53      0.55     26803
weighted avg       0.76      0.76      0.75     26803



In [29]:
print(roc_auc_score(y_test, clf_bagging_svc.predict_proba(X_test), multi_class="ovr", average="macro"))

0.743601300430802


In [25]:
# Bagging - Random Forest
clf_bagging_randomforest = BaggingClassifier(estimator=RandomForestClassifier(n_estimators=100), n_estimators=100)
clf_bagging_randomforest.fit(X_train, y_train)

In [26]:
# Bagging - Random Forest
y_pred = clf_bagging_randomforest.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.7871133828302802
F1-score [0.1702544  0.85987424 0.67660721]
              precision    recall  f1-score   support

        high       0.71      0.10      0.17       900
         low       0.84      0.88      0.86     17155
      medium       0.68      0.67      0.68      8748

    accuracy                           0.79     26803
   macro avg       0.74      0.55      0.57     26803
weighted avg       0.78      0.79      0.78     26803



In [30]:
print(roc_auc_score(y_test, clf_bagging_randomforest.predict_proba(X_test), multi_class="ovr", average="macro"))

0.89766789684021


## AdaBoosting

In [13]:
from sklearn.ensemble import AdaBoostClassifier

In [32]:
param_grid = {'n_estimators': [100, 200],  # Number of boosting stages
    'learning_rate': [0.1, 0.5, 1.0]  # Learning rate for each boosting stage
}

In [None]:
# create the AdaBoost classifier with a decision tree base estimator by default
ada_boost = AdaBoostClassifier()

# do se GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=ada_boost, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)



In [None]:
# get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")

Best Parameters: {'learning_rate': 1.0, 'n_estimators': 200}


In [35]:
clf_adaboost = AdaBoostClassifier(n_estimators=200,learning_rate=1.0)
clf_adaboost.fit(X_train, y_train)



In [36]:
# Adaboost
y_pred_adaboost = clf_adaboost.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred_adaboost))
print('F1-score %s' % f1_score(y_test, y_pred_adaboost, average=None))
print(classification_report(y_test, y_pred_adaboost))

Accuracy 0.7399171734507332
F1-score [0.24075666 0.8194371  0.62439554]
              precision    recall  f1-score   support

        high       0.53      0.16      0.24       900
         low       0.83      0.81      0.82     17155
      medium       0.60      0.66      0.62      8748

    accuracy                           0.74     26803
   macro avg       0.65      0.54      0.56     26803
weighted avg       0.74      0.74      0.74     26803



In [37]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, clf_adaboost.predict_proba(X_test), multi_class="ovr", average="macro"))

0.7040600696070743


In [14]:
# Adaboost - Random Forest
clf_adaboost_randomforest = AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=100), n_estimators=100, random_state=0)
clf_adaboost_randomforest.fit(X_train, y_train)



In [15]:
y_pred_adaboost_randomforest = clf_adaboost_randomforest.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred_adaboost_randomforest))
print('F1-score %s' % f1_score(y_test, y_pred_adaboost_randomforest, average=None))
print(classification_report(y_test, y_pred_adaboost_randomforest))

Accuracy 0.7786068723650338
F1-score [0.13009709 0.8552573  0.65350456]
              precision    recall  f1-score   support

        high       0.52      0.07      0.13       900
         low       0.82      0.90      0.86     17155
      medium       0.69      0.62      0.65      8748

    accuracy                           0.78     26803
   macro avg       0.67      0.53      0.55     26803
weighted avg       0.77      0.78      0.77     26803



In [16]:
print(roc_auc_score(y_test, clf_adaboost_randomforest.predict_proba(X_test), multi_class="ovr", average="macro"))

0.8875571724661396


## Mode Class

## Random Forest

In [43]:
df_new_2= df.copy()

In [44]:
df_new_2['mode'].value_counts()

mode
1    56906
0    32435
Name: count, dtype: int64

In [45]:
X_mode = df_new_2.drop(columns=['mode']).values
y_mode = np.array(df_new_2['mode'])

In [46]:
X_train_mode, X_test_mode, y_train_mode, y_test_mode = train_test_split(
    X_mode, y_mode, test_size=0.3, random_state=100)

In [47]:
scl = StandardScaler()
X_train_mode = scl.fit_transform(X_train_mode)
X_test_mode = scl.transform(X_test_mode)

In [None]:
# classifier definition (assuming RandomForestClassifier)
clf_mode = RandomForestClassifier(random_state=0)

# define the parameter for grid search
param_grid = {
  'n_estimators': [10, 100, 200],
  'criterion': ["gini", "entropy", "log_loss"],
}

# Usdoing GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=clf_mode, param_grid=param_grid, cv=5)

In [None]:
grid_search.fit(X_train_mode, y_train_mode)

# get best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

Best Parameters: {'criterion': 'entropy', 'n_estimators': 200}
Best Score: 0.7333940808944881


In [50]:
clf_mode = RandomForestClassifier(random_state=42, n_estimators=200, criterion= 'entropy', max_depth = 30, min_samples_split = 2, min_samples_leaf = 1)
clf_mode.fit(X_train_mode, y_train_mode)

In [51]:
y_pred_mode = clf_mode.predict(X_test_mode)

print('Accuracy %s' % accuracy_score(y_test_mode, y_pred_mode))
print('F1-score %s' % f1_score(y_test_mode, y_pred_mode, average=None))
print(classification_report(y_test_mode, y_pred_mode))

Accuracy 0.7370816699623176
F1-score [0.54707886 0.81478172]
              precision    recall  f1-score   support

           0       0.73      0.44      0.55      9745
           1       0.74      0.91      0.81     17058

    accuracy                           0.74     26803
   macro avg       0.74      0.67      0.68     26803
weighted avg       0.74      0.74      0.72     26803



In [52]:
y_pred_proba_randomforest = clf_mode.predict_proba(X_test_mode)[:, 1]

# Calculate ROC AUC score (assuming binary classification)
roc_value_randomforest_mode = roc_auc_score(y_test_mode, y_pred_proba_randomforest)
print('ROC AUC:', roc_value_randomforest_mode)

ROC AUC: 0.7977957586650465


## Bagging

In [54]:
clf_bagging_mode = BaggingClassifier(estimator=None, n_estimators=200, random_state=0)
clf_bagging_mode.fit(X_train_mode, y_train_mode)

y_pred_bagging_mode = clf_bagging_mode.predict(X_test_mode)

print('Accuracy %s' % accuracy_score(y_test_mode, y_pred_bagging_mode))
print('F1-score %s' % f1_score(y_test_mode, y_pred_bagging_mode, average=None))
print(classification_report(y_test_mode, y_pred_bagging_mode))

Accuracy 0.7439092638883708
F1-score [0.59230221 0.81332608]
              precision    recall  f1-score   support

           0       0.70      0.51      0.59      9745
           1       0.76      0.88      0.81     17058

    accuracy                           0.74     26803
   macro avg       0.73      0.69      0.70     26803
weighted avg       0.74      0.74      0.73     26803



In [None]:
y_pred_proba_bagging_mode = clf_bagging_mode.predict_proba(X_test_mode)[:, 1]

# calculate ROC AUC score (assuming binary classification)
roc_value_bagging_mode = roc_auc_score(y_test_mode, y_pred_proba_bagging_mode)
print('ROC AUC:', roc_value_bagging_mode)

ROC AUC: 0.7988913928461018


In [56]:
# Bagging - SVC
clf_bagging_svc_mode = BaggingClassifier(estimator=SVC(C=1, tol=1), n_estimators=100, random_state=0)
clf_bagging_svc_mode.fit(X_train_mode, y_train_mode)

In [57]:
# Bagging - SVC
y_pred_bagging_svc_mode = clf_bagging_svc_mode.predict(X_test_mode)
print('Accuracy %s' % accuracy_score(y_test_mode, y_pred_bagging_svc_mode))
print('F1-score %s' % f1_score(y_test_mode, y_pred_bagging_svc_mode, average=None))
print(classification_report(y_test_mode, y_pred_bagging_svc_mode))

Accuracy 0.6718277804723352
F1-score [0.36279339 0.77900608]
              precision    recall  f1-score   support

           0       0.62      0.26      0.36      9745
           1       0.68      0.91      0.78     17058

    accuracy                           0.67     26803
   macro avg       0.65      0.58      0.57     26803
weighted avg       0.66      0.67      0.63     26803



In [None]:
y_pred_proba_bagging_svc_mode = clf_bagging_svc_mode.predict_proba(X_test_mode)[:, 1]

# calculate ROC AUC score (assuming binary classification)
roc_value_bagging_svc_mode = roc_auc_score(y_test_mode, y_pred_proba_bagging_svc_mode)
print('ROC AUC:', roc_value_bagging_svc_mode)

ROC AUC: 0.6539015892478268


In [59]:
# Bagging - Random Forest
clf_bagging_randomforest_mode = BaggingClassifier(estimator=RandomForestClassifier(n_estimators=100), n_estimators=100)
clf_bagging_randomforest_mode.fit(X_train_mode, y_train_mode)

In [60]:
# Bagging - Random Forest
y_pred_bagging_randomforest_mode = clf_bagging_randomforest_mode.predict(X_test_mode)
print('Accuracy %s' % accuracy_score(y_test_mode, y_pred_bagging_randomforest_mode))
print('F1-score %s' % f1_score(y_test_mode, y_pred_bagging_randomforest_mode, average=None))
print(classification_report(y_test_mode, y_pred_bagging_randomforest_mode))

Accuracy 0.7334253628325188
F1-score [0.52716564 0.81439148]
              precision    recall  f1-score   support

           0       0.74      0.41      0.53      9745
           1       0.73      0.92      0.81     17058

    accuracy                           0.73     26803
   macro avg       0.74      0.66      0.67     26803
weighted avg       0.74      0.73      0.71     26803



In [16]:
from sklearn.metrics import roc_auc_score
y_pred_proba = clf_bagging_randomforest_mode.predict_proba(X_test_mode)[:, 1]

# Calculate ROC AUC score (assuming binary classification)
roc_value = roc_auc_score(y_test_mode, y_pred_proba)
print('ROC AUC:', roc_value)

ROC AUC: 0.7928212236512244


## Adaboost

In [61]:
param_grid = {'n_estimators': [100, 200],  # Number of boosting stages
    'learning_rate': [0.1, 0.5, 1.0]  # Learning rate for each boosting stage
}

In [None]:
# create the AdaBoost classifier with a decision tree base estimator by default
ada_boost = AdaBoostClassifier()

# Usdoe GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=ada_boost, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_mode, y_train_mode)



In [63]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")

Best Parameters: {'learning_rate': 0.5, 'n_estimators': 200}


In [64]:
clf_adaboost_mode = AdaBoostClassifier(n_estimators=200,learning_rate=0.5)
clf_adaboost_mode.fit(X_train_mode, y_train_mode)



In [65]:
# Adaboost
y_pred_adaboost_mode = clf_adaboost_mode.predict(X_test_mode)
print('Accuracy %s' % accuracy_score(y_test_mode, y_pred_adaboost_mode))
print('F1-score %s' % f1_score(y_test_mode, y_pred_adaboost_mode, average=None))
print(classification_report(y_test_mode, y_pred_adaboost_mode))

Accuracy 0.6930567473790248
F1-score [0.46518884 0.7847631 ]
              precision    recall  f1-score   support

           0       0.63      0.37      0.47      9745
           1       0.71      0.88      0.78     17058

    accuracy                           0.69     26803
   macro avg       0.67      0.62      0.62     26803
weighted avg       0.68      0.69      0.67     26803



In [None]:
y_pred_proba_adaboost_mode = clf_adaboost_mode.predict_proba(X_test_mode)[:, 1]

# calculate ROC AUC score (assuming binary classification)
roc_value_adaboost_mode = roc_auc_score(y_test_mode, y_pred_proba_adaboost_mode)
print('ROC AUC:', roc_value_adaboost_mode)

ROC AUC: 0.714533769162657


In [67]:
# Adaboost - Random Forest
clf_adaboost_randomforest_mode = AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=200), n_estimators=200, random_state=0)
clf_adaboost_randomforest_mode.fit(X_train_mode, y_train_mode)



In [68]:
y_pred_adaboost_randomforest_mode = clf_adaboost_randomforest_mode.predict(X_test_mode)
print('Accuracy %s' % accuracy_score(y_test_mode, y_pred_adaboost_randomforest_mode))
print('F1-score %s' % f1_score(y_test_mode, y_pred_adaboost_randomforest_mode, average=None))
print(classification_report(y_test_mode, y_pred_adaboost_randomforest_mode))

Accuracy 0.7407752863485431
F1-score [0.55728304 0.81673349]
              precision    recall  f1-score   support

           0       0.74      0.45      0.56      9745
           1       0.74      0.91      0.82     17058

    accuracy                           0.74     26803
   macro avg       0.74      0.68      0.69     26803
weighted avg       0.74      0.74      0.72     26803



In [None]:
y_pred_proba_adaboost_randomforest_mode = clf_adaboost_randomforest_mode.predict_proba(X_test_mode)[:, 1]

# calculate ROC AUC score (assuming binary classification)
roc_value_adaboost_randomforest_mode = roc_auc_score(y_test_mode, y_pred_proba_adaboost_randomforest_mode)
print('ROC AUC:', roc_value_adaboost_randomforest_mode)

ROC AUC: 0.7966709571022019
