In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from collections import defaultdict

In [2]:
df = pd.read_csv('df_without_outliers.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)

In [3]:
df_new = df.copy()

In [4]:
from sklearn.model_selection import train_test_split, cross_val_score 

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

## Popularity Class

In [5]:
def classify_popularity(popularity):
    if popularity <= 40:
        return 'low'
    elif 40 < popularity <= 70:
        return 'medium'
    else:
        return 'high'

df_new['popularity_class'] = df_new['popularity'].apply(classify_popularity)

In [6]:
df_new.drop(columns=['popularity'], inplace=True)

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [8]:
X = df_new.drop(columns=['popularity_class']).values
y = np.array(df_new['popularity_class'])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)

In [10]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# classifier definition (assuming RandomForestClassifier)
clf = RandomForestClassifier(random_state=0)

# define parameter grid
param_grid = {
  'n_estimators': [10, 100, 200],
  'criterion': ["gini", "entropy", "log_loss"],
}

# do GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)

In [None]:
grid_search.fit(X_train, y_train)

# get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

Best Parameters: {'criterion': 'entropy', 'n_estimators': 200}
Best Score: 0.7884806868243326


In [24]:
clf = RandomForestClassifier(random_state=42, n_estimators=200, criterion= 'entropy', max_depth = 30, min_samples_split = 2, min_samples_leaf = 1)
clf.fit(X_train, y_train)

In [25]:
y_pred = clf.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.7898369585494162
F1-score [0.22057461 0.86146569 0.68085595]
              precision    recall  f1-score   support

        high       0.66      0.13      0.22       900
         low       0.84      0.88      0.86     17155
      medium       0.68      0.68      0.68      8748

    accuracy                           0.79     26803
   macro avg       0.73      0.56      0.59     26803
weighted avg       0.78      0.79      0.78     26803



## Mode Class

In [26]:
df_new_2= df.copy()

In [27]:
df_new_2['mode'].value_counts()

mode
1    56906
0    32435
Name: count, dtype: int64

In [28]:
X_mode = df_new_2.drop(columns=['mode']).values
y_mode = np.array(df_new_2['mode'])

In [29]:
X_train_mode, X_test_mode, y_train_mode, y_test_mode = train_test_split(
    X_mode, y_mode, test_size=0.3, random_state=100)

In [30]:
scl = StandardScaler()
X_train_mode = scl.fit_transform(X_train_mode)
X_test_mode = scl.transform(X_test_mode)

In [None]:
# classifier definition (assuming RandomForestClassifier)
clf_mode = RandomForestClassifier(random_state=0)

# define parameter grid
param_grid = {
  'n_estimators': [10, 100, 200],
  'criterion': ["gini", "entropy", "log_loss"],
}

# do GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=clf_mode, param_grid=param_grid, cv=5)

In [None]:
# fit the model for grid search
grid_search.fit(X_train_mode, y_train_mode)

# get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

Best Parameters: {'criterion': 'entropy', 'n_estimators': 200}
Best Score: 0.7346253325512194


In [34]:
clf_mode = RandomForestClassifier(random_state=42, n_estimators=200, criterion= 'entropy', max_depth = 30, min_samples_split = 2, min_samples_leaf = 1)
clf_mode.fit(X_train_mode, y_train_mode)

In [36]:
y_pred_mode = clf_mode.predict(X_test_mode)

print('Accuracy %s' % accuracy_score(y_test_mode, y_pred_mode))
print('F1-score %s' % f1_score(y_test_mode, y_pred_mode, average=None))
print(classification_report(y_test_mode, y_pred_mode))

Accuracy 0.7380144013729807
F1-score [0.5492361  0.81534659]
              precision    recall  f1-score   support

           0       0.73      0.44      0.55      9745
           1       0.74      0.91      0.82     17058

    accuracy                           0.74     26803
   macro avg       0.74      0.67      0.68     26803
weighted avg       0.74      0.74      0.72     26803

