In [24]:
#importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
pd.set_option('display.max_columns', None)  # Show all columns in DataFrame output
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [17]:
# Read the IMDb CSV and discard the 'canHaveEpisodes' column in one chained
# expression, so the cell builds `df` from scratch every time it is run.
IMDB_CSV_PATH = r'D:\BOOKS\UNIPI\Sem 2\Data Mining 2\Project\imdb_3.csv'
df = pd.read_csv(IMDB_CSV_PATH).drop(columns=['canHaveEpisodes'])
df

Unnamed: 0,startYear,runtimeMinutes,totalCredits,numRegions,ratingCount,castNumber,companiesNumber,writerCredits,directorsCredits,totalNominations,totalMedia,totalReviews,Asia,Africa,Europe,North America,South America,Oceania,Continent Unknown,genre1,genre2,genre3,movie,short,tvEpisode,tvMiniSeries,tvMovie,tvSeries,tvShort,tvSpecial,video,videoGame,rating_bin
0,1894,1,4,7,2092,1,3,0,1,0,11,19,1,0,5,1,0,0,0,16787,16581,0,0,1,0,0,0,0,0,0,0,0,1
1,1892,12,2,6,183,0,0,0,1,0,5,1,1,0,5,0,0,0,0,17855,16787,0,0,1,0,0,0,0,0,0,0,0,0
2,1894,1,1,5,195,0,1,0,1,0,5,0,0,0,4,1,0,0,0,16787,0,0,0,1,0,0,0,0,0,0,0,0,0
3,1894,1,4,6,2238,1,6,0,1,1,9,22,1,0,4,1,0,0,0,16787,16581,0,0,1,0,0,0,0,0,0,0,0,0
4,1896,1,11,21,13115,6,5,0,2,0,33,82,2,0,14,2,1,0,2,16787,16581,0,0,1,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149526,1993,96,11,1,11,0,2,1,1,0,1,0,0,0,0,0,1,0,0,16581,0,0,1,0,0,0,0,0,0,0,0,0,2
149527,2019,14,52,1,15,6,3,1,1,3,10,1,0,0,1,0,0,0,0,51745,16787,0,0,1,0,0,0,0,0,0,0,0,1
149528,2019,29,32,0,12,1,0,0,0,0,5,0,0,0,0,0,0,0,0,5726,0,0,0,0,1,0,0,0,0,0,0,0,0
149529,2011,10,15,0,10,7,0,3,1,0,1,0,0,0,0,0,0,0,0,47408,17855,17763,0,0,1,0,0,0,0,0,0,0,2


# Splitting the data into train (80%) and test (20%) sets

In [18]:
# Target is the 'rating_bin' column; every remaining column is a feature.
y = df['rating_bin']
X = df.drop(columns=['rating_bin'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sampling the dataset for faster results

In [19]:
# Draw a 10% stratified sample of the data so the class proportions of `y`
# are preserved in the smaller set used for the grid searches.
# NOTE(review): the sample is taken from the full X / y, so it can contain rows
# that also belong to X_test of the main split; hyper-parameters tuned on it
# have indirectly seen part of that test set. Consider sampling from
# X_train / y_train instead (this changes the sample size).
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)

# With n_splits=1 the splitter yields exactly one (train_idx, test_idx) pair;
# the "test" side of that pair is the 10% kept as the sample.
train_idx, test_idx = next(stratified_split.split(X, y))
X_sampled = X.iloc[test_idx]
y_sampled = y.iloc[test_idx]

# Display the shape of the sampled data
print("Sampled X shape:", X_sampled.shape)
print("Sampled y shape:", y_sampled.shape)

# Split the sampled data into training and testing sets
X_train_sampled, X_test_sampled, y_train_sampled, y_test_sampled = train_test_split(
    X_sampled, y_sampled, test_size=0.2, random_state=42
)

# Standardize the sampled data with its own scaler. Re-fitting the shared
# `scaler` here would overwrite the statistics learned from the full training
# split, leaving `scaler` inconsistent with X_train_scaled / X_test_scaled.
sampled_scaler = StandardScaler()
X_train_sampled_scaled = sampled_scaler.fit_transform(X_train_sampled)
X_test_sampled_scaled = sampled_scaler.transform(X_test_sampled)

Sampled X shape: (14954, 32)
Sampled y shape: (14954,)


# K-Nearest Neighbour

In [20]:
%%time
# Baseline KNN (k=5) fitted on the standardized full training split
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Predict labels for the held-out test rows
y_pred = knn.predict(X_test_scaled)

# Report overall accuracy followed by the per-class metrics
knn_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", knn_accuracy)
knn_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", knn_report)

Accuracy: 0.31815294078309425

Classification Report:
               precision    recall  f1-score   support

           0       0.37      0.53      0.43      4914
           1       0.28      0.32      0.30      5437
           2       0.26      0.23      0.25      5320
           3       0.24      0.20      0.22      4189
           4       0.34      0.30      0.32      5393
           5       0.40      0.32      0.35      4654

    accuracy                           0.32     29907
   macro avg       0.32      0.32      0.31     29907
weighted avg       0.32      0.32      0.31     29907

CPU times: total: 18.9 s
Wall time: 11.2 s


# Grid search on the sampled data

In [None]:
%%time
# Define the parameter grid for KNN
param_grid = {
    # 122 is roughly sqrt(14954), the row count of the whole sample. Note that
    # the search below is fitted on X_train_sampled_scaled, i.e. the 80%
    # training part of that sample, and each CV fold trains on less still.
    'n_neighbors': [3, 7, 11, 21, 43, 122],
    'weights': ['uniform', 'distance'],
    # 'minkowski' is intentionally absent: with the default p=2 it is the same
    # distance as 'euclidean', so it only duplicated every euclidean candidate
    # and added a third more fits without changing the result.
    'metric': ['euclidean', 'manhattan']
}

# Initialize the GridSearchCV object.
# A fresh estimator is used so the search does not depend on how `knn` happens
# to be configured in the kernel; every tuned parameter comes from the grid.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit the grid search to the sampled training data
grid_search.fit(X_train_sampled_scaled, y_train_sampled)

# Display the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Use the best estimator (refit on the sampled training data) to make predictions
best_knn = grid_search.best_estimator_
y_pred_best = best_knn.predict(X_test_sampled_scaled)

# Evaluate the optimized model on the sampled test split
print("Optimized KNN Accuracy:", accuracy_score(y_test_sampled, y_pred_best))
print("\nOptimized KNN Classification Report:\n", classification_report(y_test_sampled, y_pred_best))



Best Parameters: {'metric': 'manhattan', 'n_neighbors': 122, 'weights': 'distance'}
Best Cross-Validation Accuracy: 0.3015960011572214
Optimized KNN Accuracy: 0.31093279839518556

Optimized KNN Classification Report:
               precision    recall  f1-score   support

           0       0.37      0.61      0.46       494
           1       0.31      0.29      0.30       536
           2       0.24      0.08      0.12       566
           3       0.22      0.09      0.12       391
           4       0.32      0.40      0.36       539
           5       0.27      0.39      0.32       465

    accuracy                           0.31      2991
   macro avg       0.29      0.31      0.28      2991
weighted avg       0.29      0.31      0.28      2991

CPU times: total: 2.56 s
Wall time: 37.3 s


In [33]:
%%time
# KNN on the full training split with k=122, Manhattan distance and
# distance-weighted votes
knn = KNeighborsClassifier(
    n_neighbors=122,
    metric='manhattan',
    weights='distance',
).fit(X_train_scaled, y_train)

# Predict labels for the held-out test rows
y_pred = knn.predict(X_test_scaled)

# Report overall accuracy followed by the per-class metrics
tuned_knn_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", tuned_knn_accuracy)
tuned_knn_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", tuned_knn_report)

Accuracy: 0.3577088975825058

Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.59      0.49      4914
           1       0.34      0.33      0.34      5437
           2       0.30      0.19      0.23      5320
           3       0.32      0.13      0.18      4189
           4       0.34      0.42      0.38      5393
           5       0.36      0.47      0.41      4654

    accuracy                           0.36     29907
   macro avg       0.35      0.35      0.34     29907
weighted avg       0.35      0.36      0.34     29907

CPU times: total: 1min 54s
Wall time: 1min 6s


# Decision Tree

In [25]:
%%time
# Baseline decision tree with default hyper-parameters and a fixed seed,
# fitted on the standardized full training split
dtc = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predict labels for the held-out test rows
y_pred = dtc.predict(X_test_scaled)

# Report overall accuracy followed by the per-class metrics
dtc_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", dtc_accuracy)
dtc_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", dtc_report)

Accuracy: 0.31377269535560237

Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.42      0.42      4914
           1       0.30      0.31      0.30      5437
           2       0.25      0.25      0.25      5320
           3       0.23      0.23      0.23      4189
           4       0.32      0.31      0.31      5393
           5       0.37      0.36      0.36      4654

    accuracy                           0.31     29907
   macro avg       0.31      0.31      0.31     29907
weighted avg       0.31      0.31      0.31     29907

CPU times: total: 1.42 s
Wall time: 1.47 s


In [29]:
%%time

# Hyper-parameter candidates explored for the decision tree
param_grid_dtc = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 5, 10, 15, 20],
    min_samples_leaf=[1, 2, 4, 8, 16],
)

# 5-fold cross-validated search scored on accuracy, using all available cores
grid_search_dtc = GridSearchCV(
    estimator=dtc,
    param_grid=param_grid_dtc,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Run the search on the sampled (scaled) training data
grid_search_dtc.fit(X_train_sampled_scaled, y_train_sampled)

# Show the winning configuration and its cross-validation score
print("Best Parameters:", grid_search_dtc.best_params_)
print("Best Cross-Validation Accuracy:", grid_search_dtc.best_score_)

# Predict the sampled test split with the best estimator found by the search
best_dtc = grid_search_dtc.best_estimator_
y_pred_best_dtc = best_dtc.predict(X_test_sampled_scaled)

# Score the optimized model on the sampled test split
best_dtc_accuracy = accuracy_score(y_test_sampled, y_pred_best_dtc)
print("Optimized Decision Tree Accuracy:", best_dtc_accuracy)
best_dtc_report = classification_report(y_test_sampled, y_pred_best_dtc)
print("\nOptimized Decision Tree Classification Report:\n", best_dtc_report)

Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'splitter': 'random'}
Best Cross-Validation Accuracy: 0.2966646028620265
Optimized Decision Tree Accuracy: 0.26947509194249414

Optimized Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.35      0.45      0.39       494
           1       0.26      0.26      0.26       536
           2       0.21      0.17      0.19       566
           3       0.19      0.11      0.14       391
           4       0.26      0.33      0.29       539
           5       0.29      0.27      0.28       465

    accuracy                           0.27      2991
   macro avg       0.26      0.27      0.26      2991
weighted avg       0.26      0.27      0.26      2991

CPU times: total: 7.92 s
Wall time: 2min 7s


In [28]:
# Decision tree settings applied to the full training split
tuned_tree_params = {
    'criterion': 'gini',
    'max_depth': 10,
    'min_samples_leaf': 4,
    'min_samples_split': 10,
    'splitter': 'random',
}

# Build the classifier with a fixed seed and fit it on the standardized training data
dtc = DecisionTreeClassifier(random_state=42, **tuned_tree_params)
dtc.fit(X_train_scaled, y_train)

# Predict labels for the held-out test rows
y_pred = dtc.predict(X_test_scaled)

# Report overall accuracy followed by the per-class metrics
tuned_dtc_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", tuned_dtc_accuracy)
tuned_dtc_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", tuned_dtc_report)

Accuracy: 0.3193566723509546

Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.51      0.46      4914
           1       0.33      0.27      0.30      5437
           2       0.26      0.13      0.17      5320
           3       0.28      0.04      0.08      4189
           4       0.29      0.51      0.37      5393
           5       0.30      0.42      0.35      4654

    accuracy                           0.32     29907
   macro avg       0.31      0.31      0.29     29907
weighted avg       0.31      0.32      0.29     29907

