In [1]:
from support.df_init import init

df = init()

In [2]:
df['awful_rating'] = ((df['averageRating'] >= 0) & (df['averageRating'] <= 4)).astype(int)
# df['low_rating'] = ((df['averageRating'] > 4) & (df['averageRating'] <= 5)).astype(int)
df['medium_rating'] = ((df['averageRating'] > 4) & (df['averageRating'] <= 7)).astype(int)
df['high_rating'] = ((df['averageRating'] > 7) & (df['averageRating'] <= 10)).astype(int)

df[['awful_rating', 'medium_rating', 'high_rating']]

Unnamed: 0,awful_rating,medium_rating,high_rating
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,0,1
...,...,...,...
149526,0,1,0
149527,0,1,0
149528,0,1,0
149529,0,0,1


In [3]:
rating_features = ['awful_rating', 'medium_rating', 'high_rating', ]
percentages = (df[rating_features].sum() / len(df)) * 100
print(percentages)

awful_rating      3.468846
medium_rating    46.439869
high_rating      50.091285
dtype: float64


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# Splitting the dataset into training and testing sets while maintaining the same ratio of rating features

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df[rating_features]):
    train_set = df.loc[train_index]
    test_set = df.loc[test_index]
train_set, test_set = train_test_split(df, test_size=0.3, random_state=42)
train_set.to_csv("dm2_dataset_2425_imdb/imbalanced_train_70.csv", index=False)
test_set.to_csv("dm2_dataset_2425_imdb/imbalanced_test_30.csv", index=False)

In [5]:
train_set

Unnamed: 0,originalTitle,rating,startYear,endYear,runtimeMinutes,awardWins,numVotes,worstRating,bestRating,totalImages,...,countryOfOrigin_AF,countryOfOrigin_AS,countryOfOrigin_EU,countryOfOrigin_OC,countryOfOrigin_SA,countryOfOrigin_UNK,reviewsTotal,awful_rating,medium_rating,high_rating
35104,Wild Force,"(4, 5]",1986,1986.0,84.0,0,21,1,10,4,...,0,1,0,0,0,0,4,0,1,0
90639,Ture Sventon och jakten på ungdomens källa,"(7, 8]",2021,,,0,66,1,10,11,...,0,0,1,0,0,0,0,0,0,1
9488,Kaala Sona,"(6, 7]",1975,1975.0,137.0,0,93,1,10,14,...,0,1,0,0,0,0,4,0,1,0
36627,Deu Veado na Cabeça,"(3, 4]",1982,1982.0,90.0,0,34,1,10,1,...,0,0,0,0,1,0,0,1,0,0
133214,10 Homes That Changed America,"(7, 8]",2016,2016.0,56.0,0,41,1,10,3,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,September Fools,"(8, 9]",2013,2013.0,22.0,0,114,1,10,3,...,0,0,0,0,0,0,1,0,0,1
103694,Hiphop Hamlet,"(7, 8]",2010,,30.0,0,6,1,10,1,...,0,0,1,0,0,0,0,0,0,1
131932,Episode #1.12,"(7, 8]",2013,2013.0,58.0,0,32,1,10,2,...,0,1,0,0,0,0,0,0,0,1
146867,2019 Golden Globe Awards,"(5, 6]",2019,2019.0,145.0,1,487,1,10,623,...,0,0,0,0,0,0,10,0,1,0


In [6]:
# TODO: add outlier detection+handling

from support.imputation import impute_data

train_set, test_set = impute_data(train=train_set, test=test_set)

In [7]:
from support.embedding import embedding

train_set, test_set = embedding(train=train_set, test=test_set)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from support.transformations import apply_transformations

train_set, test_set = apply_transformations(train=train_set, test=test_set)

In [9]:
from support.scaling import scale_data

train_set, test_set = scale_data(train=train_set, test=test_set)

In [10]:
feats = [
    'startYear', 'runtimeMinutes',
    'totalCredits', 'numRegions', 'ratingCount',
    'castNumber', 'companiesNumber', 'writerCredits',
    'directorsCredits', 'quotesTotal', 'totalMedia',
    'totalNominations', 'regions_freq_enc', 'regions_EU',
    'regions_NA', 'regions_AS', 'regions_AF', 'regions_OC', 'regions_SA',
    'regions_UNK', 'countryOfOrigin_freq_enc', 'countryOfOrigin_NA',
    'countryOfOrigin_AF', 'countryOfOrigin_AS', 'countryOfOrigin_EU',
    'countryOfOrigin_OC', 'countryOfOrigin_SA', 'countryOfOrigin_UNK',
    'reviewsTotal',
    'genre_Action', 'genre_Adult', 'genre_Adventure', 'genre_Animation',
    'genre_Biography', 'genre_Comedy', 'genre_Crime', 'genre_Documentary',
    'genre_Drama', 'genre_Family', 'genre_Fantasy', 'genre_Film-Noir',
    'genre_Game-Show', 'genre_History', 'genre_Horror', 'genre_Music',
    'genre_Musical', 'genre_Mystery', 'genre_News', 'genre_Reality-TV',
    'genre_Romance', 'genre_Sci-Fi', 'genre_Short', 'genre_Sport',
    'genre_Talk-Show', 'genre_Thriller', 'genre_War', 'genre_Western',
    'titleType_movie', 'titleType_short', 'titleType_tvEpisode',
    'titleType_tvMiniSeries', 'titleType_tvMovie', 'titleType_tvSeries',
    'titleType_tvShort', 'titleType_tvSpecial', 'titleType_video',
    'titleType_videoGame'
]

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

def train_decision_tree(X_train, y_train, X_test, y_test):
    """
    Train a Decision Tree Classifier and evaluate its performance.

    Parameters:
    - X_train: Training features (DataFrame or array-like)
    - y_train: Training target (DataFrame or array-like)
    - X_test: Testing features (DataFrame or array-like)
    - y_test: Testing target (DataFrame or array-like)

    Returns:
    - clf: Trained Decision Tree Classifier
    - accuracy: Accuracy of the model on the test set
    - report: Classification report as a string
    """
    # Initialize and train the Decision Tree Classifier
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test.values.argmax(axis=1), y_pred.argmax(axis=1))
    report = classification_report(y_test.values.argmax(axis=1), y_pred.argmax(axis=1))

    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)

    return clf, accuracy, report

# Example usage:
train_decision_tree(train_set[feats],   
                    train_set[rating_features],
                    test_set[feats],
                    test_set[rating_features])

Accuracy: 0.6440258582255908
Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.22      0.21      1585
           1       0.63      0.63      0.63     20663
           2       0.69      0.68      0.69     22612

    accuracy                           0.64     44860
   macro avg       0.51      0.51      0.51     44860
weighted avg       0.65      0.64      0.64     44860



(DecisionTreeClassifier(random_state=42),
 0.6440258582255908,
 '              precision    recall  f1-score   support\n\n           0       0.20      0.22      0.21      1585\n           1       0.63      0.63      0.63     20663\n           2       0.69      0.68      0.69     22612\n\n    accuracy                           0.64     44860\n   macro avg       0.51      0.51      0.51     44860\nweighted avg       0.65      0.64      0.64     44860\n')

In [12]:
from sklearn.utils import resample
import pandas as pd

# Separate the classes
awful_rating_1 = train_set[train_set['awful_rating'] == 1]

# Determine the size of the least present class
min_class_size = len(awful_rating_1)

# Downsample the other classes to match the size of the least present class
medium_rating = train_set[train_set['medium_rating'] == 1]
high_rating = train_set[train_set['high_rating'] == 1]

medium_rating_downsampled = resample(medium_rating,
                                     replace=False,
                                     n_samples=min_class_size,
                                     random_state=42)

high_rating_downsampled = resample(high_rating,
                                   replace=False,
                                   n_samples=min_class_size,
                                   random_state=42)

# Combine all classes into a balanced dataset
balanced_train_set = pd.concat([awful_rating_1, medium_rating_downsampled, high_rating_downsampled])

# Define features and target
X_balanced = balanced_train_set[feats]
y_balanced = balanced_train_set[rating_features]

train_decision_tree(X_balanced, y_balanced, test_set[feats], test_set[rating_features])

Accuracy: 0.5051716451181454
Classification Report:
               precision    recall  f1-score   support

           0       0.09      0.59      0.16      1585
           1       0.57      0.45      0.50     20663
           2       0.67      0.55      0.61     22612

    accuracy                           0.51     44860
   macro avg       0.44      0.53      0.42     44860
weighted avg       0.60      0.51      0.54     44860



(DecisionTreeClassifier(random_state=42),
 0.5051716451181454,
 '              precision    recall  f1-score   support\n\n           0       0.09      0.59      0.16      1585\n           1       0.57      0.45      0.50     20663\n           2       0.67      0.55      0.61     22612\n\n    accuracy                           0.51     44860\n   macro avg       0.44      0.53      0.42     44860\nweighted avg       0.60      0.51      0.54     44860\n')

In [13]:
import numpy as np

conditions = [
    train_set['averageRating'] <= 4,
    train_set['averageRating'] <= 8
]
choices = ['awful', 'medium']

train_set['ratingQuality'] = np.select(conditions, choices, default='high')

conditions = [
    test_set['averageRating'] <= 4,
    test_set['averageRating'] <= 8
]
test_set['ratingQuality'] = np.select(conditions, choices, default='high')

# ENN

In [14]:
from imblearn.under_sampling import EditedNearestNeighbours

# Define the ENN undersampler
enn = EditedNearestNeighbours(n_neighbors=70, kind_sel='mode')

# Convert the target column 'ratingQuality' to numeric values
rating_quality_mapping = {'awful': 0, 'medium': 1, 'high': 2}
train_set['ratingQualityNumeric'] = train_set['ratingQuality'].map(rating_quality_mapping)

# Apply EditedNearestNeighbours with the numeric target column
X_resampled, y_resampled = enn.fit_resample(
    X=train_set[feats + rating_features], y=train_set['ratingQualityNumeric'])

# Display the resampled data
print("Resampled X shape:", X_resampled.shape)
print("Resampled y shape:", y_resampled.shape)
(
    len(X_resampled.loc[X_resampled['awful_rating'] == 1]),
    len(X_resampled.loc[X_resampled['medium_rating'] == 1]),
    len(X_resampled.loc[X_resampled['high_rating'] == 1])
)

Resampled X shape: (87238, 70)
Resampled y shape: (87238,)


(3602, 48661, 34975)

In [15]:
train_decision_tree(
    X_resampled[feats], X_resampled[rating_features],
    test_set[feats], test_set[rating_features]
)

Accuracy: 0.6225590726705306
Classification Report:
               precision    recall  f1-score   support

           0       0.18      0.22      0.20      1585
           1       0.60      0.68      0.63     20663
           2       0.70      0.60      0.65     22612

    accuracy                           0.62     44860
   macro avg       0.49      0.50      0.49     44860
weighted avg       0.63      0.62      0.62     44860



(DecisionTreeClassifier(random_state=42),
 0.6225590726705306,
 '              precision    recall  f1-score   support\n\n           0       0.18      0.22      0.20      1585\n           1       0.60      0.68      0.63     20663\n           2       0.70      0.60      0.65     22612\n\n    accuracy                           0.62     44860\n   macro avg       0.49      0.50      0.49     44860\nweighted avg       0.63      0.62      0.62     44860\n')

# AllKNN

In [16]:
from imblearn.under_sampling import AllKNN

allknn = AllKNN(n_neighbors=5, kind_sel='mode')

X_resampled, y_resampled = allknn.fit_resample(
    X=train_set[feats + rating_features],
    y=train_set['ratingQualityNumeric']
)

# Display the resampled data
print("Resampled X shape:", X_resampled.shape)
print("Resampled y shape:", y_resampled.shape)
(
    len(X_resampled.loc[X_resampled['awful_rating'] == 1]),
    len(X_resampled.loc[X_resampled['medium_rating'] == 1]),
    len(X_resampled.loc[X_resampled['high_rating'] == 1])
)

Resampled X shape: (77383, 70)
Resampled y shape: (77383,)


(3602, 45599, 28182)

In [17]:
train_decision_tree(
    X_resampled[feats], X_resampled[rating_features],
    test_set[feats], test_set[rating_features]
)

Accuracy: 0.6108782880071333
Classification Report:
               precision    recall  f1-score   support

           0       0.18      0.25      0.21      1585
           1       0.58      0.70      0.63     20663
           2       0.70      0.56      0.62     22612

    accuracy                           0.61     44860
   macro avg       0.49      0.50      0.49     44860
weighted avg       0.63      0.61      0.61     44860



(DecisionTreeClassifier(random_state=42),
 0.6108782880071333,
 '              precision    recall  f1-score   support\n\n           0       0.18      0.25      0.21      1585\n           1       0.58      0.70      0.63     20663\n           2       0.70      0.56      0.62     22612\n\n    accuracy                           0.61     44860\n   macro avg       0.49      0.50      0.49     44860\nweighted avg       0.63      0.61      0.61     44860\n')

# Balance+ENN

In [18]:
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.pipeline import Pipeline

# Step 1: Balance the classes with RandomUnderSampler
min_class_size = train_set['ratingQuality'].value_counts().min()
sampling_strategy = {
    label: min_class_size for label in train_set['ratingQualityNumeric'].unique()
}

# Step 2: Create pipeline: balance first, then ENN
pipeline = Pipeline([
    ('undersample', RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)),
    ('enn', EditedNearestNeighbours(n_neighbors=20, kind_sel='mode'))
])

# Step 3: Apply the pipeline
X_resampled, y_resampled = pipeline.fit_resample(
    X=train_set[feats + rating_features],
    y=train_set['ratingQualityNumeric']
)

# Step 4: Count class samples after resampling
print("Resampled X shape:", X_resampled.shape)
print("Resampled y shape:", y_resampled.shape)

# Step 5: If you still have one-hot labels in X_resampled
class_counts = (
    len(X_resampled.loc[X_resampled['awful_rating'] == 1]),
    len(X_resampled.loc[X_resampled['medium_rating'] == 1]),
    len(X_resampled.loc[X_resampled['high_rating'] == 1])
)
print("Class counts after resampling:", class_counts)


Resampled X shape: (7693, 70)
Resampled y shape: (7693,)
Class counts after resampling: (3602, 1064, 3027)


In [19]:
train_decision_tree(
    X_resampled[feats], X_resampled[rating_features],
    test_set[feats], test_set[rating_features]
)

Accuracy: 0.4606776638430673
Classification Report:
               precision    recall  f1-score   support

           0       0.08      0.76      0.14      1585
           1       0.60      0.26      0.37     20663
           2       0.68      0.62      0.65     22612

    accuracy                           0.46     44860
   macro avg       0.45      0.55      0.39     44860
weighted avg       0.62      0.46      0.50     44860



(DecisionTreeClassifier(random_state=42),
 0.4606776638430673,
 '              precision    recall  f1-score   support\n\n           0       0.08      0.76      0.14      1585\n           1       0.60      0.26      0.37     20663\n           2       0.68      0.62      0.65     22612\n\n    accuracy                           0.46     44860\n   macro avg       0.45      0.55      0.39     44860\nweighted avg       0.62      0.46      0.50     44860\n')

In [20]:
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.pipeline import Pipeline

# Step 1: Balance the classes with RandomUnderSampler
min_class_size = train_set['ratingQuality'].value_counts().min()
sampling_strategy = {
    label: min_class_size for label in train_set['ratingQualityNumeric'].unique()
}

# Step 2: Create pipeline: balance first, then ENN
pipeline = Pipeline([
    ('enn', EditedNearestNeighbours(n_neighbors=100, kind_sel='mode')),
    ('undersample', RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)),
])

# Step 3: Apply the pipeline
X_resampled, y_resampled = pipeline.fit_resample(
    X=train_set[feats + rating_features],
    y=train_set['ratingQualityNumeric']
)

# Step 4: Count class samples after resampling
print("Resampled X shape:", X_resampled.shape)
print("Resampled y shape:", y_resampled.shape)

# Step 5: If you still have one-hot labels in X_resampled
class_counts = (
    len(X_resampled.loc[X_resampled['awful_rating'] == 1]),
    len(X_resampled.loc[X_resampled['medium_rating'] == 1]),
    len(X_resampled.loc[X_resampled['high_rating'] == 1])
)
print("Class counts after resampling:", class_counts)


ValueError: With under-sampling methods, the number of samples in a class should be less or equal to the original number of samples. Originally, there is 1664 samples and 3602 samples are asked.

In [None]:
train_decision_tree(
    X_resampled[feats], X_resampled[rating_features],
    test_set[feats], test_set[rating_features]
)

Accuracy: 0.5327240303165404
Classification Report:
               precision    recall  f1-score   support

           0       0.09      0.68      0.15      1585
           1       0.84      0.56      0.67     35320
           2       0.34      0.36      0.35      7955

    accuracy                           0.53     44860
   macro avg       0.42      0.54      0.39     44860
weighted avg       0.72      0.53      0.60     44860



(DecisionTreeClassifier(random_state=42),
 0.5327240303165404,
 '              precision    recall  f1-score   support\n\n           0       0.09      0.68      0.15      1585\n           1       0.84      0.56      0.67     35320\n           2       0.34      0.36      0.35      7955\n\n    accuracy                           0.53     44860\n   macro avg       0.42      0.54      0.39     44860\nweighted avg       0.72      0.53      0.60     44860\n')

# DBSCAN+k-Means

In [21]:
from sklearn.cluster import DBSCAN, KMeans
import numpy as np
import pandas as pd

# Step 1: Apply DBSCAN to eliminate noise points
dbscan = DBSCAN(eps=3, min_samples=3)
dbscan_labels = dbscan.fit_predict(train_set[feats])

# Filter out noise points (label -1 indicates noise in DBSCAN)
X_dbscan_filtered = train_set[dbscan_labels != -1][feats+rating_features]
y_dbscan_filtered = train_set[dbscan_labels != -1]['ratingQualityNumeric']

# Step 2: Apply k-Means to oversample by keeping medoids on larger classes
undersampled_data = []
undersampled_labels = []

# Get the min class size
min_class_size = y_dbscan_filtered.value_counts().min()

for label in np.unique(y_dbscan_filtered):
    # Extract data for the current class
    class_data = X_dbscan_filtered[y_dbscan_filtered == label]
    
    # Apply k-Means clustering
    kmeans = KMeans(n_clusters=min_class_size, random_state=42)
    kmeans.fit(class_data)
    
    # Keep medoids (cluster centers)
    undersampled_data.append(kmeans.cluster_centers_)
    undersampled_labels.extend([label] * min_class_size)

# Combine oversampled data and labels
X_undersampled = pd.DataFrame(np.vstack(undersampled_data), columns=feats+rating_features)
y_undersampled = pd.Series(undersampled_labels)

# Display the oversampled data
print("Undersampled X shape:", X_undersampled.shape)
print("Undersampled y shape:", y_undersampled.shape)

class_counts = y_undersampled.value_counts()
print("Class counts after resampling:", class_counts)

  return fit_method(estimator, *args, **kwargs)


Undersampled X shape: (8397, 70)
Undersampled y shape: (8397,)
Class counts after resampling: 0    2799
1    2799
2    2799
Name: count, dtype: int64


In [22]:
train_decision_tree(
    X_undersampled[feats], X_undersampled[rating_features],
    test_set[feats], test_set[rating_features]
)

ValueError: Unknown label type: continuous-multioutput. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.