# Table of contents
1. [Libraries](#libraries)
2. [Numerical columns](#numerical)
3. [Categorical columns](#categorical)



<h1 id = "libraries"> 1. Libraries </h1>

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
from scipy.stats import sem

from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


In [3]:
raw_data = pd.read_csv('../Data/retyped_data.csv')

<div id = "numerical"> <h1>Evalutation </h1> </div>

#### This Section is dedicated to evaluate imputation using similarity strategy

+ Function to fill missing using similarity strategy

In [4]:

batch_size = 32
start = 0
end = 32

# Define the function to calculate similarities
def calculate_similarities(ratings, batch_start, batch_end):
    # Select the batch of users
    batch_ratings = ratings[batch_start:batch_end]
    
    # Calculate the absolute difference between the batch and all users
    abs_diff = np.abs(ratings - batch_ratings.reshape(batch_end - batch_start, 1, ratings.shape[1]))
    
    # Calculate the mean absolute difference across movies, ignoring NaN values
    mean_diff = np.nanmean(abs_diff, axis=2)
    
    # Compute similarity as the inverse of the mean absolute difference
    similarities = 1 / (mean_diff + 0.001)  # Adding a small epsilon to avoid division by zero
    similarities[np.isnan(similarities)] = 0
    return similarities

def fill_missing(data, batch_size = 32):
    n_movies = data.shape[0]
    filled_ratings = np.empty_like(data)
    num_batches = int(np.ceil(n_movies / batch_size))

    for i in range(num_batches):
        start = i * batch_size
        end = min((i + 1) * batch_size, n_movies)

        similarities = calculate_similarities(data, start, end)
        
        weights = ~np.isnan(data) * similarities.reshape(end - start, -1, 1)
        weights /= weights.sum(axis=1, keepdims=True)

        filled_ratings[start:end] = np.nansum(data * weights, axis=1)

    return filled_ratings



+ Evaluate each columns

In [5]:
columns = ['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']

test_size = 0.3

#Evalue each column seperatly
for test_col in columns:
    #Get a copy of data but remove all null value for testing
    raw_data_copy = raw_data.copy()
    raw_data_copy.dropna(inplace=True)
    raw_data_copy.reset_index(drop=True, inplace=True)

    #Get sample with size of 30%
    test_rows = raw_data_copy.sample(frac=test_size, random_state=42).index

    #Get y true value from the dataset
    y_test = raw_data_copy.loc[test_rows, test_col].copy()

    #Assign it's as nan value for imputing
    raw_data_copy.loc[test_rows, test_col] = np.nan

    #Perform imputing missing value using similarity
    tmp_data = raw_data_copy.copy()
    tmp_data['id'] = tmp_data.index
    tmp_data['Meta UserScore'] = tmp_data['Meta UserScore'] * 10

    tmp_data = tmp_data[['id','Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']].to_numpy()

    filled_ratings = fill_missing(tmp_data)
    filled_nanvals = filled_ratings[np.isnan(tmp_data)]

    tmp_data[np.isnan(tmp_data)] = filled_nanvals

    filled_df = pd.DataFrame(
        filled_ratings[:, 1:],
        columns=['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']
    )

    filled_df['Meta UserScore'] /= 10
    tmp_data_2 = raw_data_copy.copy()

    for col in filled_df.columns:
        tmp_data_2[col].fillna(filled_df[col], inplace=True)

    #Get y predicted (column after imputing)
    y_pred = tmp_data_2.loc[test_rows, test_col]

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'Sumary of {test_col}: ')
    print('Mean absolute error: ', mae)
    print('Mean squared error: ', mse)
    print('R2 score: ', r2)
    print("--------\n")

Sumary of Tomatoes CriticScore: 
Mean absolute error:  19.754701330716717
Mean squared error:  525.6943104026549
R2 score:  0.31772289189241265
--------

Sumary of Tomatoes UserScore: 
Mean absolute error:  14.239382392822442
Mean squared error:  289.0440282119497
R2 score:  0.3071021599944561
--------

Sumary of Metascore: 
Mean absolute error:  12.497650376541852
Mean squared error:  232.4710433204248
R2 score:  0.3416715140083487
--------

Sumary of Meta UserScore: 
Mean absolute error:  0.8212265709625988
Mean squared error:  1.084668638219071
R2 score:  0.29630987159548206
--------



The Loss seem too high, suggest using other imputation strategy (KNN, Decision Tree)

#### This Section is dedicated to evaluate imputation using KNN strategy

In [8]:

# Prepare the data (numeric columns only)
columns = ['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']
test_size = 0.3

for test_col in columns:
    # Step 1: Prepare the test dataset
    raw_data_copy = raw_data.copy()
    raw_data_copy.dropna(inplace=True)
    raw_data_copy.reset_index(drop=True, inplace=True)

    # Select test rows (30% of the data)
    test_rows = raw_data_copy.sample(frac=test_size, random_state=42).index
    y_test = raw_data_copy.loc[test_rows, test_col].copy()

    # Mask test column values (set them to NaN for imputation)
    raw_data_copy.loc[test_rows, test_col] = np.nan

    # Step 2: Apply KNN Imputation
    knn_imputer = KNNImputer(n_neighbors=20, weights="uniform")
    imputed_data = knn_imputer.fit_transform(raw_data_copy[columns])

    # Reconstruct the imputed DataFrame
    imputed_df = pd.DataFrame(imputed_data, columns=columns)

    # Step 3: Evaluate the imputed values
    y_pred = imputed_df.loc[test_rows, test_col]

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'Summary of {test_col}:')
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('R² Score:', r2)
    print('--------\n')

Summary of Tomatoes CriticScore:
Mean Absolute Error: 7.434468937875752
Mean Squared Error: 102.7994488977956
R² Score: 0.8665808069040736
--------

Summary of Tomatoes UserScore:
Mean Absolute Error: 10.939478957915831
Mean Squared Error: 188.24716933867734
R² Score: 0.548732911630053
--------

Summary of Metascore:
Mean Absolute Error: 5.980160320641283
Mean Squared Error: 61.349383767535066
R² Score: 0.8262663325490669
--------

Summary of Meta UserScore:
Mean Absolute Error: 0.5763476953907815
Mean Squared Error: 0.6138563877755512
R² Score: 0.60175424538414
--------



The loss seemed fairly low, good stuff

#### This Section is dedicated to evaluate imputation using Decision Tree strategy

In [29]:

# Prepare the data (numeric columns only)
columns = ['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']
test_size = 0.3

for test_col in columns:
    # Step 1: Prepare the test dataset
    raw_data_copy = raw_data.copy()
    raw_data_copy.dropna(inplace=True)
    raw_data_copy.reset_index(drop=True, inplace=True)

    #Train Test split
    train_col = columns.copy()
    train_col.remove(test_col)

    X = raw_data_copy[train_col].values
    y = raw_data_copy[test_col].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    #Train model
    decision_tree = DecisionTreeRegressor(max_depth=5, random_state=42)
    decision_tree.fit(X_train, y_train)

    y_pred = decision_tree.predict(X_test)
        
    #Evaluate
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'Summary of {test_col}:')
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('R² Score:', r2)
    print('--------\n')

Summary of Tomatoes CriticScore:
Mean Absolute Error: 7.835627985849604
Mean Squared Error: 112.20334479145103
R² Score: 0.8545571626458768
--------

Summary of Tomatoes UserScore:
Mean Absolute Error: 9.902754603179579
Mean Squared Error: 171.0411980186958
R² Score: 0.5896257803637216
--------

Summary of Metascore:
Mean Absolute Error: 6.14279581338172
Mean Squared Error: 68.55870330237295
R² Score: 0.8058853347446275
--------

Summary of Meta UserScore:
Mean Absolute Error: 0.6125306144772517
Mean Squared Error: 0.6859065033066831
R² Score: 0.5545997041376811
--------



The loss seemed fairly low, good stuff