# Table of contents
1. [Libraries](#libraries)
2. [Numerical columns](#numerical)
3. [Categorical columns](#categorical)



<h1 id = "libraries"> 1. Libraries </h1>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
from scipy.stats import sem

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
raw_data = pd.read_csv('../Data/retyped_data.csv')

<div id = "numerical"> <h1>Evalutation </h1> </div>

#### This Section is dedicated to evaluate imputation using similarity strategy

+ Function to fill missing using similarity strategy

In [3]:

batch_size = 32
start = 0
end = 32

# Define the function to calculate similarities
def calculate_similarities(ratings, batch_start, batch_end):
    # Select the batch of users
    batch_ratings = ratings[batch_start:batch_end]
    
    # Calculate the absolute difference between the batch and all users
    abs_diff = np.abs(ratings - batch_ratings.reshape(batch_end - batch_start, 1, ratings.shape[1]))
    
    # Calculate the mean absolute difference across movies, ignoring NaN values
    mean_diff = np.nanmean(abs_diff, axis=2)
    
    # Compute similarity as the inverse of the mean absolute difference
    similarities = 1 / (mean_diff + 0.001)  # Adding a small epsilon to avoid division by zero
    similarities[np.isnan(similarities)] = 0
    return similarities

def fill_missing(data, batch_size = 32):
    n_movies = data.shape[0]
    filled_ratings = np.empty_like(data)
    num_batches = int(np.ceil(n_movies / batch_size))

    for i in range(num_batches):
        start = i * batch_size
        end = min((i + 1) * batch_size, n_movies)

        similarities = calculate_similarities(data, start, end)
        
        weights = ~np.isnan(data) * similarities.reshape(end - start, -1, 1)
        weights /= weights.sum(axis=1, keepdims=True)

        filled_ratings[start:end] = np.nansum(data * weights, axis=1)

    return filled_ratings



+ Evaluate each columns

In [4]:
columns = ['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']

test_size = 0.3

#Evalue each column seperatly
for test_col in columns:
    #Get a copy of data but remove all null value for testing
    raw_data_copy = raw_data.copy()
    raw_data_copy.dropna(inplace=True)
    raw_data_copy.reset_index(drop=True, inplace=True)

    #Get sample with size of 30%
    test_rows = raw_data_copy.sample(frac=test_size, random_state=42).index

    #Get y true value from the dataset
    y_test = raw_data_copy.loc[test_rows, test_col].copy()

    #Assign it's as nan value for imputing
    raw_data_copy.loc[test_rows, test_col] = np.nan

    #Perform imputing missing value using similarity
    tmp_data = raw_data_copy.copy()
    tmp_data['id'] = tmp_data.index
    tmp_data['Meta UserScore'] = tmp_data['Meta UserScore'] * 10

    tmp_data = tmp_data[['id','Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']].to_numpy()

    filled_ratings = fill_missing(tmp_data)
    filled_nanvals = filled_ratings[np.isnan(tmp_data)]

    tmp_data[np.isnan(tmp_data)] = filled_nanvals

    filled_df = pd.DataFrame(
        filled_ratings[:, 1:],
        columns=['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']
    )

    filled_df['Meta UserScore'] /= 10
    tmp_data_2 = raw_data_copy.copy()

    for col in filled_df.columns:
        tmp_data_2[col].fillna(filled_df[col], inplace=True)

    #Get y predicted (column after imputing)
    y_pred = tmp_data_2.loc[test_rows, test_col]

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'Sumary of {test_col}: ')
    print('Mean absolute error: ', mae)
    print('Mean squared error: ', mse)
    print('R2 score: ', r2)
    print("--------\n")

Sumary of Tomatoes CriticScore: 
Mean absolute error:  19.754701330716717
Mean squared error:  525.6943104026549
R2 score:  0.31772289189241265
--------

Sumary of Tomatoes UserScore: 
Mean absolute error:  14.239382392822442
Mean squared error:  289.0440282119497
R2 score:  0.3071021599944561
--------

Sumary of Metascore: 
Mean absolute error:  12.497650376541852
Mean squared error:  232.4710433204248
R2 score:  0.3416715140083487
--------

Sumary of Meta UserScore: 
Mean absolute error:  0.8212265709625988
Mean squared error:  1.084668638219071
R2 score:  0.29630987159548206
--------



The Loss seem too high, suggest using other imputation strategy (KNN, Decision Tree)