# Imports 

In [86]:
import numpy as np
import pandas as pd
import polars as pl
import time
from src.Latent_Factor_Updates import Update_movie_factors_with_features, Update_user_biases, Update_user_factors, Update_movie_biases, update_feature_factors, calc_metrics
from src.Helper_Functions import load_model, Load_idx_maps,create_fake_user ,setup_logging, setup_experiment_folder, Load_training_data, Load_idx_maps, Load_test_data, get_possible_movie_indices


In [100]:

def create_idx_to_title(movies_csv_path, movie_idx_map):
    """Build a lookup from internal movie index to movie title.

    Args:
        movies_csv_path: Location of the movies CSV; handed unchanged to
            ``pd.read_csv``.
        movie_idx_map: Mapping of ``movieId`` -> internal index.

    Returns:
        dict mapping each internal index to the title of its movie. Ids in
        ``movie_idx_map`` that do not appear in the CSV are silently skipped.

    Raises:
        ValueError: If the CSV lacks a ``movieId`` or a ``title`` column.
    """
    catalogue = pd.read_csv(movies_csv_path)

    # Both columns are required to pair ids with titles.
    if not {'movieId', 'title'}.issubset(catalogue.columns):
        raise ValueError("The CSV file must contain 'movieId' and 'title' columns.")

    # movieId -> title, straight from the catalogue rows.
    title_by_id = dict(zip(catalogue['movieId'], catalogue['title']))

    # Re-key the titles by internal index, keeping only ids the catalogue knows.
    idx_to_title = {}
    for movie_id, idx in movie_idx_map.items():
        if movie_id in title_by_id:
            idx_to_title[idx] = title_by_id[movie_id]

    return idx_to_title
# Build the index -> title lookup used by the ranking cells below.
# NOTE(review): movie_idx_map is only assigned by the Load_idx_maps(...) cell
# further down in this notebook, so that cell must be executed before this one.
movies_csv_path = "Data/ml-32m/movies.csv"  # Path to the movies.csv file
idx_to_title = create_idx_to_title(movies_csv_path, movie_idx_map)

## Polarizing Movies By rating standard deviation

A polarizing movie has a wide spread of ratings: some users rate it very highly, while others rate it very poorly. Statistically, polarization can be measured by the standard deviation of a movie's ratings.

    High standard deviation of ratings → High polarization.
    Low standard deviation of ratings → Low polarization.

In [None]:
# Folder holding the preprocessed training/test arrays and the index maps.
source_data_folder = "Training_data/ml-32m"

# Training split.
# NOTE(review): the index arrays are unpacked as (movies, users) here but as
# (users, movies) for the test split below -- confirm against the return order
# of Load_training_data / Load_test_data.
(
    users_train,
    movies_train,
    movies_train_idxes,
    users_train_idxes,
    movies_genres_array,
) = Load_training_data(source_data_folder)

# Test split.
(
    users_test,
    movies_test,
    users_test_idxes,
    movies_test_idxes,
) = Load_test_data(source_data_folder)

# Lookup tables between raw ids and internal indices.
(
    user_idx_map,
    movie_idx_map,
    idx_to_user,
    idx_to_movie,
    genre_to_idx,
    specific_indices,
) = Load_idx_maps(source_data_folder)

In [78]:
# Load the raw ratings table into a DataFrame.
# NOTE(review): `ratings` is not referenced by any other cell visible in this
# section -- confirm it is still needed before paying the load cost.
ratings = pd.read_csv("Data/ml-32m/ratings.csv")

In [94]:
# Column 1 of movies_train is treated as the per-movie collection of ratings.
# (presumably one array/list of ratings per movie -- TODO confirm the layout)
movies_train_ratings = movies_train[:, 1]

# One mean and one standard deviation per movie, coerced to plain floats.
# np.std is called with its defaults, i.e. the population std (ddof=0).
movies_mean = np.array([float(np.mean(rate)) for rate in movies_train_ratings])
movies_std = np.array([float(np.std(rate)) for rate in movies_train_ratings])

# Quick sanity check on the first few values and the overall size.
print("Movies Mean:", movies_mean[:10])
print("Movies Std:", movies_std[:10])
movies_std.shape


Movies Mean: [3.94334483 3.67890239 3.92748404 3.63284612 3.90872955 3.58488035
 3.92296505 3.76151013 3.98983526 4.07776451]
Movies Std: [0.96912217 1.03243673 0.98380136 0.97880602 0.85970008 1.1091882
 0.90053898 1.03916454 0.96618515 0.85426992]


(82486,)

In [91]:
# Get the indexes of the top 50 movies with the highest standard deviation
# Rank all movies once by rating standard deviation. np.argsort returns the
# indices in ascending order of std, so both ends of the ranking can be sliced
# from the same result instead of sorting the full array twice.
_std_order = np.argsort(movies_std)

# 50 largest standard deviations, reversed so the largest comes first.
most_polarizing_movies_idx_std = _std_order[-50:][::-1]
# 50 smallest standard deviations, in ascending order (smallest first).
least_polarizing_movies_idx_std = _std_order[:50]

# Print both index lists
print("Top polarizing_movies indexes by std:", most_polarizing_movies_idx_std)
print("Least polarizing movie indexes by std:", least_polarizing_movies_idx_std)


Top polarizing_movies indexes by std: [65160 74306 52180 46898 78432 49242 65119 65098 65318 65316 65342 65090
 62663 67589 67586 42894 16788 61696 46555 65490 59152 71225 55038 76972
 71319 65378 49444 40641 40640 51462 51521 52833 17667 61465 61446 51407
 78569 37140 73420 74069 44092 44128 58616 64970 51445 61380 68610 68646
 51395 61174]
Least polarizing movie indexes by std: [37288 63506 63505 63504 63503 63513 63512 63511 63529 37291 37289 63540
 37286 37285 37284 37283 37314 37263 37259 63590 63587 63569 63487 63498
 63497 63496 63495 63494 63491 63461 63460 63489 63488 63567 37258 37256
 37254 63525 63523 63554 63550 63549 63546 63542 63654 63640 63637 63636
 63615 63605]


In [93]:
# Translate the ranked movie indices into titles via the idx_to_title lookup.
# An index missing from idx_to_title raises KeyError.
most_polarizing_movie_names_std = list(
    map(idx_to_title.__getitem__, most_polarizing_movies_idx_std)
)
least_polarizing_movie_names_std = list(
    map(idx_to_title.__getitem__, least_polarizing_movies_idx_std)
)

# Show the titles at both ends of the std ranking
print("Most Polarizing Movies by std:", most_polarizing_movie_names_std)
print("Least Polarizing Movies by std:", least_polarizing_movie_names_std)

Most Polarizing Movies by std: ["Bernard Herrmann: Hitchcock's Maestro (2008)", 'The Girl Who Escaped: The Kara Robinson Story (2023)', 'The Marchers (2013)', 'The Hat (1999)', 'Senior Entourage (2021)', 'Otaku (1994)', 'Flowers for Madame (1935)', 'Sunshine Dad (1916)', 'On the Hook! (2011)', 'Butterfly Kiss (2006)', 'The Most Reluctant Convert: The Untold Story of C.S. Lewis (2021)', 'The Prophetess of Thebes (1907)', 'David and the Elves (2021)', '4 Wedding Planners (2011)', 'The Pulitzer At 100 (2017)', 'The Fox and the Hare (1973)', 'We Can Do That (2008)', 'Like Two Drops of Water (1963)', 'The Love Goddesses (1965)', 'Up to the World (2014)', 'Nightmare (1961)', "'Master Harold' ... And the Boys (2010)", 'Snow Queen, The (Lumikuningatar) (1986)', 'Escape from East Berlin (1962)', 'Naked (2002)', "I Don't Speak English (1995)", 'Chasers, The (Jakten) (1959)', 'Hollidaysburg (2014)', 'The Midnight Game (2013)', 'Flying Dagger (1993)', 'Man of the Moment (1955)', 'Late Bloomer (201

## Polarizing Movies by Trait Vector Length
A longer vector indicates that the movie has more pronounced or extreme features along its latent dimensions. For instance:

    A movie with a long vector may exhibit strong genre characteristics (e.g., extremely action-packed, heavily comedic).
    A movie with a short vector may be more neutral or generic in its features (e.g., not strongly defined by any particular trait).


    Long trait vectors: May correspond to movies that are highly distinctive or "different" from most others, which could align with being polarizing (since they appeal strongly to some but not to others).
    Short trait vectors: May correspond to movies that are generic or broadly appealing, likely to be less polarizing.

In [96]:
# Load the trained model from the experiment folder; load_model returns five
# values here and only the first one is kept.
# (presumably the movie latent-factor matrix -- TODO confirm against load_model)
movies_factors,_,_,_,_ = load_model("Experiments_ml-32m/B_U_V_F/")

In [97]:
# Calculate the Euclidean norm (vector length) for each movie
vector_lengths = np.linalg.norm(movies_factors, axis=1)
vector_lengths.shape


(84432,)

In [98]:

# Rank all movies once by latent-vector length. np.argsort returns the indices
# in ascending order of length, so both ends of the ranking can be sliced from
# the same result instead of sorting the full array twice.
_length_order = np.argsort(vector_lengths)

# 50 longest vectors, reversed so the longest comes first.
most_polarizing_movies_idx = _length_order[-50:][::-1]
# 50 shortest vectors, in ascending order (shortest first).
least_polarizing_movies_idx = _length_order[:50]

# Print both index lists; this ranking is by vector length, not by rating std.
print("Top polarizing_movies indexes by vector length:", most_polarizing_movies_idx)
print("Least polarizing movie indexes by vector length:", least_polarizing_movies_idx)

Top polarizing_movies indexes by std: [ 613  339  596  309  332  163  153  609  583  324  655  618   15   53
   60  344  159   52 1439  611  628 1440  749 1441  291   88  279  287
  675  515  709  514 1194    1  563  831    5  306 1063  466  127  400
  369  212  517  888  204  446   21  216]
Least polarizing movie indexes by std: [84129 83959 61699 61714 61601 83369 38662 38721 38713 38400 82686 83930
 83893 76018 39608 69124 73459 84335 77742 82904 82906 73220 49908 80798
 80790 76168 77558 77656 77524 77486 77671 77450 77539 77667 77550 77497
 77517 77510 77476 72011 76026 68691 68757 68777 68801 68848 68666 68873
 68782 68780]


In [99]:
# Display the vector lengths of the selected movies, in the order given by
# most_polarizing_movies_idx.
vector_lengths[most_polarizing_movies_idx]

array([11.63503738, 11.48841019, 11.47644284, 10.42669243, 10.11230646,
        9.90327349,  9.88377378,  9.84320863,  9.83150787,  9.79786509,
        9.72775475,  9.70289757,  9.55487253,  9.17080817,  9.13229826,
        8.978521  ,  8.66635038,  8.63592142,  8.60576906,  8.55018878,
        8.53313517,  8.53299896,  8.50105554,  8.48808288,  8.46221758,
        8.45855006,  8.45674578,  8.43765739,  8.39659008,  8.30224411,
        8.26087706,  8.20884132,  8.19220405,  8.07084877,  8.06059767,
        8.05964855,  8.05816241,  8.02397172,  7.9735171 ,  7.93744893,
        7.92391954,  7.86329425,  7.85128971,  7.80808141,  7.77592204,
        7.73171296,  7.68354953,  7.66852107,  7.6412049 ,  7.64087123])

In [101]:
# Translate the indices ranked by vector length into titles via idx_to_title.
# An index missing from idx_to_title raises KeyError.
most_polarizing_movie_names_length = list(
    map(idx_to_title.__getitem__, most_polarizing_movies_idx)
)

# Show the titles of the movies with the longest trait vectors
print("Most Polarizing Movies by length:", most_polarizing_movie_names_length)

Most Polarizing Movies by length: ['Lord of the Rings: The Return of the King, The (2003)', 'Lord of the Rings: The Fellowship of the Ring, The (2001)', 'Lord of the Rings: The Two Towers, The (2002)', 'Blair Witch Project, The (1999)', 'Star Wars: Episode I - The Phantom Menace (1999)', 'Ace Ventura: Pet Detective (1994)', 'Dumb & Dumber (Dumb and Dumber) (1994)', 'Kill Bill: Vol. 1 (2003)', 'Star Wars: Episode II - Attack of the Clones (2002)', 'Natural Born Killers (1994)', 'Star Wars: Episode III - Revenge of the Sith (2005)', 'Kill Bill: Vol. 2 (2004)', 'Star Wars: Episode IV - A New Hope (1977)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode VI - Return of the Jedi (1983)', 'Ace Ventura: When Nature Calls (1995)', 'Pulp Fiction (1994)', 'English Patient, The (1996)', 'Harry Potter and the Half-Blood Prince (2009)', 'Matrix Revolutions, The (2003)', 'Harry Potter and the Prisoner of Azkaban (2004)', 'Harry Potter and the Deathly Hallows: Part 1 (2010