In [61]:
from sklearn.decomposition import TruncatedSVD
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
 
from adjustText import adjust_text
%matplotlib macosx

In [62]:
# Global matplotlib styling for every figure in this notebook:
# bold Times New Roman text, rendered and saved at 300 dpi.
plt.rc('font', family='Times New Roman', weight='bold')
plt.rc('axes', labelweight='bold')
plt.rc('figure', dpi=300)
plt.rc('savefig', dpi=300)

In [63]:
def visualize_latent_factors(V, ids, titles, method_name, plot_title, max_title_len=25):
    """Scatter-plot the first two coordinates of selected rows of ``V`` with text labels.

    Parameters
    ----------
    V : array supporting ``V[ids, :]`` indexing
        Embedding matrix. Only columns 0 and 1 are plotted; any further
        columns are ignored.
    ids : sequence of int
        Row indices of ``V`` to plot. Each value is also used as the lookup
        key into ``titles``.
    titles : mapping or sequence
        Label lookup indexed by the same values as ``ids``.
    method_name : str
        Prefix of the output file name.
    plot_title : str
        Axes title. It is also lower-cased, with spaces replaced by
        underscores, to build the output file name.
    max_title_len : int, default 25
        Labels longer than this are cut to this many characters and suffixed
        with ``'...'``.

    Side effects
    ------------
    Writes ``"{method_name}_{plot_title}.png"`` (normalised as described
    above) at 300 dpi with a tight bounding box, then shows the figure.
    """
    points = V[ids, :]
    # Take the two plotted columns explicitly so the label loop agrees with the
    # scatter call even when V has more than two columns.
    xs, ys = points[:, 0], points[:, 1]

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.scatter(xs, ys, s=100, alpha=0.7)

    texts = []
    for movie_id, x, y in zip(ids, xs, ys):
        label = titles[movie_id]
        if len(label) > max_title_len:
            label = label[:max_title_len] + '...'
        texts.append(
            ax.text(x, y, label, ha='center', va='center', weight=400, fontsize=6)
        )

    # Nudge overlapping labels apart; skipped when nothing was selected.
    if texts:
        adjust_text(texts, ax=ax, arrowprops={'arrowstyle': '-', 'color': 'black'})

    ax.set_title(plot_title)
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.grid(True, linestyle='--', alpha=0.7)

    filename = f"{method_name}_{plot_title.replace(' ', '_').lower()}.png"
    fig.savefig(filename, dpi=300, bbox_inches='tight')
    plt.show()

In [64]:
# Ratings splits are read with pandas' default encoding; the movie metadata
# file is read as latin-1.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)
movies_df = pd.read_csv('../data/movies.csv', encoding='latin-1')

In [72]:
# Display the movie metadata table (pandas abbreviates long/wide frames in the output).
# NOTE(review): this cell's execution count (72) is out of sequence with its
# neighbours — confirm the notebook still runs top to bottom on a fresh kernel.
movies_df

Unnamed: 0,Movie ID,Movie Title,Unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


: 

In [66]:
# Number of movies selected for each plotted group.
N_PER_GROUP = 20

# Zero-based movie index (Movie ID - 1) -> title. Built column-wise instead of
# row-by-row with iterrows(); the resulting dict is the same.
movie_titles = dict(
    zip((movies_df['Movie ID'].astype(int) - 1).tolist(), movies_df['Movie Title'])
)

# The first N_PER_GROUP movie indices. This matches the recorded output of the
# cell that prints `any10` (0..19); the previous literal `[1]` did not, so a
# fresh Restart & Run All would not have reproduced the saved results.
# The name is kept because later cells reference it, although it holds 20
# indices rather than 10.
any10 = list(range(N_PER_GROUP))

# Movies with the most ratings in the training split.
popularity = train_df.groupby('Movie ID').size()
most_popular = (popularity.sort_values(ascending=False).head(N_PER_GROUP).index - 1).tolist()

# Movies with the highest mean rating in the training split.
avg_ratings = train_df.groupby('Movie ID')['Rating'].mean()
best_movies = (avg_ratings.sort_values(ascending=False).head(N_PER_GROUP).index - 1).tolist()

# For each genre flag column, the first N_PER_GROUP movies (in file order)
# whose flag equals 1.
genres = ["Horror", "Comedy", "Childrens"]
genre_movies = {}
for genre in genres:
    subset = movies_df[movies_df[genre] == 1]
    indices = (subset['Movie ID'] - 1).head(N_PER_GROUP).tolist()
    print(indices)
    genre_movies[genre] = indices

[16, 83, 100, 122, 182, 183, 184, 199, 200, 207, 216, 218, 233, 287, 306, 342, 350, 352, 378, 395]
[0, 3, 7, 12, 15, 16, 20, 24, 25, 28, 33, 39, 40, 41, 44, 46, 48, 62, 64, 65]
[0, 7, 34, 62, 70, 77, 90, 93, 94, 98, 101, 102, 111, 131, 137, 138, 139, 140, 141, 150]


In [67]:
# Sanity check: show the movie indices that are later passed to visualize_latent_factors.
print(any10)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [68]:
# Load the saved factor matrix and project it to two dimensions for plotting.
# NOTE(review): presumably one row per movie (recorded shape is (1682, 20)) —
# confirm against the training code that wrote this file.
V_full = np.load("scratch_bias.npy")
print(V_full.shape)

# TruncatedSVD's default solver is randomized; fixing random_state makes the
# 2-D projection (and therefore the saved plot) reproducible across runs.
svd = TruncatedSVD(n_components=2, n_iter=20, random_state=0)
V = svd.fit_transform(V_full)
print(V.shape)

(1682, 20)
(1682, 2)


In [69]:
# Plot the selected movies in the 2-D projection; the figure is saved as
# "from_scratch_plot.png".
visualize_latent_factors(
    V,
    any10,
    movie_titles,
    method_name="from_scratch",
    plot_title="plot",
)