## Loading the dataset

In [None]:
import pandas as pd 

movies = pd.read_csv('../data/ml-32m/movies.csv')
ratings = pd.read_csv('../data/ml-32m/ratings.csv')
tags = pd.read_csv('../data/ml-32m/tags.csv')

merged = pd.merge(movies , ratings , on = 'movieId')
merged.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,2.5,1169265231
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,3.0,850085076
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.0,1027305751
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0,974704488
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,20,5.0,1553184230


## SVD model 

In [14]:
import joblib
import os

In [None]:
def svd_model(df , model_path = 'svd_model.pkl'):
    if os.path.exists(model_path):
        model = joblib.load(model_path)
        print('Model loaded and ready.')
    else:
        print('Model not found. Training a new one...')
        from surprise import Dataset, Reader
        from surprise import SVD
        from surprise.model_selection import train_test_split
        from surprise import accuracy
        
        reader = Reader(rating_scale=(0, 5))  
        data = Dataset.load_from_df(df[['userId', 'title', 'rating']], reader)
        
        trainset, testset = train_test_split(data, test_size=0.2)
        
        model = SVD()
        model.fit(trainset)
        
        predictions = model.test(testset)
        
        rmse = accuracy.rmse(predictions)
        print(f'RMSE: {rmse}')
        joblib.dump(model, 'svd_model.pkl')

    return model

## Recommend with this model 

In [41]:
def recommend (df , model , user_id):
    movies = df['title'].unique()
    movie_watched = df[df['userId'] == user_id].title.values
    new_movies = [movie for movie in movies if movie not in movie_watched]
    predictions = {}
    for movie in new_movies:
        predict = model.predict (user_id , movie).est
        predictions [movie] = predict.round(2)
    sorted_movies = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
    return sorted_movies

In [None]:
%%time
model = svd_model (merged)

In [44]:
%%time
recommend (merged , model , 674)

CPU times: total: 5.81 s
Wall time: 5.83 s


[('Memories of Matsuko (Kiraware Matsuko no isshô) (2006)', 3.43),
 ('Connections (1978)', 3.38),
 ('La Soufrière - Warten auf eine unausweichliche Katastrophe (1977)', 3.31),
 ('Organizer, The (I compagni) (1963)', 3.26),
 ('Alone in the Wilderness (2004)', 3.24),
 ('How to Steal a Million (1966)', 3.23),
 ('North & South (2004)', 3.22),
 ('Meet Me in St. Louis (1944)', 3.21),
 ('Top Gun: Maverick (2022)', 3.21),
 ('Mike Birbiglia: What I Should Have Said Was Nothing (2008)', 3.19),
 ('House Is Black, The (1963)', 3.18),
 ('Drishyam (2015)', 3.18),
 ('Sorrow and the Pity, The (Le chagrin et la pitié) (1969)', 3.16),
 ('Animals are Beautiful People (1974)', 3.15),
 ('Fishing with John (1991)', 3.15),
 ('Mission: Impossible - Fallout (2018)', 3.14),
 ('Long Way Round (2004)', 3.13),
 ('Die Hard (1988)', 3.12),
 ('Newsies (1992)', 3.12),
 ('The Adventures of Sherlock Holmes and Dr. Watson: The Hound of the Baskervilles (1981)',
  3.12),
 ('Sherlock: The Blind Banker', 3.12),
 ('Few Good 

## Content based 

In [2]:
merged2 = pd.merge (movies , tags , on = 'movieId')

In [7]:
tagged_df = merged2.groupby('movieId')['tag'] \
    .agg(lambda tags: '|'.join(str(tag) for tag in set(tags) if pd.notnull(tag))) \
    .reset_index()

In [8]:
tagged_df

Unnamed: 0,movieId,tag
0,1,low fantasy|surgical mask|lifting a female int...
1,2,based on a book|construction site|brother sist...
2,3,Funniest Movies|best friend|sequel fever|grun ...
3,4,revenge|divorce|chick flick|single mother|char...
4,5,sentimental|parent child relationship|growing ...
...,...,...
51318,292143,catalonia|village|replica|no narrator|seaside|...
51319,292349,politically incorrect
51320,292371,Stephen King
51321,292597,artificial intelligence


In [10]:
combined = pd.merge(tagged_df , movies , on = 'movieId')
combined ['text_features'] = final ['tag'] + '|' + final ['genres']
combined

Unnamed: 0,movieId,tag,title,genres,text_features
0,1,low fantasy|surgical mask|lifting a female int...,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,low fantasy|surgical mask|lifting a female int...
1,2,based on a book|construction site|brother sist...,Jumanji (1995),Adventure|Children|Fantasy,based on a book|construction site|brother sist...
2,3,Funniest Movies|best friend|sequel fever|grun ...,Grumpier Old Men (1995),Comedy|Romance,Funniest Movies|best friend|sequel fever|grun ...
3,4,revenge|divorce|chick flick|single mother|char...,Waiting to Exhale (1995),Comedy|Drama|Romance,revenge|divorce|chick flick|single mother|char...
4,5,sentimental|parent child relationship|growing ...,Father of the Bride Part II (1995),Comedy,sentimental|parent child relationship|growing ...
...,...,...,...,...,...
51318,292143,catalonia|village|replica|no narrator|seaside|...,La substància (2016),(no genres listed),catalonia|village|replica|no narrator|seaside|...
51319,292349,politically incorrect,Totally Killer (2023),Comedy|Horror,politically incorrect|Comedy|Horror
51320,292371,Stephen King,Pet Sematary: Bloodlines (2023),Fantasy|Horror,Stephen King|Fantasy|Horror
51321,292597,artificial intelligence,The Mill (2023),Horror|Sci-Fi,artificial intelligence|Horror|Sci-Fi


In [11]:
final = combined [['movieId' , 'text_features']]
final

Unnamed: 0,movieId,text_features
0,1,low fantasy|surgical mask|lifting a female int...
1,2,based on a book|construction site|brother sist...
2,3,Funniest Movies|best friend|sequel fever|grun ...
3,4,revenge|divorce|chick flick|single mother|char...
4,5,sentimental|parent child relationship|growing ...
...,...,...
51318,292143,catalonia|village|replica|no narrator|seaside|...
51319,292349,politically incorrect|Comedy|Horror
51320,292371,Stephen King|Fantasy|Horror
51321,292597,artificial intelligence|Horror|Sci-Fi


In [18]:
import numpy as np
from dotenv import load_dotenv
from openai import OpenAI
load_dotenv()

True

In [19]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [22]:
final['embedding'] = final['text_features'].apply(
    lambda x: client.embeddings.create(model="text-embedding-3-small", input=x).data[0].embedding
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['embedding'] = final['text_features'].apply(


In [23]:
final

Unnamed: 0,movieId,text_features,embedding
0,1,low fantasy|surgical mask|lifting a female int...,"[0.012040429748594761, 0.013948115520179272, -..."
1,2,based on a book|construction site|brother sist...,"[0.018080798909068108, 0.06331375241279602, -0..."
2,3,Funniest Movies|best friend|sequel fever|grun ...,"[0.0014785799430683255, 0.04591622203588486, -..."
3,4,revenge|divorce|chick flick|single mother|char...,"[0.02682194672524929, 0.06319113820791245, -0...."
4,5,sentimental|parent child relationship|growing ...,"[0.024181121960282326, 0.033560220152139664, -..."
...,...,...,...
51318,292143,catalonia|village|replica|no narrator|seaside|...,"[0.03421638906002045, 0.024409234523773193, -0..."
51319,292349,politically incorrect|Comedy|Horror,"[-0.024666042998433113, 0.022995492443442345, ..."
51320,292371,Stephen King|Fantasy|Horror,"[0.014681873843073845, 0.030048822984099388, -..."
51321,292597,artificial intelligence|Horror|Sci-Fi,"[-0.02497914992272854, 0.030762914568185806, -..."


In [24]:
final.to_csv('final.csv', index=False)

In [2]:
final = pd.read_csv ('final.csv')

In [15]:
final = final.merge(movies[['movieId', 'title']], on='movieId', how='left')
final

Unnamed: 0,movieId,text_features,embedding,title
0,1,low fantasy|surgical mask|lifting a female int...,"[0.012040429748594761, 0.013948115520179272, -...",Toy Story (1995)
1,2,based on a book|construction site|brother sist...,"[0.018080798909068108, 0.06331375241279602, -0...",Jumanji (1995)
2,3,Funniest Movies|best friend|sequel fever|grun ...,"[0.0014785799430683255, 0.04591622203588486, -...",Grumpier Old Men (1995)
3,4,revenge|divorce|chick flick|single mother|char...,"[0.02682194672524929, 0.06319113820791245, -0....",Waiting to Exhale (1995)
4,5,sentimental|parent child relationship|growing ...,"[0.024181121960282326, 0.033560220152139664, -...",Father of the Bride Part II (1995)
...,...,...,...,...
51318,292143,catalonia|village|replica|no narrator|seaside|...,"[0.03421638906002045, 0.024409234523773193, -0...",La substància (2016)
51319,292349,politically incorrect|Comedy|Horror,"[-0.024666042998433113, 0.022995492443442345, ...",Totally Killer (2023)
51320,292371,Stephen King|Fantasy|Horror,"[0.014681873843073845, 0.030048822984099388, -...",Pet Sematary: Bloodlines (2023)
51321,292597,artificial intelligence|Horror|Sci-Fi,"[-0.02497914992272854, 0.030762914568185806, -...",The Mill (2023)


In [None]:
import numpy as np

import ast
final['embedding'] = final['embedding'].apply(ast.literal_eval)
emb_matrix = np.array(final['embedding'].to_list(), dtype=float)

In [17]:
def recommend(movie_id, n=5):
    # Find movie index
    idx = final.index[final['movieId'] == movie_id][0]
    
    # Compute cosine similarities
    target = emb_matrix[idx]
    norms = np.linalg.norm(emb_matrix, axis=1) * np.linalg.norm(target)
    sims = np.dot(emb_matrix, target) / norms
    
    # Get top-N similar movies (excluding itself)
    top_idx = np.argsort(sims)[::-1][1:n+1]
    return final.iloc[top_idx][['movieId', 'title' , 'text_features']]

In [18]:
# Example
recommend(1, 5)

Unnamed: 0,movieId,title,text_features
2843,3114,Toy Story 2 (1999),low fantasy|piggy bank|slinky dog|avi|mistaken...
45251,201588,Toy Story 4 (2019),low fantasy|woody character|Disney|Pixar|piggy...
14021,78499,Toy Story 3 (2010),garbage chute|piggy bank|brother sister relati...
11397,56176,Alvin and the Chipmunks (2007),winter|live action cgi hybrid|family friendly|...
1689,1920,Small Soldiers (1998),attack|irreverence|live action and animation|b...


In [23]:
recommend(187593, 10)

Unnamed: 0,movieId,title,text_features
22641,122904,Deadpool (2016),snow|construction site|character says i love y...
18530,104241,Kick-Ass 2 (2013),needless violence|smoke grenade|kicked in the ...
36701,168252,Logan (2017),poisoning|lifting a female into the air|manhun...
12607,68319,X-Men Origins: Wolverine (2009),child in peril|set up|ororo munroe character|a...
43985,192389,Venom (2018),humor|manhunt|program|biohazard suit|specimen|...
46240,210271,6 Underground (2019),no story|air strike|afghanistan|close up of ey...
18445,103772,"Wolverine, The (2013)",snow|thrown off a balcony|stabbed in the stoma...
18156,102125,Iron Man 3 (2013),child in peril|snow|child genius|toy gun|power...
6931,7439,"Punisher, The (2004)",organized crime|character says i love you|crut...
22651,122924,X-Men: Apocalypse (2016),snow|no story|Oscar Isaac|group name in title|...
