## Loading the dataset

In [37]:
import pandas as pd 

movies = pd.read_csv('../data/ml-32m/movies.csv')
ratings = pd.read_csv('../data/ml-32m/ratings.csv')
tags = pd.read_csv('../data/ml-32m/tags.csv')

merged = pd.merge(movies , ratings , on = 'movieId')
merged.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,2.5,1169265231
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,3.0,850085076
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.0,1027305751
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0,974704488
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,20,5.0,1553184230


## SVD model 

In [38]:
import joblib
import os

def svd_model(df , model_path = 'svd_model.pkl'):
    if os.path.exists(model_path):
        model = joblib.load(model_path)
        print('Model loaded and ready.')
    else:
        print('Model not found. Training a new one...')
        from surprise import Dataset, Reader
        from surprise import SVD
        from surprise.model_selection import train_test_split
        from surprise import accuracy
        
        reader = Reader(rating_scale=(0, 5))  
        data = Dataset.load_from_df(df[['userId', 'title', 'rating']], reader)
        
        trainset, testset = train_test_split(data, test_size=0.2)
        
        model = SVD()
        model.fit(trainset)
        
        predictions = model.test(testset)
        
        rmse = accuracy.rmse(predictions)
        print(f'RMSE: {rmse}')
        joblib.dump(model, 'svd_model.pkl')

    return model

## Recommend with this model 

In [41]:
def recommend (df , model , user_id):
    movies = df['title'].unique()
    movie_watched = df[df['userId'] == user_id].title.values
    new_movies = [movie for movie in movies if movie not in movie_watched]
    predictions = {}
    for movie in new_movies:
        predict = model.predict (user_id , movie).est
        predictions [movie] = predict.round(2)
    sorted_movies = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
    return sorted_movies

In [None]:
%%time
model = svd_model (merged)

In [44]:
%%time
recommend (merged , model , 674)

CPU times: total: 5.81 s
Wall time: 5.83 s


[('Memories of Matsuko (Kiraware Matsuko no isshô) (2006)', 3.43),
 ('Connections (1978)', 3.38),
 ('La Soufrière - Warten auf eine unausweichliche Katastrophe (1977)', 3.31),
 ('Organizer, The (I compagni) (1963)', 3.26),
 ('Alone in the Wilderness (2004)', 3.24),
 ('How to Steal a Million (1966)', 3.23),
 ('North & South (2004)', 3.22),
 ('Meet Me in St. Louis (1944)', 3.21),
 ('Top Gun: Maverick (2022)', 3.21),
 ('Mike Birbiglia: What I Should Have Said Was Nothing (2008)', 3.19),
 ('House Is Black, The (1963)', 3.18),
 ('Drishyam (2015)', 3.18),
 ('Sorrow and the Pity, The (Le chagrin et la pitié) (1969)', 3.16),
 ('Animals are Beautiful People (1974)', 3.15),
 ('Fishing with John (1991)', 3.15),
 ('Mission: Impossible - Fallout (2018)', 3.14),
 ('Long Way Round (2004)', 3.13),
 ('Die Hard (1988)', 3.12),
 ('Newsies (1992)', 3.12),
 ('The Adventures of Sherlock Holmes and Dr. Watson: The Hound of the Baskervilles (1981)',
  3.12),
 ('Sherlock: The Blind Banker', 3.12),
 ('Few Good 

## Content based 

In [48]:
merged2 = pd.merge (movies , tags , on = 'movieId')

In [56]:
tagged_df = merged2.groupby('movieId')['tag'] \
    .agg(lambda tags: ' | '.join(str(tag) for tag in set(tags) if pd.notnull(tag))) \
    .reset_index()

In [69]:
tagged_df

Unnamed: 0,movieId,tag
0,1,animated fictional tv commercial | forced puns...
1,2,car crashes into a store | 20th century | new ...
2,3,Walter Matthau | Minnesota | comedinha de velh...
3,4,slurs | divorce | interracial relationship | s...
4,5,growing old | midlife crisis | remake | Fantas...
...,...,...
51318,292143,husband wife relationship | China | Cadaqués |...
51319,292349,politically incorrect
51320,292371,Stephen King
51321,292597,artificial intelligence
