In [1]:
import pandas as pd
import numpy as np
import streamlit as st
from sklearn.metrics.pairwise import cosine_similarity
import re
import ast


In [2]:

movies_df = pd.read_csv('ml-25m/movies.csv')
ratings_df = pd.read_csv('ml-25m/ratings.csv')
links_df = pd.read_csv('ml-25m/links.csv')


In [3]:
ratings_df.shape

(1048575, 2)

<h3><b><u> PreProcessing </h3></b></u></b>
- No duplicates in either files </b>
- Only nulls in links.csv - 107 nulls in tmdbId </b>

In [4]:
def preprocessing_dataset(dfName, moviedf):
    moviedf_head = moviedf.head()
    moviedf_duplicates = moviedf.duplicated().sum()
    moviedf_null= moviedf.isnull().sum()
    moviedf_shape = links_df.shape
    print(f"{dfName}\n{moviedf_head}\n Duplicates: {moviedf_duplicates}\n Nulls: {moviedf_null}\n Shape: {moviedf_shape} \n \n")


In [5]:
movies_dataset = [movies_df, ratings_df, links_df]
print(preprocessing_dataset("Movie Names DF",movies_df))
print(preprocessing_dataset("Ratings DF",ratings_df))
print(preprocessing_dataset("Links DF",links_df))

Movie Names DF
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
 Duplicates: 0
 Nulls: movieId    0
title      0
genres     0
dtype: int64
 Shape: (62423, 3) 
 

None
Ratings DF
   movieId  rating
0      296     5.0
1      306     3.5
2      307     5.0
3      665     5.0
4      899     3.5
 Duplicates: 960477
 Nulls: movieId    0
rating     0
dtype: int64
 Shape: (62423, 3) 
 

None
Links DF
   movieId  imdbId   tmdbId
0        1  114709    862.0
1 

In [6]:
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)
movies_df['title'] = movies_df['title'].apply(lambda x: re.sub(r'\(\d{4}\)', '', x).strip())
movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df.head()


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [7]:
ratings_df.shape

(1048575, 2)

In [8]:
# Ratings has 88098 entries with 22240 unique movie Ids. Some movies missing ratings, pad with 0
print("Unique movie Ids with ratings is",ratings_df['movieId'].nunique())
average_ratings = ratings_df.groupby('movieId')['rating'].mean().round(2)
ratings_df['rating'] = ratings_df['movieId'].map(average_ratings)
ratings_df.drop_duplicates(subset=['movieId', 'rating'], inplace=True)
print(f"New ratings shape: {ratings_df.shape}")
ratings_df.head()



Unique movie Ids with ratings is 22240


New ratings shape: (22240, 2)


Unnamed: 0,movieId,rating
0,296,4.18
1,306,4.09
2,307,3.95
3,665,3.94
4,899,4.06


In [9]:
merged_movies_df = pd.merge(movies_df,links_df, on='movieId')
merged_movies_df = merged_movies_df.merge(ratings_df,on='movieId')
merged_movies_df

Unnamed: 0,movieId,title,genres,year,imdbId,tmdbId,rating
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,114709,862.0,3.90
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,113497,8844.0,3.31
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,113228,15602.0,3.17
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,114885,31357.0,2.94
4,5,Father of the Bride Part II,[Comedy],1995,113041,11862.0,3.13
...,...,...,...,...,...,...,...
22235,208112,Rudolph the Red-Nosed Reindeer & the Island of...,"[Animation, Children]",2001,293913,43575.0,5.00
22236,208737,Midway,"[Action, Drama, War]",2019,6924650,522162.0,3.50
22237,208793,Watchman,"[Drama, Thriller]",2019,10310222,575184.0,3.50
22238,208939,Klaus,"[Adventure, Animation, Children, Comedy]",2019,4729430,508965.0,4.00


In [10]:
null_ratings_count = ratings_df['rating'].isnull().sum()
null_ratings_count

0

<h1><u><b> Embeddings 

In [11]:
def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_embeddings = load_glove_embeddings('glove.6B.50d.txt')

In [12]:

def parse_genres(genres_str):
    # Check if genres_str is already a list
    if isinstance(genres_str, list):
        return genres_str
    # Check if the genres string is empty or not formatted as a list
    elif genres_str == "[]" or genres_str == "" or not genres_str.startswith("["):
        return []
    try:
        # Attempt to parse the string as a list
        genres = ast.literal_eval(genres_str)
        # Check if the result is a list
        if isinstance(genres, list):
            return genres
        else:
            return []
    except (ValueError, SyntaxError):
        # Return an empty list if there's an error in parsing
        return []



In [13]:

def get_movie_embedding(movie_title, movie_tags, embeddings):
    words = movie_title.lower().split() + [tag.lower() for tag in movie_tags]
    # Initialize an empty list for embeddings
    vectors = []
    for word in words:
        # Check if the word is in the embeddings
        if word in embeddings:
            vectors.append(embeddings[word])
        else:
            # If not, append a zeros vector of the same length as the other embeddings
            vectors.append(np.zeros_like(list(embeddings.values())[0]))
    return np.mean(vectors, axis=0)

merged_movies_df['genres'] = merged_movies_df['genres'].apply(parse_genres)

merged_movies_df['embedding'] = merged_movies_df.apply(lambda x: get_movie_embedding(x['title'], x['genres'], glove_embeddings), axis=1)




In [18]:
merged_movies_df.head()

Unnamed: 0,movieId,title,genres,year,imdbId,tmdbId,rating,embedding
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,114709,862.0,3.9,"[ 0.21296571, 0.1978743 ,-0.5870337 , 0.082385..."
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,113497,8844.0,3.31,"[ 0.3024645 , 0.3294625 ,-0.670344 ,-0.192972..."
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,113228,15602.0,3.17,"[-0.37820202, 0.8016621 ,-0.59272003,-0.482191..."
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,114885,31357.0,2.94,"[ 0.23999332, 0.37431288,-0.5244517 ,-0.505686..."
4,5,Father of the Bride Part II,[Comedy],1995,113041,11862.0,3.13,"[ 0.4535151 , 0.6201843 ,-0.62360257,-0.300914..."


In [15]:

merged_movies_df['embedding'] = merged_movies_df['embedding'].apply(lambda x: np.array2string(x, precision=8, separator=',', suppress_small=True))
merged_movies_df.to_csv('movies_with_embeddings.csv', index=False)



In [16]:
def recommend_similar_movies(movie_title, movies_df, top_n=5):
    target_embedding = movies_df[movies_df['title'] == movie_title]['embedding'].iloc[0]
    similarities = cosine_similarity([target_embedding], np.vstack(movies_df['embedding'].values))[0]
    top_indices = np.argsort(similarities)[-top_n:][::-1]
    return movies_df.iloc[top_indices]['title'].tolist()


In [17]:
recommend_similar_movies('Father of the Bride Part II', merged_movies_df)

ValueError: could not convert string to float: '[ 0.4535151 , 0.6201843 ,-0.62360257,-0.30091432, 0.5028237 , 0.5409059 ,\n -0.26138285,-0.05695986,-0.3333929 ,-0.12669578, 0.136112  , 0.35680285,\n -0.26116574,-0.15284912, 0.5109149 ,-0.07911557,-0.16615786, 0.22539444,\n -0.48575857, 0.28620797, 0.2710984 , 0.09701999,-0.13576557, 0.17128958,\n  0.01381813,-1.6267573 ,-0.86243856,-0.2553256 , 0.06073929, 0.02942572,\n  2.6129744 ,-0.03390387,-0.25629288,-0.18809445, 0.1861656 , 0.20640947,\n  0.04714314,-0.08321714,-0.23183987,-0.13672258,-0.10019915, 0.2713957 ,\n -0.29145172,-0.43728787,-0.23520286, 0.01907142,-0.15873909,-0.80455005,\n -0.226473  ,-0.05866243]'