In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer



# Load the datasets
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Extract the year from the movie titles
movies['year'] = movies['title'].str.extract('\((\d{4})\)', expand=False)

# Remove the year from the movie titles
movies['title'] = movies['title'].str.replace('\(\d{4}\)', '')

# Convert the year column to integers
movies['year'] = pd.to_numeric(movies['year'])

# Display the updated dataframe
print(movies.head())


   movieId                         title  \
0        1                    Toy Story    
1        2                      Jumanji    
2        3             Grumpier Old Men    
3        4            Waiting to Exhale    
4        5  Father of the Bride Part II    

                                        genres    year  
0  Adventure|Animation|Children|Comedy|Fantasy  1995.0  
1                   Adventure|Children|Fantasy  1995.0  
2                               Comedy|Romance  1995.0  
3                         Comedy|Drama|Romance  1995.0  
4                                       Comedy  1995.0  


  movies['title'] = movies['title'].str.replace('\(\d{4}\)', '')


In [21]:

# Load the dataset
df = pd.read_csv('ratings.csv')

# Check for duplicates
duplicates = df[df.duplicated()]

if len(duplicates) > 0:
    print('There are duplicates in the dataset.')
else:
    print('The dataset has no duplicates.')


The dataset has no duplicates.


In [22]:
ratings = pd.read_csv('ratings.csv')

# Calculate mean rating for each movie
movie_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()

# Print the resulting dataframe
print(movie_ratings.head())

   movieId    rating
0        1  3.893708
1        2  3.251527
2        3  3.142028
3        4  2.853547
4        5  3.058434


In [24]:
# Merge the mean ratings with the movies dataframe


print(movies)

       movieId                         title  \
0            1                    Toy Story    
1            2                      Jumanji    
2            3             Grumpier Old Men    
3            4            Waiting to Exhale    
4            5  Father of the Bride Part II    
...        ...                           ...   
62418   209157                           We    
62419   209159           Window of the Soul    
62420   209163                    Bad Poems    
62421   209169                 A Girl Thing    
62422   209171      Women of Devil's Island    

                                            genres    year  
0      Adventure|Animation|Children|Comedy|Fantasy  1995.0  
1                       Adventure|Children|Fantasy  1995.0  
2                                   Comedy|Romance  1995.0  
3                             Comedy|Drama|Romance  1995.0  
4                                           Comedy  1995.0  
...                                            ...     ..

In [None]:
merged_df = merged_df.drop(columns=['rating_x', 'rating_y'])


In [25]:
# Preprocess genres column
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))

# Split dataset into training and testing sets
train_df, test_df = train_test_split(movies, test_size=0.2, random_state=42)

# Compute TF-IDF scores for genres
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(train_df['genres'].apply(lambda x: ' '.join(x)))

# Calculate cosine similarity scores for movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Define function to recommend similar movies
def get_recommendations(title, n=10, genre=None, rating=None, year=None):
    # Find index of movie
    idx = movies[movies['title'] == title].index[0]

    # Calculate cosine similarity scores for all movies
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort movies by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get top n similar movies
    sim_scores = sim_scores[1:n+1]

    # Get movie indices and filter based on user preferences
    movie_indices = [i[0] for i in sim_scores]
    if genre:
        movie_indices = [i for i in movie_indices if genre in movies.iloc[i]['genres']]
    if rating:
        movie_indices = [i for i in movie_indices if movies.iloc[i]['rating'] >= rating]
    if year:
        movie_indices = [i for i in movie_indices if str(year) in movies.iloc[i]['title']]

    # Return recommended movies
    return movies.iloc[movie_indices][['title', 'genres', 'rating', 'year']]
