In [1]:
import pandas as pd
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
ratings = pd.read_csv('https://raw.githubusercontent.com/BheemisettySaiHarsha/PRML_project/main/PRML%20DATASET/ratings.csv')
movies = pd.read_csv('https://raw.githubusercontent.com/BheemisettySaiHarsha/PRML_project/main/PRML%20DATASET/movies.csv')
tags = pd.read_csv('https://raw.githubusercontent.com/BheemisettySaiHarsha/PRML_project/main/PRML%20DATASET/tags.csv')
links = pd.read_csv('https://raw.githubusercontent.com/BheemisettySaiHarsha/PRML_project/main/PRML%20DATASET/links.csv')

# Drop timestamp column from tags
tags.drop('timestamp', axis=1, inplace=True)

# Combine tags for each user-movie pair
tags_combined = tags.groupby(['userId', 'movieId'])['tag'].apply(', '.join).reset_index()

# Merge movies with combined tags
dataframe = pd.merge(movies, tags_combined, on='movieId', how='left')

# Convert genres column to strings separated by commas
dataframe['genres'] = dataframe['genres'].apply(lambda x: ','.join(x.split('|')))

# Fill missing tags with empty string
dataframe['tag'] = dataframe['tag'].fillna('')

# Combine tags and genres into a single 'tags' column
dataframe['tags'] = dataframe['tag'] + ', ' + dataframe['genres']

# Drop unnecessary columns
new = dataframe.drop(columns=['tag', 'genres'])

# Vectorize 'tags' column
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(new['tags']).toarray()

# Calculate cosine similarity
similarity = cosine_similarity(vector)

# Define API key
TMDB_API_KEY = '3b6d898137728d7df661e5ffe4934beb'

# Function to get TMDb poster URL
def get_tmdb_poster(tmdb_id):
    response = requests.get(f'https://api.themoviedb.org/3/movie/{tmdb_id}?api_key={TMDB_API_KEY}')
    data = response.json()
    poster_path = data.get('poster_path')
    poster_url = f"https://image.tmdb.org/t/p/w500{poster_path}"
    return poster_url

# Function to recommend movies
def recommend(u_id):
    # Filter movies with ratings greater than or equal to 3 for the given user ID
    high_rated_movies = ratings[(ratings['userId'] == u_id) & (ratings['rating'] >= 3)]

    if not high_rated_movies.empty:
        # Select a random high-rated movie
        movie_id = high_rated_movies.sample(n=1)['movieId'].iloc[0]
    else:
        # If no high-rated movies, select a random movie
        movie_id = ratings['movieId'].sample(n=1).iloc[0]

    # Get the index of the movie in the DataFrame 'new'
    index = new[new['movieId'] == movie_id].index[0]

    # Find similar movies
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])

    # Print recommended movies
    for i in distances[2:7]:
        movie_title = new.iloc[i[0]]['title']
        tmdb_id = links.loc[links['movieId'] == movie_id, 'tmdbId'].values[0]
        tmdb_poster_url = get_tmdb_poster(tmdb_id)
        print("Title:", movie_title)
        print("TMDb Poster URL:", tmdb_poster_url)
        print()

# Example usage
recommend(1)


Title: Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
TMDb Poster URL: https://image.tmdb.org/t/p/w500/euypWkaYFOLW3e5rLIcTAjWnhhT.jpg

Title: Hate (Haine, La) (1995)
TMDb Poster URL: https://image.tmdb.org/t/p/w500/euypWkaYFOLW3e5rLIcTAjWnhhT.jpg

Title: Young Poisoner's Handbook, The (1995)
TMDb Poster URL: https://image.tmdb.org/t/p/w500/euypWkaYFOLW3e5rLIcTAjWnhhT.jpg

Title: New Jersey Drive (1995)
TMDb Poster URL: https://image.tmdb.org/t/p/w500/euypWkaYFOLW3e5rLIcTAjWnhhT.jpg

Title: Jason's Lyric (1994)
TMDb Poster URL: https://image.tmdb.org/t/p/w500/euypWkaYFOLW3e5rLIcTAjWnhhT.jpg



In [2]:
import pickle

# Save the cosine similarity matrix
with open('cosine_similarity_model.pkl', 'wb') as file:
    pickle.dump(similarity, file)
