<a href="https://colab.research.google.com/github/Alirezarahhmati/Movie_Recommender_System/blob/Develop/app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Recommendation System

## Load Datasets

In [1]:
from google.colab import files

files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"alirezarahhmati","key":"73be1ed3ad085330a3f4400239645139"}'}

In [2]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [3]:
! kaggle datasets download -d rounakbanik/the-movies-dataset

Downloading the-movies-dataset.zip to /content
 96% 218M/228M [00:01<00:00, 124MB/s]
100% 228M/228M [00:01<00:00, 132MB/s]


In [4]:
! unzip /content/the-movies-dataset.zip -d /content/the_movies_dataset

Archive:  /content/the-movies-dataset.zip
  inflating: /content/the_movies_dataset/credits.csv  
  inflating: /content/the_movies_dataset/keywords.csv  
  inflating: /content/the_movies_dataset/links.csv  
  inflating: /content/the_movies_dataset/links_small.csv  
  inflating: /content/the_movies_dataset/movies_metadata.csv  
  inflating: /content/the_movies_dataset/ratings.csv  
  inflating: /content/the_movies_dataset/ratings_small.csv  


## Content Based

In [1]:
import pandas as pd

df1 = pd.read_csv('/content/the_movies_dataset/movies_metadata.csv')
df2 = pd.read_csv('/content/the_movies_dataset/credits.csv')
df3 = pd.read_csv('/content/the_movies_dataset/keywords.csv')

  df1 = pd.read_csv('/content/the_movies_dataset/movies_metadata.csv')


In [2]:
# Convert the 'id' column in both datasets to a common data type if needed
df1['id'] = df1['id'].astype(str)
df2['id'] = df2['id'].astype(str)
df3['id'] = df3['id'].astype(str)

# Concatenate the datasets based on the 'id' column
df2 = pd.merge(df1, df2, on='id')

df2 = pd.merge(df2, df3, on='id')

In [3]:
df2 = df2.dropna(subset = ['cast', 'crew', 'genres', 'title', 'keywords', 'overview'])

In [4]:
!pip install datasketch



In [5]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasketch import MinHash, MinHashLSHForest

In [6]:
# Parse the stringified features into their corresponding python objects
features = ['cast', 'crew', 'genres', 'keywords']
for feature in features:
    df2[feature] = df2[feature].apply(literal_eval)

In [7]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

def get_list(x):
    if isinstance(x, list):
        names = []
        for item in x:
            if isinstance(item, dict) and 'name' in item:
                names.append(item['name'])
        if len(names) > 3:
            names = names[:3]
        return names
    return []

def clean_data(x):
    if isinstance(x, list):
        cleaned_list = []
        for i in x:
            if isinstance(i, str):
                cleaned_list.append(str.lower(i.replace(" ", "")))
        return cleaned_list
    elif isinstance(x, str):
        return str.lower(x.replace(" ", ""))
    else:
        return ''

In [10]:
def scale_popularity(x):
    if isinstance(x, int):
        return np.log(1 + x)  # Apply logarithmic transformation to integer values
    elif isinstance(x, str):
        try:
            x = int(x)  # Convert string to integer
            return np.log(1 + x)
        except ValueError:
            return 0  # Return 0 if the string cannot be converted to an integer
    return 0


In [11]:
df2['popularity_scaled'] = df2['popularity'].apply(scale_popularity)

In [12]:
# Define new director, cast, genres and keywords features that are in a suitable form.
df2['director'] = df2['crew'].apply(get_director)

features = ['cast', 'genres', 'keywords']
for feature in features:
    df2[feature] = df2[feature].apply(get_list)

In [13]:
# Apply data cleaning functions
features = ['cast', 'genres', 'director', 'keywords']
for feature in features:
    df2[feature] = df2[feature].apply(clean_data)

In [14]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres']) + ' ' + ' '.join(x['production_companies']) + ' ' + str(x['popularity_scaled'])
df2['soup'] = df2.apply(create_soup, axis=1)

In [15]:
# Initialize CountVectorizer and create the count matrix
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])

In [16]:
# Compute MinHash signatures for each document
minhashes = []
for i in range(count_matrix.shape[0]):
    minhash = MinHash(num_perm=128)
    for j in count_matrix[i].indices:
        minhash.update(str(j).encode('utf-8'))
    minhashes.append(minhash)

In [17]:
# Build the LSH Forest index
forest = MinHashLSHForest(num_perm=128)
for i, minhash in enumerate(minhashes):
    forest.add(i, minhash)
forest.index()

In [18]:
# Function to find similar movies using LSH
def get_recommendations(title1, title2, title3, num_results):
    # Compute the MinHash signaturefor the query movie
    query_minhash = MinHash(num_perm=128)
    query_matrix = count.transform([title1, title2, title3])
    for j in query_matrix.indices:
        query_minhash.update(str(j).encode('utf-8'))

    # Query the LSH Forest for similar movies
    result_indices = forest.query(query_minhash, num_results)

    # Return the titles of the similar movies
    return df2['title'].iloc[result_indices]

In [19]:
# Example usage
get_recommendations('Inception', 'Interstellar', 'Shutter Island', 10)

24171                          Boom!
38630       And Then There Were None
38933              Giovanni's Island
44485                Planet Earth II
39365          The Angry Birds Movie
41501      Under The Sign Of Scorpio
17215                Love & Savagery
11888                  The Condemned
39769                    The Sea Bat
21856    Princess Protection Program
Name: title, dtype: object

## Collaborative Filtering

In [9]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import train_test_split


In [10]:
# Assuming your dataset is stored in a CSV file named 'ratings.csv'
data = pd.read_csv('/content/the_movies_dataset/ratings.csv')

# Define the reader object for Surprise library
reader = Reader(rating_scale=(1, 5))

# Load the dataset from pandas DataFrame using the reader object
dataset = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

In [11]:
# Split the dataset into training and testing sets (80% for training, 20% for testing)
trainset, testset = train_test_split(dataset, test_size=0.2, random_state=42)


In [None]:
# Define the Collaborative Filtering algorithm (KNNBasic)
algo = KNNBasic()

# Train the model on the training set
algo.fit(trainset)


In [None]:
# Get predictions for the test set
predictions = algo.test(testset)

# Compute the accuracy or any other evaluation metric
accuracy = accuracy.rmse(predictions)


In [None]:
# Assuming you want to generate recommendations for a specific user with ID 1
user_id = 1

# Get the list of all movie IDs
movie_ids = dataset.df['movieId'].unique()

# Remove the movies that the user has already rated
rated_movies = dataset.df.loc[dataset.df['userId'] == user_id, 'movieId'].tolist()
available_movies = [movie_id for movie_id in movie_ids if movie_id not in rated_movies]

# Predict ratings for the user and get top recommendations
top_n = 10  # Number of recommendations to generate
user_predictions = [algo.predict(user_id, movie_id) for movie_id in available_movies]
top_recommendations = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:top_n]

# Print the top recommendations
for recommendation in top_recommendations:
    print(f"Movie ID: {recommendation.iid}, Estimated Rating: {recommendation.est}")
