<a href="https://colab.research.google.com/github/FathimaHusna/FathimaHusna/blob/main/MovieRecommendationSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Connect the Google Drive

In [None]:
# Colab-only: mount Google Drive so the CSV files under /content/drive/MyDrive can be read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Import libraries

In [None]:
import pandas as pd


Load the movies dataset and preview the first rows

In [None]:
# Load the movies table from the mounted Drive folder.
Movie_df = pd.read_csv("/content/drive/MyDrive/Data Mining/ml-25m/movies.csv")
# Preview the first rows as the cell output.
Movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Get the total number of rows and columns in the dataset.

In [None]:
# (number of rows, number of columns)
Movie_df.shape

(62423, 3)

Get the summary information about the dataset's structure, including the number of non-null entries, data types, and memory usage

In [None]:
# Prints per-column non-null counts and dtypes plus the frame's memory usage.
Movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


Count the duplicate rows

In [None]:
# Number of rows that repeat an earlier row in every column.
Movie_df.duplicated().sum()

0

Count the null data

In [None]:
# Missing-value count for each column.
Movie_df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

Clean title

In [None]:

import re  # regular expressions, used to strip unwanted characters from titles


def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    The same normalisation is applied to the stored titles and to the user's
    query, so punctuation such as parentheses around the year never affects
    matching. Characters outside ``a-z``, ``A-Z``, ``0-9`` and the space
    (including accented letters) are dropped, not replaced.
    """
    return re.sub("[^a-zA-Z0-9 ]", "", title)



In [None]:
# Store the normalised title next to the original; re-running the cell simply recomputes the column.
Movie_df["clean_title"] = Movie_df["title"].apply(clean_title)


In [None]:
# Inspect the cleaned titles (pandas abbreviates the long Series in the output).
Movie_df["clean_title"]

0                          Toy Story 1995
1                            Jumanji 1995
2                   Grumpier Old Men 1995
3                  Waiting to Exhale 1995
4        Father of the Bride Part II 1995
                       ...               
62418                             We 2018
62419             Window of the Soul 2001
62420                      Bad Poems 2018
62421                   A Girl Thing 2001
62422         Women of Devils Island 1962
Name: clean_title, Length: 62423, dtype: object

Create a TFIDF matrix

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer #to turn titles into numbers
vectorizer = TfidfVectorizer(ngram_range=(1,2)) # features are single words AND pairs of consecutive words

# Learn the vocabulary from the cleaned titles and encode every title as one row of the TF-IDF matrix.
tfidf = vectorizer.fit_transform(Movie_df["clean_title"])


Creating a search function

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def search(title, top_n=5):
    """Return the ``top_n`` movies whose titles best match ``title``.

    The query is normalised with ``clean_title``, vectorised with the fitted
    module-level ``vectorizer`` and compared against every row of the
    module-level ``tfidf`` matrix using cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``Movie_df`` ordered from most to least similar, so
        ``.iloc[0]`` is always the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises if kth is out of bounds.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return Movie_df.iloc[[]]

    # argpartition only guarantees that the last top_n positions hold the
    # largest scores -- in arbitrary order -- so rank them explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return Movie_df.iloc[ranked]


Interactive search box

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the search query, pre-filled with 'Toy Story'.
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh ``movie_list`` with search results for the text box's new value.

    ``data`` is the change notification passed by the widget; its ``"new"``
    entry holds the current text. Queries of one character or less only
    clear the previous results.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 1:
            return
        display(search(query))

# Run on_type every time the text box's value changes.
movie_input.observe(on_type, names='value')


# Show the text box with the results area below it.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

Movie rating data

In [None]:
# Load the user ratings table from the mounted Drive folder.
Ratings_df = pd.read_csv("/content/drive/MyDrive/Data Mining/ml-25m/ratings.csv")
# Preview the first rows as the cell output.
Ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [None]:
# Column dtypes of the ratings table.
Ratings_df.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

Users who liked the same movie

In [None]:
# Example movieId (1 is "Toy Story (1995)" in the preview above) for manual experiments.
# NOTE(review): no later visible cell reads this global; find_similar_movies takes the id as a parameter.
movie_id = 1




Recommendation Function

In [None]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    Reads the module-level ``Ratings_df`` and ``Movie_df`` frames.

    Steps:
      1. "Similar users" are those who rated ``movie_id`` above ``min_rating``.
      2. For every movie, compute the share of similar users who also rated
         it above ``min_rating``; keep movies whose share exceeds ``min_share``.
      3. For those movies, compute the per-movie share of likes among all
         users who liked at least one of the kept movies.
      4. ``score = similar share / overall share``; a higher score means the
         movie is liked by similar users more often than by users in general.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the seed movie.
    min_rating : float, default 4
        A rating counts as a "like" only when strictly greater than this.
    min_share : float, default 0.10
        Minimum fraction of similar users that must have liked a movie.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows with columns ``score``, ``title`` and ``genres``,
        ordered by descending score.
    """
    # Boolean "liked" mask over the whole ratings table, built once and reused
    # by all three filters instead of being recomputed for each.
    liked = Ratings_df["rating"] > min_rating

    similar_users = Ratings_df[(Ratings_df["movieId"] == movie_id) & liked]["userId"].unique()
    similar_user_recs = Ratings_df[Ratings_df["userId"].isin(similar_users) & liked]["movieId"]

    # Fraction of similar users who liked each movie, keeping only the common ones.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Same fraction, but measured over every user who liked any candidate movie.
    all_users = Ratings_df[Ratings_df["movieId"].isin(similar_user_recs.index) & liked]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(Movie_df, left_index=True, right_on="movieId")[["score", "title", "genres"]]

Interactive Recommendation Widget

In [None]:


import ipywidgets as widgets
from IPython.display import display

# Text box for the movie to base recommendations on, pre-filled with 'Toy Story'.
Movie_Name_Input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that the recommendation table is rendered into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh ``recommendation_list`` with recommendations for the typed title.

    ``data`` is the change notification passed by the widget; its ``"new"``
    entry holds the current text. Titles of five characters or fewer only
    clear the previous output. Otherwise the first row returned by ``search``
    is used as the seed movie for ``find_similar_movies``.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        matches = search(query)
        seed_movie_id = matches.iloc[0]["movieId"]
        display(find_similar_movies(seed_movie_id))

# Run on_type every time the text box's value changes.
Movie_Name_Input.observe(on_type, names='value')

# Show the text box with the recommendations area below it.
display(Movie_Name_Input, recommendation_list)



Text(value='Toy Story', description='Movie Title:')

Output()