<a href="https://colab.research.google.com/github/Arajesh03/Arajesh03/blob/main/Build_a_Movie_Recommendation_System_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Pip install

In [None]:
!pip install pandas



In [None]:
!pip install regex



In [None]:
!pip install scikit-learn



In [None]:
!pip install ipywidgets

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


#### Data Prep


In [None]:
# Import important libraries
import re
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.feature_extraction.text import TfidfVectorizer # Tfidf matrix
from sklearn.metrics.pairwise import cosine_similarity # Cosine similarity

In [None]:
# Load the movie metadata (movieId, title, genres) from a file next to the notebook
movies_df = pd.read_csv('movies.csv')

In [None]:
# Function to remove any character from title that is not a letter, digit, or space.
def clean_title(title: str) -> str:
    """Return ``title`` reduced to ASCII letters, digits, and spaces.

    Accented characters are folded to their base letter first: NFKD
    decomposition splits "é" into "e" plus a combining mark, and the mark is
    then removed by the filter. "Amélie" therefore becomes "Amelie" instead of
    losing the letter altogether. Pure-ASCII input is unaffected by the
    normalization step.

    Parameters
    ----------
    title : str
        Raw movie title or search query.

    Returns
    -------
    str
        The title containing only ``a-z``, ``A-Z``, ``0-9`` and spaces.
    """
    decomposed = unicodedata.normalize("NFKD", title)
    return re.sub("[^a-zA-Z0-9 ]", "", decomposed)

In [None]:
# Add a `clean_title` column holding each title passed through clean_title,
# then display the frame to check the result
movies_df['clean_title'] = movies_df['title'].apply(clean_title)
movies_df

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [None]:
# Creating Tfidf matrix
# Initializing the vectorizer; ngram_range=(1,2) indexes single words and adjacent word pairs
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Creating the tfidf matrix from the cleaned titles (one row per title)
tfidf = vectorizer.fit_transform(movies_df['clean_title'])

In [None]:
# (number of titles, number of distinct 1- and 2-word terms)
tfidf.shape

(62423, 170073)

#### Cosine Similarity
- Cosine similarity measures how "close" two documents are by the angle between their high-dimensional feature vectors, ignoring the vectors' absolute lengths.
- A score of 1 means the vectors point in exactly the same direction.
- A score of 0 means they're unrelated (the vectors share no terms).
- A score of -1 means exactly opposite. TF-IDF weights are never negative, so the scores in this notebook always fall between 0 and 1.


In [None]:
# Search function which returns the top n most similar movie titles in df to the given query string
def search(query, movies_df=movies_df, tfidf=tfidf, top_n=10):
    """Return the titles most similar to ``query``, best match first.

    The query is cleaned with ``clean_title`` and vectorized with the
    module-level ``vectorizer``, so ``tfidf`` must be the matrix that same
    vectorizer produced, and its rows must line up positionally with the rows
    of ``movies_df``. The ``movies_df`` and ``tfidf`` defaults are bound to
    whatever those names hold when this cell is executed.

    Parameters
    ----------
    query : str
        Free-text title to look for.
    movies_df : pandas.DataFrame
        Movie table with a ``title`` column.
    tfidf : sparse matrix
        TF-IDF matrix with one row per row of ``movies_df``.
    top_n : int, default 10
        Maximum number of matches to return; zero or less returns no rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``score`` (cosine similarity), keeping the row
        labels of ``movies_df``. Rows scoring 0 are left out, so the frame is
        empty when the query shares no term with any title.
    """
    # Clean and vectorize the query the same way the titles were processed
    q_vec = vectorizer.transform([clean_title(query)])

    # Cosine similarity between the query and every title
    similarity = cosine_similarity(q_vec, tfidf).flatten()

    # Positions ordered by descending similarity. Slicing after the reversal
    # (instead of argsort()[-top_n:]) stops top_n=0 from selecting every row.
    ranked = similarity.argsort()[::-1][:max(top_n, 0)]

    # A score of 0 means no shared term, i.e. not a match at all
    top_idx = ranked[similarity[ranked] > 0]

    # Return the matching titles together with their scores
    results = movies_df.iloc[top_idx][['title']].copy()
    results['score'] = similarity[top_idx]
    return results



In [None]:
# Sanity check: search for a known title keyword
search('Avengers',movies_df,tfidf)

Unnamed: 0,title,score
34536,3 Avengers (1964),0.581033
17067,"Avengers, The (2012)",0.546731
2063,"Avengers, The (1998)",0.531099
40636,Shaolin Avengers (1994),0.438559
45394,Ultimate Avengers 2 (2006),0.436821
30431,Avengers Grimm (2015),0.43394
35372,Masked Avengers (1981),0.419307
40637,The Shaolin Avengers (1976),0.386119
25067,Avengers: Infinity War - Part I (2018),0.334839
25058,Avengers: Age of Ultron (2015),0.333186


#### Widget - Search bar
- Using ipywidgets

In [None]:
# Input text: the box the user types a movie title into
search_box = widgets.Text(
    value="",
    placeholder="Search...",
    description="Search:",
    layout=widgets.Layout(width="60%"),
    disabled=False
)

# Output: the area on_search renders its results into
output_box = widgets.Output()

def on_search(change):
    """Observer callback: show search results for the text box's new value.

    Parameters
    ----------
    change : dict
        Change notification from ``search_box.observe``; ``change['new']``
        holds the current text.
    """
    with output_box:
        query = change['new'].strip()
        if not query:
            # clear_output(wait=True) only clears once new output arrives, so
            # with nothing to display the previous results would stay on
            # screen. Clear immediately when the box is emptied.
            clear_output()
            return

        # wait=True swaps old results for new ones without a flicker
        clear_output(wait=True)
        results = search(query,movies_df,tfidf)
        display(results)


# Call on_search every time the text box's `value` changes
search_box.observe(on_search, names='value')

# Display the widgets
display(search_box, output_box)

Text(value='', description='Search:', layout=Layout(width='60%'), placeholder='Search...')

Output()

#### Ratings - second half of recommendation

In [None]:
# Load the user ratings (userId, movieId, rating, timestamp) from a file next to the notebook
ratings_df = pd.read_csv('ratings.csv')

In [None]:
# Peek at the first rows to check the columns
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [None]:
# Example movie, identified by its movieId, and its row in movies_df
movie_id = 89745
movie = movies_df.loc[movies_df['movieId'].eq(movie_id)]
movie

Unnamed: 0,movieId,title,genres
17067,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX


In [None]:
# Similar users: everyone who rated the example movie 4 or higher
similar_users = ratings_df.loc[
    (ratings_df['movieId'] == movie_id) & (ratings_df['rating'] >= 4),
    'userId',
].unique()

n_sim = len(similar_users)
print(f"Found {n_sim} similar users.")

Found 10502 similar users.


In [None]:
# Every rating the similar users gave to movies other than the example movie
sub = ratings_df.loc[
    ratings_df['userId'].isin(similar_users)
    & ratings_df['movieId'].ne(movie_id)
]

In [None]:
# For each movieId, count how many of the similar users rated it >= 4
liked_counts = (
    sub.loc[sub['rating'].ge(4)]
    .groupby('movieId')['userId']
    .nunique()
)

# Share of the similar users who liked each movie
rec_frac = liked_counts.div(n_sim)

In [None]:
# drop the original movie if it slipped in
# (errors='ignore' makes this a no-op when movie_id is not in the index)
rec_frac = rec_frac.drop(index=movie_id, errors='ignore')

# keep only movies liked by >10% of these users, most liked first
popular = rec_frac[rec_frac > 0.10].sort_values(ascending=False)

print(popular)

movieId
58559     0.708627
79132     0.684536
2571      0.665968
59315     0.655780
7153      0.626262
            ...   
38061     0.101504
61132     0.101219
88810     0.101028
4720      0.100743
103042    0.100457
Name: userId, Length: 369, dtype: float64


In [None]:
# List of recommended movie IDs
rec_movies = popular.index.tolist()

In [None]:
# Ratings of >= 4 given by any user to one of the recommended movies
rating_rec = ratings_df.loc[
    ratings_df['movieId'].isin(rec_movies)
    & ratings_df['rating'].ge(4)
]

users_who_rated = rating_rec['userId'].unique()
print(f"Users who liked any recommended movies: {len(users_who_rated)}")

Users who liked any recommended movies: 159933


In [None]:
# Array of the distinct user ids in ratings_df; its length is the total number of users
total_users = ratings_df.userId.unique()
print(f"Total number of users: {len(total_users)}")

Total number of users: 162541


In [None]:
# Number of distinct users who liked each recommended movie
user_counts = rating_rec.groupby('movieId')['userId'].nunique()

In [None]:
# The percentage of all users who liked each recommended movie
user_per = (user_counts / len(total_users)) * 100

In [None]:
# One-column table of the percentages, most widely liked movie first
rec_stats = (
    user_per
    .to_frame(name='user_per')
    .sort_values('user_per', ascending=False)
)

In [None]:
# Show the percentage table
print(rec_stats)

          user_per
movieId           
318      43.382285
296      38.400157
356      36.221630
593      35.664232
2571     34.294117
...            ...
103228    1.192315
122906    1.149864
122914    1.148633
106072    1.122178
103042    0.906233

[369 rows x 1 columns]


#### Recommendation Score

In [None]:
# Fraction of the similar users who liked each movie (same object as rec_frac)
similar_rec = rec_frac

In [None]:
# Fraction of all users who liked each recommended movie
all_rec = user_counts / len(total_users)

In [None]:
# Put the two fractions side by side, aligned on movieId. concat's default
# outer join leaves NaN where a movie appears in only one of the two series.
rec_per = pd.concat([similar_rec, all_rec], axis=1)
rec_per.columns = ['similar', 'all']

In [None]:
# A ratio above 1 means the similar users like the movie more often than users overall
rec_per['score'] = rec_per['similar'] / rec_per['all']

In [None]:
# Highest score first (sort_values places NaN scores last by default)
rec_per = rec_per.sort_values('score', ascending=False)

In [None]:
# Top 10 movie recommendations
# (the rename only matters if the index is unnamed, in which case reset_index calls the column 'index')
top10 = rec_per.head(10).reset_index().rename(columns={'index': 'movieId'})

# Attach title and genres; how='left' keeps every recommendation row even without a metadata match
top10 = top10.merge(movies_df[['movieId','title','genres']], on='movieId', how='left')

print(top10[['movieId','title','genres','similar','all','score']])

   movieId                                       title  \
0   122892              Avengers: Age of Ultron (2015)   
1   106072                 Thor: The Dark World (2013)   
2   102125                           Iron Man 3 (2013)   
3   110102  Captain America: The Winter Soldier (2014)   
4   122920           Captain America: Civil War (2016)   
5    88140   Captain America: The First Avenger (2011)   
6   122900                              Ant-Man (2015)   
7   103042                         Man of Steel (2013)   
8    86332                                 Thor (2011)   
9    95510              Amazing Spider-Man, The (2012)   

                                 genres   similar       all      score  
0               Action|Adventure|Sci-Fi  0.289088  0.022511  12.841929  
1         Action|Adventure|Fantasy|IMAX  0.142354  0.011222  12.685491  
2           Action|Sci-Fi|Thriller|IMAX  0.274424  0.022653  12.114378  
3          Action|Adventure|Sci-Fi|IMAX  0.356504  0.030054  11.86211

#### Create a function - returns the top 10 recommendations

In [None]:
def recommend_niche(movie_id, ratings_df, movies_df,
                    like_threshold=4.0, min_frac=0.1, top_n=10):
    """Recommend movies that fans of ``movie_id`` like unusually often.

    A "fan" is a user who rated ``movie_id`` at least ``like_threshold``.
    Each other movie is scored as

        (share of fans who liked it) / (share of all users who liked it)

    so a score above 1 means the movie is liked more often by fans than by
    users overall.

    Parameters
    ----------
    movie_id : int
        ``movieId`` value of the target movie (not a DataFrame row label).
    ratings_df : pandas.DataFrame
        Ratings with ``userId``, ``movieId`` and ``rating`` columns.
    movies_df : pandas.DataFrame
        Movie metadata with ``movieId``, ``title`` and ``genres`` columns.
    like_threshold : float, default 4.0
        Minimum rating that counts as "liked".
    min_frac : float, default 0.1
        Minimum share of fans that must like a movie for it to be considered.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title``, ``genres``, highest score first. Empty
        when the target movie has no fans or no movie reaches ``min_frac``.
    """
    no_result = pd.DataFrame(columns=['score','title','genres'])

    # Only "liked" ratings matter for every step below, so filter once
    liked = ratings_df.loc[ratings_df['rating'] >= like_threshold]

    # 1) Who liked the target movie?
    sim_users = liked.loc[liked['movieId'] == movie_id, 'userId'].unique()
    n_sim = len(sim_users)
    if n_sim == 0:
        return no_result

    # 2) Fraction of those users who liked each OTHER movie
    sim_likes = liked.loc[
        liked['userId'].isin(sim_users) &
        (liked['movieId'] != movie_id)
    ].groupby('movieId')['userId'].nunique()
    frac_sim = sim_likes / n_sim
    frac_sim = frac_sim[frac_sim >= min_frac]

    if frac_sim.empty:
        return no_result

    # 3) Fraction of ALL users who liked each candidate movie. Restricting to
    #    the candidates avoids aggregating over movies that cannot be returned.
    total_users = ratings_df['userId'].nunique()
    all_likes = liked.loc[
        liked['movieId'].isin(frac_sim.index)
    ].groupby('movieId')['userId'].nunique()
    frac_all = all_likes / total_users

    # 4) Combine and compute the niche score
    rec_per = pd.concat(
        [frac_sim.rename('similar'), frac_all.rename('all')],
        axis=1, join='inner'
    )
    rec_per['score'] = rec_per['similar'] / rec_per['all']

    # 5) Top-N and merge titles+genres. rename_axis guarantees the key column
    #    is called 'movieId' regardless of what name the index carried.
    top = (
        rec_per
        .sort_values('score', ascending=False)
        .head(top_n)
        .rename_axis('movieId')
        .reset_index()
    )
    top = top.merge(
        movies_df[['movieId','title','genres']],
        on='movieId',
        how='left'
    )

    return top[['score','title','genres']]



In [None]:
# recommend_niche expects a movieId. 122892 is the movieId of
# "Avengers: Age of Ultron (2015)"; 25058 is only that title's row label in
# the search results, and passing it here returned an empty frame.
movie_id = 122892
results    = recommend_niche(movie_id, ratings_df, movies_df)
print(results)

Empty DataFrame
Columns: [score, title, genres]
Index: []


#### Interactive Recommendation System

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output



# Text box for the title to base recommendations on (rebinds the earlier `search_box` name)
search_box = widgets.Text(
    value='',
    placeholder='Type a movie title…',
    description='Search:',
    layout=widgets.Layout(width='70%')
)
# Output area that on_search_change prints and displays into
output = widgets.Output()

def on_search_change(change):
    """Observer callback: recommend movies based on the best title match.

    Searches for the text box's new value, takes the top-ranked title, and
    displays ``recommend_niche`` results for that movie inside ``output``.

    Parameters
    ----------
    change : dict
        Change notification from ``search_box.observe``; ``change['new']``
        holds the current text.
    """
    with output:
        clear_output()
        query = change['new'].strip()
        if not query:
            return

        # call your search function using exactly the names it needs:
        matches = search(query, movies_df, tfidf)
        if matches.empty:
            print("No matching titles found.")
            return

        # `matches` keeps movies_df's row labels, which are NOT movieIds
        # (row 17067 holds movieId 89745, for example). Look up the real id,
        # otherwise recommend_niche is asked about the wrong movie.
        best_row = matches.index[0]
        movie_id = movies_df.loc[best_row, 'movieId']
        title    = matches.iloc[0]['title']
        print(f"Top match: {title} (movieId={movie_id})\n")

        recs = recommend_niche(movie_id, ratings_df, movies_df)
        if recs.empty:
            print("No niche recommendations found.")
        else:
            display(recs)

# Call on_search_change every time the text box's `value` changes, then show both widgets
search_box.observe(on_search_change, names='value')
display(search_box, output)

Text(value='', description='Search:', layout=Layout(width='70%'), placeholder='Type a movie title…')

Output()