# Movie Recommendation System

## Load the dataset

In [3]:
import pandas as pd

# Load the datasets
ratings_df = pd.read_csv('ratings.csv')
movies_df = pd.read_csv('movies.csv')
links_df = pd.read_csv('links.csv')
tags_df = pd.read_csv('tags.csv')

print("Ratings DataFrame:")
print(ratings_df.head())
print("\nMovies DataFrame:")
print(movies_df.head())
print("\nLinks DataFrame:")
print(links_df.head())

Ratings DataFrame:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Movies DataFrame:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Links DataFrame:
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        

### Exploring unique users and movies

In [5]:
print(f"Unique users: {ratings_df['userId'].nunique()}")
print(f"Unique movies: {ratings_df['movieId'].nunique()}")

Unique users: 610
Unique movies: 9724


In [6]:

# Function to print description
def describe_dataframe(df, name):
    print(f"\n📂 {name} DataFrame")
    print("-" * 40)
    print("Shape:", df.shape)
    print("Columns:", df.columns.tolist())
    print("\nSample rows:\n", df.head(), "\n")
    print("Null values:\n", df.isnull().sum(), "\n")
    print("="*60)

# Explore each dataset
describe_dataframe(ratings_df, "Ratings")
describe_dataframe(movies_df, "Movies")
describe_dataframe(tags_df, "Tags")
describe_dataframe(links_df, "Links")



📂 Ratings DataFrame
----------------------------------------
Shape: (100836, 4)
Columns: ['userId', 'movieId', 'rating', 'timestamp']

Sample rows:
    userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931 

Null values:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64 


📂 Movies DataFrame
----------------------------------------
Shape: (9742, 3)
Columns: ['movieId', 'title', 'genres']

Sample rows:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  

In [7]:
# Merge movies with tags
tags_df = tags_df.fillna("")
merged_df = pd.merge(movies_df, tags_df, on="movieId", how="left")

# Combine genres and tags into one "content" column
merged_df["content"] = merged_df["genres"] + " " + merged_df["tag"].fillna("")


## Text Vectorization (TF-IDF)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace NaN with empty strings
merged_df["content"] = merged_df["content"].fillna("")

tfidf = TfidfVectorizer(stop_words="english")
content_matrix = tfidf.fit_transform(merged_df["content"])

print("Content matrix shape:", content_matrix.shape)


Content matrix shape: (11853, 1677)


### Compute similarity 

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(content_matrix, content_matrix)


In [12]:
# Reset index for lookup
merged_df = merged_df.reset_index()

def recommend_movies(title, n=5):
    # Find index of movie
    idx = merged_df[merged_df['title'].str.contains(title, case=False, na=False)].index[0]
    
    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort movies by similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Pick top n similar movies (excluding the input movie itself)
    sim_scores = sim_scores[1:n+1]
    movie_indices = [i[0] for i in sim_scores]
    
    return merged_df[['title', 'genres', 'tag']].iloc[movie_indices]

# Example
print(recommend_movies("horror", 5))


                                   title                         genres  tag
1611       Amityville Horror, The (1979)  Drama|Horror|Mystery|Thriller  NaN
4446                 Santa Sangre (1989)  Drama|Horror|Mystery|Thriller  NaN
4486                  Others, The (2001)  Drama|Horror|Mystery|Thriller  NaN
4510              Wicker Man, The (1973)  Drama|Horror|Mystery|Thriller  NaN
5598  Tenant, The (Locataire, Le) (1976)  Drama|Horror|Mystery|Thriller  NaN


In [13]:
# Compute average rating per movie
rating_summary = ratings_df.groupby("movieId")["rating"].mean().reset_index()
rating_summary.columns = ["movieId", "avg_rating"]

# Merge with movies
merged_df = merged_df.merge(rating_summary, on="movieId", how="left")

# When sorting recommendations, boost by rating
def recommend_movies_with_ratings(title, n=5):
    idx = merged_df[merged_df['title'].str.contains(title, case=False, na=False)].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:50]  # take top 50 similar
    movie_indices = [i[0] for i in sim_scores]
    
    # Sort again by avg_rating for better results
    candidates = merged_df.iloc[movie_indices][["title", "genres", "avg_rating"]]
    return candidates.sort_values(by="avg_rating", ascending=False).head(n)

print(recommend_movies_with_ratings("Toy Story", 5))



                                             title  \
11642                  Wow! A Talking Fish! (1983)   
11618          Last Year's Snow Was Falling (1983)   
10634     Stuart Little 3: Call of the Wild (2005)   
9709                                 Presto (2008)   
11626  Vovka in the Kingdom of Far Far Away (1965)   

                                     genres  avg_rating  
11642     Animation|Children|Comedy|Fantasy         5.0  
11618     Animation|Children|Comedy|Fantasy         5.0  
10634     Animation|Children|Comedy|Fantasy         5.0  
9709      Animation|Children|Comedy|Fantasy         5.0  
11626  Adventure|Animation|Children|Fantasy         5.0  


In [14]:
print(merged_df.columns)


Index(['index', 'movieId', 'title', 'genres', 'userId', 'tag', 'timestamp',
       'content', 'avg_rating'],
      dtype='object')


In [15]:
links_df = pd.read_csv("links.csv")

# Merge movies with links (make sure movieId is the key)
merged_df = pd.merge(movies_df, links_df, on="movieId", how="left")

print(merged_df.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  imdbId   tmdbId  
0  Adventure|Animation|Children|Comedy|Fantasy  114709    862.0  
1                   Adventure|Children|Fantasy  113497   8844.0  
2                               Comedy|Romance  113228  15602.0  
3                         Comedy|Drama|Romance  114885  31357.0  
4                                       Comedy  113041  11862.0  


In [16]:
# Check missing values
print(merged_df["tmdbId"].isnull().sum())


8


In [17]:
merged_df = merged_df.dropna(subset=["tmdbId"])


In [18]:
merged_df["tmdbId"] = merged_df["tmdbId"].astype(int)


In [19]:
# Merge movies + tags
tags_df = tags_df.fillna("")
merged_df = pd.merge(movies_df, tags_df, on="movieId", how="left")
merged_df["content"] = merged_df["genres"] + " " + merged_df["tag"].fillna("")
merged_df["content"] = merged_df["content"].fillna("")

# Merge links (do this BEFORE TF-IDF or any other operations)
merged_df = pd.merge(merged_df, links_df, on="movieId", how="left")

# Merge ratings
rating_summary = ratings_df.groupby("movieId")["rating"].mean().reset_index()
rating_summary.columns = ["movieId", "avg_rating"]
merged_df = pd.merge(merged_df, rating_summary, on="movieId", how="left")


In [20]:
tfidf = TfidfVectorizer(stop_words="english")
content_matrix = tfidf.fit_transform(merged_df["content"])
cosine_sim = cosine_similarity(content_matrix, content_matrix)


In [21]:
import requests
import pandas as pd
from IPython.display import HTML

# TMDB API
API_KEY = "a3a909bb6bc322d2fda580d6a44db902"  # replace with your TMDB key

def get_tmdb_info(tmdb_id):
    """Fetch overview and TMDB link"""
    if pd.isna(tmdb_id):
        return "No overview available", "#"
    url = f"https://api.themoviedb.org/3/movie/{int(tmdb_id)}?api_key={API_KEY}&language=en-US"
    try:
        response = requests.get(url).json()
        overview = response.get("overview", "No overview available")
        link = f"https://www.themoviedb.org/movie/{int(tmdb_id)}"
        return overview, link
    except:
        return "No overview available", "#"

def make_clickable(link, text="Link"):
    return f'<a href="{link}" target="_blank">{text}</a>'

def recommend_movies_full(title, n=5):
    # Find movie
    matches = merged_df[merged_df['title'].str.contains(title, case=False, na=False)]
    if matches.empty:
        print("Movie not found!")
        return HTML("<p>No recommendations available</p>")
    
    idx = matches.index[0]
    
    # Similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:]
    sim_scores = sim_scores[:min(50, len(sim_scores))]
    movie_indices = [i[0] for i in sim_scores if i[0] < len(merged_df)]
    
    # Candidates
    candidates = merged_df.iloc[movie_indices][["title", "genres", "avg_rating", "tmdbId", "imdbId", "movieId"]].copy()
    
    # Add TMDB overview and link
    candidates[["overview", "tmdb_link"]] = candidates["tmdbId"].apply(lambda x: pd.Series(get_tmdb_info(x)))
    
    # Add MovieLens and IMDb links
    candidates["movielens_link"] = candidates["movieId"].apply(lambda x: f"https://movielens.org/movies/{int(x)}")
    candidates["imdb_link"]       = candidates["imdbId"].apply(lambda x: f"http://www.imdb.com/title/tt{int(x)}/" if not pd.isna(x) else "#")
    
    # Convert links to clickable HTML
    candidates["TMDB"]       = candidates["tmdb_link"].apply(lambda x: make_clickable(x, "TMDB"))
    candidates["MovieLens"]  = candidates["movielens_link"].apply(lambda x: make_clickable(x, "MovieLens"))
    candidates["IMDb"]       = candidates["imdb_link"].apply(lambda x: make_clickable(x, "IMDb"))
    
    # Sort by rating and take top n
    candidates = candidates.sort_values(by="avg_rating", ascending=False).head(n)
    
    # Select final columns
    display_df = candidates[["title", "genres", "avg_rating", "overview", "TMDB", "MovieLens", "IMDb"]]
    
    # Render clickable links in Jupyter
    return HTML(display_df.to_html(escape=False))


# -------------------------------
# Run recommender with user input
# -------------------------------
movie_name = input("Search by movie title or genre: ")
recommend_movies_full(movie_name, 5)


Search by movie title or genre:  Horror


Unnamed: 0,title,genres,avg_rating,overview,TMDB,MovieLens,IMDb
7041,'Salem's Lot (2004),Drama|Horror|Mystery|Thriller,5.0,No overview available,TMDB,MovieLens,IMDb
9473,Kill List (2011),Horror|Mystery|Thriller,4.5,"Nearly a year after a botched job, a hitman takes a new assignment with the promise of a big payoff for three killings. What starts off as an easy task soon unravels, sending the killer into the heart of darkness.",TMDB,MovieLens,IMDb
9187,Saw VII 3D - The Final Chapter (2010),Horror|Mystery|Thriller,4.5,"As a deadly battle rages over Jigsaw's brutal legacy, a group of Jigsaw survivors gathers to seek the support of self-help guru and fellow survivor Bobby Dagen, a man whose own dark secrets unleash a new wave of terror.",TMDB,MovieLens,IMDb
5598,"Tenant, The (Locataire, Le) (1976)",Drama|Horror|Mystery|Thriller,4.25,A quiet and inconspicuous man rents an apartment in Paris where he finds himself drawn into a rabbit hole of dangerous paranoia.,TMDB,MovieLens,IMDb
8938,Triangle (2009),Drama|Horror|Mystery|Thriller,4.0,"When Jess sets sail on a yacht with a group of friends, she cannot shake the feeling that there is something wrong.",TMDB,MovieLens,IMDb
