In [1]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Step 2: Load dataset
# Read the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="sample_101_IMDB_10000.csv")

# View dataset information: the column index first, then the leading rows
print("Columns available in dataset:\n", df.columns)
df.head(n=5)


Columns available in dataset:
 Index(['title', 'year', 'certificate', 'runtime', 'genre', 'desc', 'rating',
       'votes'],
      dtype='object')


Unnamed: 0,title,year,certificate,runtime,genre,desc,rating,votes
0,Freddy,2022,UA 16+,124 min,"Drama, Mystery, Thriller",The lines between love and obsession blur in t...,7.9,16441
1,An Action Hero,2022,U,130 min,Action,Youth Icon. Superstar. Action Hero. At the age...,8.1,15690
2,Kantara,2022,UA,148 min,"Action, Adventure, Drama",It involves culture of Kambala and Bhootha Kol...,8.7,78358
3,Khakee: The Bihar Chapter,2022–,UA 13+,45 min,"Action, Crime, Drama",As a righteous cop pursues a merciless crimina...,8.3,4464
4,Drishyam 2,2022,UA,140 min,"Crime, Drama, Mystery",A gripping tale of an investigation and a fami...,8.6,18743


In [4]:
# Check column names (shown as a plain Python list)
print(list(df.columns))


['title', 'year', 'certificate', 'runtime', 'genre', 'desc', 'rating', 'votes']


In [6]:
# Step 3: Select and clean relevant columns

# Text columns that feed the recommendation features
columns_to_use = ['title', 'genre', 'desc']

# Restrict to those columns, discard rows whose title is missing,
# and renumber the remaining rows from 0
df = df.loc[df['title'].notna(), columns_to_use].reset_index(drop=True)
print(f"Using columns: {columns_to_use}")
df.head()


Using columns: ['title', 'genre', 'desc']


Unnamed: 0,title,genre,desc
0,Freddy,"Drama, Mystery, Thriller",The lines between love and obsession blur in t...
1,An Action Hero,Action,Youth Icon. Superstar. Action Hero. At the age...
2,Kantara,"Action, Adventure, Drama",It involves culture of Kambala and Bhootha Kol...
3,Khakee: The Bihar Chapter,"Action, Crime, Drama",As a righteous cop pursues a merciless crimina...
4,Drishyam 2,"Crime, Drama, Mystery",A gripping tale of an investigation and a fami...


In [7]:
# Step 4: Combine selected text features into one field

def combine_features(row):
    """Join a row's non-missing values from ``columns_to_use`` into one string.

    Values are visited in the order given by the module-level
    ``columns_to_use`` list, converted with ``str`` and separated by a single
    space. Entries for which ``pd.notna`` is false are skipped entirely.
    """
    parts = []
    for col in columns_to_use:
        value = row[col]
        if pd.notna(value):
            parts.append(str(value))
    return ' '.join(parts)

# Run the row-wise combiner and store its result as the combined text column
df = df.assign(combined_features=df.apply(combine_features, axis=1))

# Preview the first three titles next to their combined text
df.loc[:, ['title', 'combined_features']].head(3)


Unnamed: 0,title,combined_features
0,Freddy,"Freddy Drama, Mystery, Thriller Th..."
1,An Action Hero,An Action Hero Action Youth Icon. ...
2,Kantara,"Kantara Action, Adventure, Drama I..."


In [8]:
# Step 5: Convert text data into TF-IDF vectors

from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer configured to drop common English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Fit on the combined text column and encode it in the same call
tfidf_matrix = tfidf.fit_transform(df.loc[:, 'combined_features'])

# Report the dimensions of the resulting matrix
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (100, 1240)


In [9]:
# Step 6: Compute cosine similarity matrix

from sklearn.metrics.pairwise import cosine_similarity

# With a single argument, the matrix rows are compared pairwise against themselves
cosine_sim = cosine_similarity(tfidf_matrix)
print("Cosine similarity matrix computed successfully.")


Cosine similarity matrix computed successfully.


In [10]:
# Step 7: Define recommendation function

def recommend_movies(title, num_recommendations=10):
    """Print the movies most similar to ``title`` by cosine similarity.

    Reads the module-level ``df`` (for titles) and ``cosine_sim`` (for the
    pairwise scores). Row ``i`` of ``cosine_sim`` is taken to correspond to
    the ``i``-th row of ``df`` by position.

    Parameters
    ----------
    title : str
        Exact title to look up. If several rows share the title, the first
        one is used as the query.
    num_recommendations : int, default 10
        Maximum number of similar movies to list. Values below zero are
        treated as zero.

    Returns
    -------
    None
        Results are printed. When ``title`` is absent, a notice is printed
        instead.
    """
    # Work with positional row numbers so the lookup stays correct even if
    # ``df`` does not carry a 0..n-1 index.
    positions = (df['title'] == title).to_numpy().nonzero()[0]
    if positions.size == 0:
        print(f"Movie '{title}' not found in dataset.")
        return

    idx = int(positions[0])

    # Exclude the query row by position instead of assuming it sorts first:
    # another row with an equal score (e.g. identical text) can precede it,
    # in which case slicing off element 0 would drop the wrong movie and
    # recommend the query to itself.
    candidates = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_matches = candidates[:max(num_recommendations, 0)]

    print(f"\nTop {num_recommendations} movies similar to '{title}':\n")
    for i, score in top_matches:
        print(f"{df['title'].iloc[i]}  (Similarity: {score:.3f})")


In [12]:
# Step 8: Example usage

# Query a sample title, relying on the default number of recommendations
recommend_movies(title="Ram Setu")



Top 10 movies similar to 'Ram Setu':

Brahmastra Part One: Shiva  (Similarity: 0.152)
India Lockdown  (Similarity: 0.116)
Conan the Barbarian  (Similarity: 0.084)
Kantara  (Similarity: 0.076)
Sita Ramam  (Similarity: 0.069)
1917  (Similarity: 0.063)
RRR (Rise Roar Revolt)  (Similarity: 0.061)
Uunchai  (Similarity: 0.050)
Kabir Singh  (Similarity: 0.047)
Need for Speed  (Similarity: 0.047)


In [13]:
# Step 9: Optional - Interactive input

# Strip surrounding whitespace so an accidental leading/trailing space
# does not defeat the exact-title lookup in recommend_movies.
user_movie = input("Enter a movie name: ").strip()
recommend_movies(user_movie, 10)


Enter a movie name:  3 Idiots



Top 10 movies similar to '3 Idiots':

College Romance  (Similarity: 0.231)
Lion  (Similarity: 0.092)
Ready Player One  (Similarity: 0.059)
Four More Shots Please!  (Similarity: 0.058)
India Lockdown  (Similarity: 0.048)
HIT: The First Case  (Similarity: 0.039)
Uunchai  (Similarity: 0.037)
Ponniyin Selvan: I  (Similarity: 0.037)
Padavettu  (Similarity: 0.031)
Doctor G  (Similarity: 0.029)
