In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
df1 = pd.read_csv(r'E:\HDD storage (E)\Data Science Preparation\Machine Learning\Datasets\Comedy Movies.csv')

In [8]:
df2 = pd.read_csv(r'E:\HDD storage (E)\Data Science Preparation\Machine Learning\Datasets\Thriller Movies.csv')

In [9]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Position        96 non-null     int64  
 1   Const           96 non-null     object 
 2   Created         96 non-null     object 
 3   Modified        96 non-null     object 
 4   Description     0 non-null      float64
 5   Title           96 non-null     object 
 6   Original Title  96 non-null     object 
 7   URL             96 non-null     object 
 8   Title Type      96 non-null     object 
 9   IMDb Rating     96 non-null     float64
 10  Runtime (mins)  94 non-null     float64
 11  Year            96 non-null     int64  
 12  Genres          96 non-null     object 
 13  Num Votes       96 non-null     int64  
 14  Release Date    96 non-null     object 
 15  Directors       96 non-null     object 
dtypes: float64(3), int64(3), object(10)
memory usage: 12.1+ KB


In [10]:
df1.head(5)

Unnamed: 0,Position,Const,Created,Modified,Description,Title,Original Title,URL,Title Type,IMDb Rating,Runtime (mins),Year,Genres,Num Votes,Release Date,Directors
0,1,tt0242519,2013-08-05,2013-08-05,,Hera Pheri,Hera Pheri,https://www.imdb.com/title/tt0242519/,Movie,8.2,156.0,2000,"Action, Comedy, Crime, Drama",76409,2000-03-31,Priyadarshan
1,2,tt0374887,2013-08-05,2013-08-05,,Munna Bhai M.B.B.S.,Munna Bhai M.B.B.S.,https://www.imdb.com/title/tt0374887/,Movie,8.1,156.0,2003,"Comedy, Drama",92433,2003-12-19,Rajkumar Hirani
2,3,tt0109117,2013-08-05,2013-08-05,,Andaz Apna Apna,Andaz Apna Apna,https://www.imdb.com/title/tt0109117/,Movie,8.0,160.0,1994,"Action, Comedy, Romance",56844,1994-11-04,Rajkumar Santoshi
3,4,tt1187043,2013-08-05,2013-08-05,,3 Idiots,3 Idiots,https://www.imdb.com/title/tt1187043/,Movie,8.4,170.0,2009,"Comedy, Drama",461395,2009-12-25,Rajkumar Hirani
4,5,tt0419058,2013-08-05,2013-08-05,,Phir Hera Pheri,Phir Hera Pheri,https://www.imdb.com/title/tt0419058/,Movie,7.4,153.0,2006,"Comedy, Crime",30440,2006-06-09,Neeraj Vora


In [11]:
df2.isna().sum()

Position           0
Const              0
Created            0
Modified           0
Description       29
Title              0
Original Title     0
URL                0
Title Type         0
IMDb Rating        0
Runtime (mins)     0
Year               0
Genres             0
Num Votes          0
Release Date       0
Directors          0
dtype: int64

In [12]:
# Drop all columns except 'Title' and 'Genres'
df1.drop(columns=[col for col in df1.columns if col not in ['Title','Genres']],inplace=True)

In [13]:
# Drop all columns except 'Title' and 'Genres'
df2.drop(columns=[col for col in df2.columns if col not in ['Title','Genres']],inplace=True)

In [14]:
df2.head(5)

Unnamed: 0,Title,Genres
0,Drishyam,"Crime, Drama, Mystery, Thriller"
1,Black Friday,"Action, Crime, Drama, History"
2,NH 10,"Action, Crime, Drama, Thriller"
3,Gangs of Wasseypur,"Action, Comedy, Crime, Drama, Thriller"
4,Paan Singh Tomar,"Action, Biography, Crime, Sport, Thriller"


In [15]:
# to combine dataframe

combined_df = pd.concat([df1,df2],ignore_index=True)

In [16]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   126 non-null    object
 1   Genres  126 non-null    object
dtypes: object(2)
memory usage: 2.1+ KB


In [17]:
# create feature for recommendation
combined_df['combined'] = combined_df['Title']+" "+combined_df['Genres']

In [18]:
#Convert Text to Numbers (TF-IDF)
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(combined_df['combined'])

In [19]:
# Compute Cosine Similarity
similarity_matrix = cosine_similarity(tfidf_matrix)

In [20]:
def recommend_movies(title, df=combined_df, similarity_matrix=similarity_matrix, top_n=30):
    if title not in df['Title'].values:
        return "Movie not found in the list."

    # Find index of the movie
    idx = df[df['Title'] == title].index[0]

    # Get similarity scores for that movie
    sim_scores = list(enumerate(similarity_matrix[idx]))

    # Sort by score and remove the movie itself
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    # Get recommended titles
    recommended_titles = [df.iloc[i[0]]['Title'] for i in sim_scores]

    return recommended_titles


In [21]:
# Enter the name of movie
recommendations = recommend_movies("3 Idiots")
print(recommendations)

['Golmaal 3', 'Housefull', 'One Two Three', "It's Entertainment", 'Welcome Back', 'Hungama', 'Partner', 'Biwi No. 1', 'Kunwara', 'Welcome', 'Judwaa', 'Hulchul', 'Ishq', 'Housefull 2', 'No Entry', 'Hero No. 1', 'Munna Bhai M.B.B.S.', 'Double Dhamaal', 'Double Dhamaal', 'Hera Pheri', 'Raja Babu', 'Heyy Babyy', 'Bheja Fry', 'Chashme Baddoor', 'Do Knot Disturb', 'Malamaal Weekly', 'Namastey London', 'Desi Boyz', 'Khatta Meetha', 'Jolly LLB']


In [22]:
df1.head(5)

Unnamed: 0,Title,Genres
0,Hera Pheri,"Action, Comedy, Crime, Drama"
1,Munna Bhai M.B.B.S.,"Comedy, Drama"
2,Andaz Apna Apna,"Action, Comedy, Romance"
3,3 Idiots,"Comedy, Drama"
4,Phir Hera Pheri,"Comedy, Crime"


In [23]:
df2.head(5)

Unnamed: 0,Title,Genres
0,Drishyam,"Crime, Drama, Mystery, Thriller"
1,Black Friday,"Action, Crime, Drama, History"
2,NH 10,"Action, Crime, Drama, Thriller"
3,Gangs of Wasseypur,"Action, Comedy, Crime, Drama, Thriller"
4,Paan Singh Tomar,"Action, Biography, Crime, Sport, Thriller"
