In [1]:
# Importing pandas
import pandas as pd

In [2]:
# Load the IMDB movies CSV from the Kaggle input directory and preview the first rows
data = pd.read_csv(filepath_or_buffer="/kaggle/input/imdb-movies-dataset/imdb_movies.csv")
data.head()

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Preprocessing

# TF-IDF vectorization of the overviews (used for text similarity).
# TfidfVectorizer rejects NaN documents, so a missing overview is treated as
# an empty string; rows that already contain text are unaffected.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['overview'].fillna(''))

# Split the comma-separated genre string into a list of individual genres.
# Rows whose genre is missing stay NaN in 'genre_list'.
data['genre_list'] = data['genre'].str.split(', ')


In [4]:
# Step 2: Similar movies, ranked by overview text similarity
def get_similar_movies(movie_name, top_n=5):
    """Return the ``top_n`` movies whose overview is most similar to ``movie_name``.

    Similarity is the cosine similarity between the TF-IDF overview vectors in
    ``tfidf_matrix``. Genre and score are included in the result for display
    only; they do not influence the ranking.

    Args:
        movie_name: Exact value of the ``names`` column to look up. If several
            rows share that name, the first one is used as the query.
        top_n: Maximum number of rows to return.

    Returns:
        A new DataFrame with columns ``names``, ``genre``, ``score`` and
        ``similarity``, sorted by ``similarity`` in descending order. Every row
        named ``movie_name`` is excluded. The global ``data`` is not modified.

    Raises:
        IndexError: If no row has ``names == movie_name``.
    """
    # tfidf_matrix is addressed by row position, so look up the position of the
    # match instead of its index label (the two differ for a non-default index).
    match_positions = (data['names'] == movie_name).to_numpy().nonzero()[0]
    if match_positions.size == 0:
        raise IndexError(f"Movie {movie_name!r} not found in data['names'].")
    movie_pos = int(match_positions[0])

    # Cosine similarity between the query overview and every overview
    cosine_sim = cosine_similarity(tfidf_matrix[movie_pos], tfidf_matrix)

    # Attach the scores to a fresh frame instead of writing a column into the
    # shared `data`, so repeated calls do not leave stale state behind.
    scored = data[['names', 'genre', 'score']].assign(similarity=cosine_sim.flatten())

    # Most similar first, then drop the queried movie itself
    ranked = scored.sort_values(by='similarity', ascending=False)
    ranked = ranked[ranked['names'] != movie_name]

    return ranked.head(top_n)

In [5]:
# Example: movies whose overview is closest to that of "Creed III"
similar_movies = get_similar_movies(movie_name="Creed III")
print(similar_movies.loc[:, ['names', 'genre', 'score', 'similarity']])

                             names                      genre  score  \
114                       Creed II                      Drama   70.0   
115                          Creed                      Drama   74.0   
8394               Damien: Omen II           Horror, Thriller   64.0   
9093  Omen III: The Final Conflict  Horror, Mystery, Thriller   58.0   
3992                Brick Mansions       Action, Crime, Drama   59.0   

      similarity  
114     0.255926  
115     0.202163  
8394    0.198801  
9093    0.146674  
3992    0.139299  


In [6]:
# Step 3: Genre-Based Recommendations
def get_genre_recommendations(genre, top_n=5):
    """Return the ``top_n`` highest-scored movies whose genre matches ``genre``.

    Args:
        genre: Pattern searched for in the ``genre`` column, ignoring case. It is
            passed to ``Series.str.contains`` with its default settings, so it
            is interpreted as a regular expression and matches substrings.
        top_n: Maximum number of rows to return.

    Returns:
        A new DataFrame with columns ``names``, ``genre`` and ``score``, sorted
        by ``score`` in descending order. Missing genres appear as ``''``. The
        global ``data`` is not modified.
    """
    # Fill missing genres on a local copy only: overwriting data['genre'] on
    # every call would silently change the shared frame for all later cells.
    candidates = data[['names', 'genre', 'score']].assign(genre=data['genre'].fillna(''))

    # Keep the rows whose genre string contains the requested genre
    matches = candidates['genre'].str.contains(genre, case=False)
    genre_movies = candidates[matches]

    # Best-scored first ('revenue' would be an alternative ranking key)
    genre_movies = genre_movies.sort_values(by='score', ascending=False)

    return genre_movies.head(top_n)


In [7]:
# Example: best-scored movies whose genre contains "Action"
action_movies = get_genre_recommendations(genre="Action")
print(action_movies)

                                              names  \
3886                   WWE WrestleMania 39 Saturday   
591                                 The Dark Knight   
577   The Lord of the Rings: The Return of the King   
4855                      Primal: Tales of Savagery   
3681                                  Seven Samurai   

                                    genre  score  
3886                       Action, Family   85.0  
591        Drama, Action, Crime, Thriller   85.0  
577            Adventure, Fantasy, Action   85.0  
4855  Action, Adventure, Animation, Drama   85.0  
3681                        Action, Drama   85.0  


In [8]:
# Step 4: Trending Movies -- the top of the table by score or by revenue
def get_trending_movies(by='score', top_n=5):
    """Return the ``top_n`` rows of ``data`` ranked by ``by`` in descending order.

    Args:
        by: Ranking column, either ``'score'`` or ``'revenue'``.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame with columns ``names``, ``score`` and ``revenue``.

    Raises:
        ValueError: If ``by`` is neither ``'score'`` nor ``'revenue'``.
    """
    # Guard clause: reject unsupported ranking keys before touching the data
    if by not in ('score', 'revenue'):
        raise ValueError("Invalid argument. Use 'score' or 'revenue'.")

    ranked = data.sort_values(by=by, ascending=False)
    wanted_columns = ['names', 'score', 'revenue']
    return ranked[wanted_columns].head(top_n)

In [9]:
# Example: top movies ranked by score
trending_movies = get_trending_movies('score')
print(trending_movies)

                                   names  score       revenue
10046                           Simulant  100.0  1.569324e+09
6433        Furin, hentai, monmon chômon  100.0  1.569324e+09
4887               Pretty Young Sister 4  100.0  1.569324e+09
277                     Orgasm Lecture 2  100.0  1.569324e+09
1776   Porno document: Toruko tokkyû bin  100.0  1.569324e+09
