In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the two TMDB 5000 CSV exports (movie metadata and credits) from the
# local data/ directory; paths are relative to the notebook's working directory.
movies = pd.read_csv("data/tmdb_5000_movies.csv")
credits = pd.read_csv("data/tmdb_5000_credits.csv")

In [None]:
movies.head()

In [None]:
credits.head()

In [None]:
credits.head(1)["crew"].values

In [None]:
# Join on the movie id as well as the title. Titles are not unique, so a
# title-only join pairs a film with the credits of every other film that shares
# its name and silently adds mismatched rows.
# Assumes the credits table identifies its rows with a `movie_id` column that
# matches `movies["id"]` — confirm against credits.head().
# Because "title" is a key on both sides it still appears once in the result.
movies = movies.merge(
    credits,
    left_on=["id", "title"],
    right_on=["movie_id", "title"],
)

In [None]:
movies.head()

# Release date segmentation

In [None]:
# Parse release dates (invalid or missing values become NaT) and derive the
# release year as a nullable integer column.
movies = movies.assign(
    release_date=pd.to_datetime(movies["release_date"], errors="coerce"),
    year=lambda frame: frame["release_date"].dt.year.astype("Int64"),
)

In [None]:
# Floor each year to the first year of its decade and append "s" to form a
# label such as "1990s". Rows without a year are expected to surface as the
# string "<NA>s", which a later cell filters on.
movies["decade"] = movies["year"].floordiv(10).mul(10).astype(str) + "s"

# Preprocessing & Feature Engineering

In [None]:
# Keep only the identifier and title, the columns that are later combined into
# the "tags" text (overview, genres, keywords, cast, crew, decade), and
# vote_count, which recommend() uses to weight the similarity scores.

movies = movies[
    [
        "id",
        "title",
        "overview",
        "genres",
        "keywords",
        "cast",
        "crew",
        "decade",
        "vote_count",
    ]
]

In [None]:
movies.head()

In [None]:
movies.isnull().sum()

In [None]:
# Discard every row that still has a missing value in any of the kept columns.
movies = movies.dropna()

In [None]:
# Show how many rows are exact duplicates of an earlier row. A bare
# .duplicated() prints one boolean per row, which is unreadable at this size.
movies.duplicated().sum()

In [None]:
# Drop rows whose decade label was built from a missing year; those carry the
# placeholder label "<NA>s" rather than a real null, so dropna() does not catch them.

movies = movies[movies["decade"].ne("<NA>s")]

In [None]:
movies.iloc[0].genres

In [None]:
# convert the genres column from string to list of genre names

import ast


def convert(obj):
    """Extract the ``name`` field from a stringified sequence of dicts.

    ``obj`` is a string holding a Python literal (parsed with
    ``ast.literal_eval``) whose items are dicts with a ``"name"`` key.
    Returns the names as a list, in their original order.
    """
    return [entry["name"] for entry in ast.literal_eval(obj)]


# Apply the parser to every row: each genres cell becomes a list of names.
movies["genres"] = movies["genres"].apply(convert)

In [None]:
# The keywords column is parsed with the same convert() helper as genres, so
# each cell becomes a list of keyword names.
movies["keywords"] = movies["keywords"].apply(convert)

In [None]:
# convert the cast column from string to list of cast names (only top 3)


def convert_cast(obj):
    """Return the ``name`` of at most the first three entries in ``obj``.

    ``obj`` is a string holding a Python literal (parsed with
    ``ast.literal_eval``) whose items are dicts with a ``"name"`` key.
    Entries after the third are never inspected.
    """
    # zip() stops as soon as range(3) is exhausted, which caps the result at
    # three names without reading any further entries.
    return [member["name"] for _, member in zip(range(3), ast.literal_eval(obj))]


# Apply the parser to every row: each cast cell becomes a list of up to three names.
movies["cast"] = movies["cast"].apply(convert_cast)

In [None]:
# convert the crew column from string to list of director names (only director)


def fetch_director(obj):
    """Return the names of all crew entries whose ``job`` is ``"Director"``.

    ``obj`` is a string holding a Python literal (parsed with
    ``ast.literal_eval``) whose items are dicts with ``"job"`` and ``"name"``
    keys. The result is a list because a film may list several directors;
    it is empty when none is listed.
    """
    return [
        member["name"]
        for member in ast.literal_eval(obj)
        if member["job"] == "Director"
    ]


# Apply the parser to every row: each crew cell becomes a list of director names.
movies["crew"] = movies["crew"].apply(fetch_director)

In [None]:
# Tokenise each overview into a list of whitespace-separated words

movies["overview"] = movies["overview"].map(str.split)

In [None]:
# Wrap each decade label in a one-element list so it can be concatenated with
# the other list-valued columns when the tags are built

movies["decade"] = movies["decade"].map(lambda label: [label])

In [None]:
# Remove the spaces inside every entry (e.g. "Science Fiction" ->
# "ScienceFiction"), presumably so that a multi-word name survives as a single
# token once the tags are joined into one string.
for column in ("genres", "keywords", "cast", "crew"):
    movies[column] = movies[column].apply(
        lambda values: [value.replace(" ", "") for value in values]
    )

In [None]:
# Every operand is a column of Python lists, so "+" concatenates the lists
# row by row: each movie gets one combined token list in "tags".
movies["tags"] = (
    movies["overview"]
    + movies["genres"]
    + movies["keywords"]
    + movies["cast"]
    + movies["crew"]
    + movies["decade"]
)

In [None]:
# Take an explicit copy: the "tags" column of new_df is reassigned in the
# following cells, and assigning into a bare column selection of `movies`
# triggers pandas' SettingWithCopyWarning.
new_df = movies[["id", "title", "tags", "vote_count"]].copy()

In [None]:
# Flatten each token list into a single space-separated string.
new_df["tags"] = new_df["tags"].map(" ".join)

In [None]:
# Lower-case the tag text so matching is case-insensitive.
new_df["tags"] = new_df["tags"].str.lower()

In [None]:
new_df.head()

# Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each movie's tag string into a TF-IDF weighted term vector.
tfidf = TfidfVectorizer(
    max_features=7000,  # keep only the 7000 most frequent terms in the corpus
    stop_words="english",  # drop scikit-learn's built-in English stop words
    max_df=0.7,  # ignore terms that appear in more than 70% of the movies
    min_df=2,  # ignore terms that appear in fewer than 2 movies
    ngram_range=(1, 2),  # use single words and two-word sequences as terms
    sublinear_tf=True,  # scale term frequency as 1 + log(tf)
)

# Learn the vocabulary from the tags and build the matrix: one row per row of
# new_df, in the same order.
vectors = tfidf.fit_transform(new_df["tags"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Rescale every row vector to unit length, then compute the cosine similarity
# between every pair of rows. similarity[i][j] compares rows i and j of
# `vectors`, i.e. it is addressed by row position, not by DataFrame index label.
# NOTE(review): TfidfVectorizer L2-normalises its output by default and
# cosine_similarity normalises its inputs itself, so the explicit normalize()
# call looks redundant — confirm before removing.
vectors = normalize(vectors)
similarity = cosine_similarity(vectors)

In [None]:
# Function to recommend movies based on similarity scores and vote counts

def recommend(movie, top_n=5):
    """Print the titles of the movies that best match ``movie``.

    Every movie is scored as ``similarity * log(vote_count + 1)``, so among
    comparably similar films the ones with more votes rank higher. The query
    movie itself is never printed.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``new_df["title"]``. If several rows share
        the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``new_df`` has the given title.
    """
    # `similarity` is addressed by row position, while new_df keeps the index
    # labels it had before rows were dropped. Look up the position of the
    # title, not its label, so the right similarity row is read.
    matches = np.flatnonzero((new_df["title"] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in new_df")
    movie_pos = matches[0]

    # A missing vote_count counts as zero votes.
    votes = new_df["vote_count"].fillna(0).to_numpy(dtype=float)
    scores = similarity[movie_pos] * np.log(votes + 1)

    # Highest score first; the stable sort keeps row order among ties.
    ranking = np.argsort(-scores, kind="stable")

    # Exclude the query movie by position. After vote weighting it is not
    # guaranteed to rank first, so slicing off the top entry would be wrong.
    ranking = ranking[ranking != movie_pos][:top_n]

    titles = new_df["title"].to_numpy()
    for pos in ranking:
        print(titles[pos])

In [None]:
recommend("Batman Begins")