In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv("MovieDataSet.csv")
df.head()

Unnamed: 0,ID,tconst,primaryTitle,startYear,rank,averageRating,numVotes,runtimeMinutes,directors,writers,genres,IMDbLink,Title_IMDb_Link
0,0,tt0111161,The Shawshank Redemption,1994,1,9.3,3090318,142,Frank Darabont,"Stephen King, Frank Darabont",Drama,https://www.imdb.com/title/tt0111161,"<a href=""https://www.imdb.com/title/tt0111161""..."
1,1,tt0068646,The Godfather,1972,2,9.2,2154590,175,Francis Ford Coppola,"Mario Puzo, Francis Ford Coppola","Crime, Drama",https://www.imdb.com/title/tt0068646,"<a href=""https://www.imdb.com/title/tt0068646""..."
2,2,tt0468569,The Dark Knight,2008,3,9.1,3065674,152,Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...","Action, Crime, Drama",https://www.imdb.com/title/tt0468569,"<a href=""https://www.imdb.com/title/tt0468569""..."
3,3,tt0167260,The Lord of the Rings: The Return of the King,2003,4,9.0,2102794,201,Peter Jackson,"J.R.R. Tolkien, Fran Walsh, Philippa Boyens, P...","Adventure, Drama, Fantasy",https://www.imdb.com/title/tt0167260,"<a href=""https://www.imdb.com/title/tt0167260""..."
4,4,tt0108052,Schindler's List,1993,5,9.0,1542434,195,Steven Spielberg,"Thomas Keneally, Steven Zaillian","Biography, Drama, History",https://www.imdb.com/title/tt0108052,"<a href=""https://www.imdb.com/title/tt0108052""..."


In [3]:
def clean_tokens(text):
    if pd.isna(text):
        return ""
    tokens = [
        t.strip().lower().replace(" ", "_")
        for t in str(text).split(",")
        if t and t.lower() != "nan"
    ]
    return " ".join(tokens)

In [4]:
df["doc"] = (
    df["directors"].apply(clean_tokens) + " " +
    df["writers"].apply(clean_tokens) + " " +
    df["genres"].apply(clean_tokens)
).str.strip()

df[["primaryTitle", "doc"]].head()

Unnamed: 0,primaryTitle,doc
0,The Shawshank Redemption,frank_darabont stephen_king frank_darabont drama
1,The Godfather,francis_ford_coppola mario_puzo francis_ford_c...
2,The Dark Knight,christopher_nolan jonathan_nolan christopher_n...
3,The Lord of the Rings: The Return of the King,peter_jackson j.r.r._tolkien fran_walsh philip...
4,Schindler's List,steven_spielberg thomas_keneally steven_zailli...


In [5]:
tfidf = TfidfVectorizer(token_pattern=r"[a-z0-9_]+")
X = tfidf.fit_transform(df["doc"])
X.shape

(5000, 8580)

In [6]:
import os, joblib
os.makedirs("models", exist_ok=True)
joblib.dump(tfidf, "models/tfidf_vectorizer.joblib")
joblib.dump(X, "models/tfidf_matrix.joblib")
df[["ID","primaryTitle","startYear","averageRating","numVotes","tconst"]].to_csv("models/movies_meta.csv", index=False)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
titles = df["primaryTitle"]
idx = titles[titles=="The Godfather"].index[0]
scores = cosine_similarity(X[idx], X).ravel()
top = scores.argsort()[::-1][1:11]
list(titles.iloc[top])

['The Godfather Part II',
 'The Godfather Part III',
 'The Conversation',
 'The Rainmaker',
 'Rumble Fish',
 'Apocalypse Now',
 'Peggy Sue Got Married',
 'The Great Gatsby',
 'The Outsiders',
 'Dracula']

In [None]:
import os, joblib
os.makedirs("models", exist_ok=True)
joblib.dump(tfidf, "models/tfidf_vectorizer.joblib")
joblib.dump(X, "models/tfidf_matrix.joblib")
df[["ID","primaryTitle","startYear","averageRating","numVotes","tconst"]].to_csv("models/movies_meta.csv", index=False)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
titles = df["primaryTitle"]
idx = titles[titles=="The Godfather"].index[0]
sims = cosine_similarity(X[idx], X).ravel()
top = sims.argsort()[::-1][1:11]
titles.iloc[top]

5        The Godfather Part II
831     The Godfather Part III
921           The Conversation
2245             The Rainmaker
2703               Rumble Fish
59              Apocalypse Now
4311     Peggy Sue Got Married
4020          The Great Gatsby
2970             The Outsiders
1385                   Dracula
Name: primaryTitle, dtype: object

In [9]:
import numpy as np, pandas as pd
k=20; rows=[]
for i in range(X.shape[0]):
    s = cosine_similarity(X[i], X).ravel()
    top = np.argpartition(-s, range(1,k+1))[1:k+1]
    top = top[np.argsort(-s[top])]
    for rank,j in enumerate(top,1):
        rows.append((i,j,float(s[j]),rank))
pd.DataFrame(rows, columns=["movie_idx","neighbor_idx","score","rank"]).to_csv("models/neighbors_top20.csv", index=False)
