In [None]:
import kagglehub

# Fetch the most recent copy of the IMDB top-1000 dataset through kagglehub
# and report the directory it was placed in.
path = kagglehub.dataset_download(
    "harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows"
)
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows


In [None]:
import pandas as pd
import re
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Location of the CSV inside the downloaded dataset. It is defined in this cell
# so the cell runs on a fresh kernel instead of relying on a later cell having
# already assigned `csv_path`.
csv_path = '/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv'
df = pd.read_csv(csv_path)

# Missing overviews become empty strings; everything is lowercased so the
# stop-word comparison below is case-insensitive.
overview = df['Overview']
overview = overview.fillna('').str.lower()


# One cleaned string per movie: word tokens with English stop words removed,
# re-joined with single spaces.
filters = []
for ov in overview:
  tokens = re.findall(r'\b\w+\b', ov)
  filtered_tokens = [token for token in tokens if token not in ENGLISH_STOP_WORDS]
  filters.append(" ".join(filtered_tokens))



In [None]:
import pandas as pd
import re
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

csv_path = '/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv'
df = pd.read_csv(csv_path)

# Clean overview data: missing overviews become empty strings, text is lowercased
overview = df['Overview'].fillna('').str.lower()

# A word, optionally followed by a common English clitic ('s, 't, 'd, 'm, 'll,
# 're, 've). Only the word itself is captured, so "man's" yields "man" instead
# of the two tokens "man" and "s". Without this, the stray "s" fragment ends up
# as the most frequent "word" in the table below. Apostrophes that are part of
# a name (e.g. "o'brien") are not treated as clitics and still split as before.
word_pattern = re.compile(r"\b(\w+)(?:['’](?:s|t|d|m|ll|re|ve))?\b")

# Tokenize the overview section
# Remove any common words
filtered_lists = []
for ov in overview:
    tokens = word_pattern.findall(ov)
    filtered = [t for t in tokens if t not in ENGLISH_STOP_WORDS]
    filtered_lists.append(filtered)

# Count tokens across all overviews and keep the 20 most frequent
all_tokens = [tok for sublist in filtered_lists for tok in sublist]
counter = Counter(all_tokens)
common_tokens = counter.most_common(20)

token_df = pd.DataFrame(common_tokens, columns=['token', 'count'])
print(token_df)

       token  count
0          s    239
1      young    132
2        man    119
3       life    111
4      world     85
5        new     73
6     family     66
7        war     66
8      woman     65
9      story     63
10      love     61
11       old     54
12     finds     47
13       boy     46
14      help     45
15    father     45
16      wife     44
17      girl     42
18  american     40
19     years     39


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Preprocess the data: remove NaNs, lowercase, tokenize & filter stop words
def preprocess(text):
    """Return `text` lowercased, tokenized and stripped of English stop words.

    Parameters
    ----------
    text : object
        Usually a string. Missing scalars (None, NaN, pd.NA) are treated as
        empty text; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The remaining word tokens joined by single spaces ("" when nothing
        remains).
    """
    # str(NaN) is the literal "nan", which would otherwise be tokenized and
    # kept as if it were a real word of the overview.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""
    tokens = re.findall(r'\b\w+\b', str(text).lower())
    return " ".join([t for t in tokens if t not in ENGLISH_STOP_WORDS])

# Store the cleaned text of every overview in a new column
df['filtered_overview'] = df['Overview'].apply(preprocess)

# TF-IDF over unigrams and bigrams, keeping at most 5000 vocabulary entries
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_tfidf = vectorizer.fit_transform(df['filtered_overview'])

# Hold out 20% of the rows; the target is the raw Genre string of each movie
y = df['Genre']
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    random_state=42,
    test_size=0.2,
)

# First ten vocabulary entries next to their IDF weights
feature_names = vectorizer.get_feature_names_out()[:10]
preview = pd.DataFrame({'feature': feature_names, 'idf': vectorizer.idf_[:10]})

# Show the preview table, then the (rows, features) shape of the matrix
print(preview)
print(f"TF-IDF matrix shape: {X_tfidf.shape}")



            feature       idf
0         00 status  7.215608
1               000  6.116995
2  000 deutschmarks  7.215608
3      000 employer  7.215608
4       000 savings  7.215608
5        000 stolen  7.215608
6      000 stranded  7.215608
7               007  6.810142
8          007 bond  7.215608
9         007 track  7.215608
TF-IDF matrix shape: (1000, 5000)


In [None]:
from collections import Counter

# Tally how often each genre tag occurs: the comma-separated Genre string is
# split, flattened to one tag per row, and trimmed of surrounding whitespace.
all_genres = (
    df['Genre']
    .str.split(',')
    .explode()
    .str.strip()
)
Counter(all_genres)

Counter({'Drama': 724,
         'Crime': 209,
         'Action': 189,
         'Adventure': 196,
         'Biography': 109,
         'History': 56,
         'Sci-Fi': 67,
         'Romance': 125,
         'Western': 20,
         'Fantasy': 66,
         'Comedy': 233,
         'Thriller': 137,
         'Animation': 82,
         'Family': 56,
         'War': 51,
         'Mystery': 99,
         'Music': 35,
         'Horror': 32,
         'Musical': 17,
         'Film-Noir': 19,
         'Sport': 19})

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

# Genres: turn each movie's comma-separated genre list into a sparse binary
# (multi-hot) row, one column per genre
mlb = MultiLabelBinarizer(sparse_output=True)
genre_mat = mlb.fit_transform(df['Genre'].str.split(',').apply(lambda lst: [g.strip() for g in lst]))

# Director: one-hot encoding (sparse columns prefixed with "dir_")
dir_onehot = pd.get_dummies(df['Director'], prefix='dir', sparse=True)
stars = df[['Star1','Star2','Star3','Star4']].apply(lambda col: col.str.strip())

# Top-4 stars: turn each movie's four star names into a sparse binary
# (multi-hot) row, one column per star
star_mlb = MultiLabelBinarizer(sparse_output=True)
star_mat = star_mlb.fit_transform(stars.values.tolist())

# Numeric features: impute, then standardize.
# Missing values are replaced by the column median rather than 0, because 0 is
# far outside the observed range of a score and would make every movie with a
# missing value look alike.
num = df[['IMDB_Rating','Meta_score','No_of_Votes']]
num = num.fillna(num.median())
# Center as well as scale: `num` is a dense DataFrame, so with_mean=False is not
# needed here. Without centering, every row keeps a large same-signed offset
# (mean / std) in these columns, which inflates the cosine similarity between
# all rows once the blocks are stacked.
scaler = StandardScaler()
num_mat = scaler.fit_transform(num)

In [None]:
from scipy.sparse import hstack

# Concatenate every feature block column-wise into a single CSR matrix:
# overview TF-IDF | genres | director | stars | scaled numeric columns
X = hstack(
    [X_tfidf, genre_mat, dir_onehot.sparse.to_coo(), star_mat, num_mat],
    format="csr",
)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour index over the stacked feature matrix,
# comparing rows by cosine distance.
model = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

def get_recs(title, k=10):
    """Return the titles of the `k` nearest neighbours of the movie `title`.

    Uses the module-level `df`, feature matrix `X` and fitted `model`.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['Series_Title']``. If several rows share
        the title, the first one is used as the query.
    k : int, default 10
        Number of recommendations requested. Fewer are returned when `X` has
        fewer than ``k + 1`` rows.

    Returns
    -------
    list of str
        Neighbour titles ordered from nearest to farthest, with the query row
        itself excluded.

    Raises
    ------
    IndexError
        If `title` does not occur in ``df['Series_Title']``.
    """
    # Work with row positions throughout: the rows of X line up with the rows
    # of df by position, not by index label.
    positions = (df['Series_Title'] == title).to_numpy().nonzero()[0]
    if positions.size == 0:
        raise IndexError(f"Title not found in df['Series_Title']: {title!r}")
    pos = positions[0]

    # Ask for one extra neighbour because the query row is normally returned
    # too, but never ask for more rows than exist.
    n_neighbors = min(k + 1, X.shape[0])
    _, idxs = model.kneighbors(X[pos], n_neighbors=n_neighbors)

    # Drop the query movie itself by position rather than assuming it is the
    # first hit; with tied distances another row can be listed ahead of it.
    rec_positions = [i for i in idxs.ravel() if i != pos][:k]
    return df['Series_Title'].iloc[rec_positions].tolist()

In [None]:
# Spot-check the recommender on a few well-known seed titles; each result list
# is followed by a blank line.
for title in ("Inception", "The Godfather", "Toy Story"):
    print(f"Seed: {title}")
    print(get_recs(title, k=5), end="\n\n")


Seed: Inception
['The Dark Knight Rises', 'Batman Begins', 'The Dark Knight', 'The Matrix', 'Interstellar']

Seed: The Godfather
['The Godfather: Part II', 'Pulp Fiction', 'The Silence of the Lambs', 'Se7en', 'The Departed']

Seed: Toy Story
['Toy Story 2', 'Toy Story 3', 'Up', 'Finding Nemo', 'Toy Story 4']



In [None]:
import numpy as np

def genre_match_rate(k=10, samples=100):
    """Average share of recommendations that share a genre with their seed.

    Draws `samples` titles from ``df['Series_Title']`` (fixed random_state=42),
    fetches `k` recommendations for each with `get_recs`, and counts a
    recommendation as a match when its genre set intersects the seed's.
    Each seed contributes ``matches / k``; the mean over all seeds is returned.
    """
    def genres_of(name):
        # Genre set of the first row whose Series_Title equals `name`.
        raw = df.loc[df.Series_Title == name, 'Genre'].iloc[0]
        return {part.strip() for part in raw.split(',')}

    per_seed_rates = []
    for seed in df['Series_Title'].sample(samples, random_state=42):
        wanted = genres_of(seed)
        hits = sum(1 for rec in get_recs(seed, k) if wanted & genres_of(rec))
        per_seed_rates.append(hits / k)
    return np.mean(per_seed_rates)

# Report the genre-overlap rate for 10 recommendations over 200 sampled seeds
print("Avg. genre-overlap@10:", genre_match_rate(k=10, samples=200))

Avg. genre-overlap@10: 0.993
