In [None]:
import itertools
import warnings, os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, pairwise_distances, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
# Notebook-wide configuration.
# NOTE(review): this silences every warning for the whole notebook, including
# pandas / scikit-learn deprecation notices.
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")


# --- Locate the dataset CSV under the Kaggle input directory ---------------
base_path = "/kaggle/input"
csv_name = "imdb_movies_main.csv"
folders = os.listdir(base_path)

# Folders whose name mentions "imdb" are tried first; every remaining folder
# is tried afterwards, so the file is found wherever it is mounted.
search_order = [f for f in folders if "imdb" in f.lower()]
search_order += [f for f in folders if f not in search_order]

csv_path = None
for folder in search_order:
    potential_path = f"{base_path}/{folder}/{csv_name}"
    if os.path.exists(potential_path):
        csv_path = potential_path
        break

# Fail early with an explicit message instead of an IndexError on an empty
# input directory or an opaque read error on a path that was never checked.
if csv_path is None:
    raise FileNotFoundError(
        f"'{csv_name}' was not found in any folder under {base_path}: {folders}"
    )


# --- Load and normalise -----------------------------------------------------
df = pd.read_csv(csv_path)

# Column names -> stripped, lower-case, spaces replaced by underscores.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Remove exact duplicate rows and rebuild a clean 0..n-1 index.
df = df.drop_duplicates().reset_index(drop=True)

print("Loaded dataset shape:", df.shape)
display(df.head())


In [None]:
# CELL 2 — preprocessing



# A. Date parsing (the release date is read from 'date_x' when present)
if 'date_x' in df.columns:
    df['release_date'] = pd.to_datetime(df['date_x'], errors='coerce')
    df['release_year'] = df['release_date'].dt.year
else:
    df['release_year'] = np.nan

# B. Convert numeric columns (only those that actually exist in the frame)
numeric_cols_present = [
    col for col in ['runtime_min','runtime','budget_x','revenue','popularity',
                    'score','vote_count']
    if col in df.columns
]

for col in numeric_cols_present:
    # Unparseable values become NaN.
    df[col] = pd.to_numeric(df[col], errors='coerce')

# C. Create the final "rating" column ('score' preferred, then 'vote_average')
if 'score' in df.columns:
    df['rating'] = df['score']
elif 'vote_average' in df.columns:
    df['rating'] = df['vote_average']
else:
    df['rating'] = np.nan

# D. Genres parsing: comma-separated 'genre' text -> list of lower-case names.
# The NaN test runs on the raw value: converting the whole column with
# astype(str) first would turn missing values into the literal string 'nan'
# and produce a bogus ['nan'] genre. Empty tokens (e.g. from a trailing comma)
# are dropped.
if 'genre' in df.columns:
    df['genres_list'] = df['genre'].apply(
        lambda x: [g.strip().lower() for g in str(x).split(',') if g.strip()]
        if pd.notna(x) else []
    )
else:
    df['genres_list'] = [[] for _ in range(len(df))]

# E. Director & actor extraction from the comma-separated 'crew' text.
# NOTE(review): this assumes the first entry is the director and the next
# three are actors - TODO confirm against the dataset's 'crew' layout.
# Rows with no comma in 'crew' (including missing values) get NaN / [].
if 'crew' in df.columns:
    df['director_clean'] = df['crew'].apply(
        lambda x: str(x).split(',')[0].strip()
        if pd.notna(x) and ',' in str(x) else np.nan
    )
    df['actors_list'] = df['crew'].apply(
        lambda x: [p.strip() for p in str(x).split(',')[1:4]]
        if pd.notna(x) and ',' in str(x) else []
    )
else:
    df['director_clean'] = np.nan
    df['actors_list'] = [[] for _ in range(len(df))]

# F. Missing value reporting: null count per column, largest first
null_counts = df.isna().sum()
missing_report = null_counts.sort_values(ascending=False)
print("Top missing columns:\n", missing_report.head(10))

# Numeric columns: replace missing entries with the column median
numeric_final = df.select_dtypes(include=[np.number]).columns
numeric_medians = df[numeric_final].median()
df[numeric_final] = df[numeric_final].fillna(numeric_medians)

# Object (text-like) columns: replace missing entries with 'unknown'
categorical_final = df.select_dtypes(include=['object']).columns
df[categorical_final] = df[categorical_final].fillna('unknown')

# G. Outlier detection using the 1.5 * IQR rule
#    (budget_x, revenue, vote_count, popularity, runtime_min - whichever exist)
outlier_flags = pd.DataFrame(index=df.index)

for col in ['budget_x','revenue','vote_count','popularity','runtime_min']:
    if col in df.columns:
        series = df[col]
        Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
        IQR = Q3 - Q1
        lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
        # True when the value lies outside [lower, upper].
        # between() is False for NaN, so any remaining NaN is flagged too.
        outlier_flags[col + '_outlier'] = ~series.between(lower, upper)

# Drop flag columns left over from an earlier run before concatenating, so
# re-executing this cell does not create duplicated '*_outlier' column names.
df = pd.concat(
    [df.drop(columns=outlier_flags.columns, errors='ignore'), outlier_flags],
    axis=1,
)

# H. Normalization: MinMax-scale whichever candidate columns exist,
#    writing the result to '<column>_scaled'
scale_candidates = ['runtime_min','budget_x','revenue','popularity','vote_count','rating']
scale_features = [col for col in scale_candidates if col in df.columns]
pca_cols = [c + '_scaled' for c in scale_features]

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[scale_features])
df[[c + '_scaled' for c in scale_features]] = scaled_values

# I. Discretization: rating quartiles mapped to four ordered labels
# NOTE(review): pd.qcut raises if two quartile edges coincide - confirm the
# rating distribution has four distinct quartile boundaries.
if 'rating' in df.columns:
    rating_labels = ['low','mid','high','very_high']
    df['rating_bin'] = pd.qcut(df['rating'], 4, labels=rating_labels)

# J. PCA on the scaled columns only (up to three components)
if len(pca_cols) >= 2:
    pca = PCA(n_components=min(3, len(pca_cols)))
    pca_res = pca.fit_transform(df[pca_cols])
    # Store component k in column 'pca_k' (1-based).
    for comp_no, comp_values in enumerate(pca_res.T, start=1):
        df[f'pca_{comp_no}'] = comp_values
    print("PCA variance:", pca.explained_variance_ratio_)

# K. Fixed-seed random sample (at most 5000 rows) for the heavier cells below
sample_size = min(5000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)
df_sample = df_sample.reset_index(drop=True)

print("Preprocessing complete. Sample shape:", df_sample.shape)


In [None]:
# CELL 3: Basic Graphs & EDA

# A. Rating distribution: 30-bin histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['rating'].dropna(), bins=30, kde=True, ax=ax)
ax.set_title("IMDb Rating Distribution")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
plt.show()


# ============================
# B. Genre Popularity Over Decades
# ============================

# Derive the release year into a local Series. This plotting section must not
# overwrite df['release_year']: that column was median-imputed during
# preprocessing, and rewriting it here would make `df` disagree with the
# `df_sample` drawn from it earlier.
if 'date_x' in df.columns:
    year_raw = pd.to_datetime(df['date_x'], errors='coerce').dt.year
elif 'release_date' in df.columns:
    year_raw = pd.to_datetime(df['release_date'], errors='coerce').dt.year
elif 'year' in df.columns:
    year_raw = pd.to_numeric(df['year'], errors='coerce')
else:
    year_raw = pd.Series(np.nan, index=df.index)

# Keep only rows with a usable year (imputed years are not used for the trend)
df_year = df.assign(release_year=year_raw).dropna(subset=['release_year']).copy()
df_year['release_year'] = df_year['release_year'].astype(int)

# One row per (movie, genre); drop rows whose genre is missing or empty.
# .copy() so the 'decade' assignment below does not write into a slice.
df_exp = df_year.explode('genres_list')
df_exp = df_exp[df_exp['genres_list'].notna() & (df_exp['genres_list'] != '')].copy()

# Decade = year rounded down to a multiple of 10
df_exp['decade'] = (df_exp['release_year'] // 10) * 10

# Merge the incomplete 2020 decade into 2010.
# NOTE(review): this inflates the 2010 bucket with 2020+ releases.
df_exp['decade'] = df_exp['decade'].replace({2020: 2010})

# The eight most frequent genres
top_genres = df_exp['genres_list'].value_counts().head(8).index

# Movie count per (decade, genre) for those genres
df_trend = df_exp[df_exp['genres_list'].isin(top_genres)]
trend = df_trend.groupby(['decade','genres_list']).size().reset_index(name='count')

# Plot (skipped when no row has both a year and a genre)
if trend.empty:
    print("Genre trend skipped: no rows with both a release year and a genre.")
else:
    plt.figure(figsize=(12,6))
    sns.lineplot(data=trend, x='decade', y='count', hue='genres_list', marker='o')
    plt.title("Genre Popularity Over Decades (Corrected)")
    plt.xlabel("Decade")
    plt.ylabel("Number of Movies")
    plt.legend(title="Genre", bbox_to_anchor=(1.05,1))
    plt.tight_layout()
    plt.show()


# ============================
# C. Top Genres Bar Chart
# ============================

# One row per (movie, genre), keeping only non-missing, non-empty genres
df_gen = df.explode('genres_list')
genre_present = df_gen['genres_list'].notna() & df_gen['genres_list'].ne('')
df_gen = df_gen[genre_present]

# The twelve most frequent genres
top_gen = df_gen['genres_list'].value_counts().head(12)

fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(x=top_gen.values, y=top_gen.index, ax=ax)
ax.set_title("Top Genres")
ax.set_xlabel("Count")
ax.set_ylabel("Genre")
plt.show()


# ============================
# D. Runtime vs Rating Scatter
# ============================
# Only drawn when the frame has a 'runtime_min' column.
if 'runtime_min' in df.columns:
    df_rt = df.dropna(subset=['runtime_min', 'rating'])
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.scatterplot(data=df_rt, x='runtime_min', y='rating', alpha=0.6, ax=ax)
    ax.set_title("Runtime vs Rating")
    ax.set_xlabel("Runtime (min)")
    ax.set_ylabel("Rating")
    plt.show()


# ============================
# E. Budget vs Rating Scatter
# ============================

# Use the first budget-like column that exists, in this priority order
budget_col = None
for candidate in ['budget', 'budget_x', 'budget_adj', 'budget_in_usd']:
    if candidate in df.columns:
        budget_col = candidate
        break

if budget_col:
    df_b = df.dropna(subset=[budget_col, 'rating'])
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(x=df_b[budget_col], y=df_b['rating'], alpha=0.6, ax=ax)
    ax.set_title(f"Budget vs Rating ({budget_col})")
    ax.set_xlabel(budget_col)
    ax.set_ylabel("Rating")
    ax.set_xscale("symlog")  # symmetric-log x axis copes with the wide budget range
    plt.show()


# ============================
# F. Top Directors by Avg Rating
# ============================

if 'director_clean' in df.columns:
    # Ignore rows with a missing director or the 'unknown' placeholder
    has_director = df['director_clean'].notna() & df['director_clean'].ne('unknown')
    df_dir = df[has_director]

    # Mean rating per director; keep the twelve highest
    mean_by_director = df_dir.groupby('director_clean')['rating'].mean()
    dir_avg = mean_by_director.sort_values(ascending=False).head(12)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=dir_avg.values, y=dir_avg.index, ax=ax)
    ax.set_title("Top Directors by Average Rating")
    ax.set_xlabel("Average Rating")
    ax.set_ylabel("Director")
    plt.show()


# ============================
# G. Top Actors by Movie Count
# ============================

# One row per (movie, actor), keeping only non-missing, non-empty names
actors_ex = df.explode('actors_list')
actor_present = actors_ex['actors_list'].notna() & actors_ex['actors_list'].ne('')
actors_ex = actors_ex[actor_present]

# The twelve names that appear most often
actor_counts = actors_ex['actors_list'].value_counts().head(12)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=actor_counts.values, y=actor_counts.index, ax=ax)
ax.set_title("Top Actors by Movie Count")
ax.set_xlabel("Movie Count")
ax.set_ylabel("Actor")
plt.show()


# ============================
# H. Correlation Heatmap of Scaled Features
# ============================

# Every column whose name ends in '_scaled'; a heatmap needs at least two
scaled_cols = [c for c in df.columns if c.endswith('_scaled')]
if len(scaled_cols) >= 2:
    corr_matrix = df[scaled_cols].corr()
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
    ax.set_title("Correlation Heatmap (Scaled Features)")
    plt.show()


In [None]:
# ================================================================
# CELL 4 — Similarity & Dissimilarity (Fully Corrected)
# ================================================================

sample = df_sample.copy()

# ------------------------------------------------
# 1. NUMERIC FEATURES FOR SIMILARITY (scaled)
# ------------------------------------------------
# Keep the '<name>_scaled' columns that exist, in this fixed order.
base_numeric = ['runtime_min', 'budget_x', 'revenue', 'popularity', 'vote_count', 'rating']
num_cols = [name + "_scaled" for name in base_numeric
            if name + "_scaled" in sample.columns]

# With no scaled columns, fall back to an (n_rows, 0) block.
X_num = (
    sample[num_cols].fillna(0).to_numpy()
    if num_cols
    else np.zeros((len(sample), 0))
)

# ------------------------------------------------
# 2. GENRE BINARY ENCODING (top 20 genres)
# ------------------------------------------------
# Count every genre occurrence across the sample, row by row.
all_genres = Counter()
for genre_list in sample['genres_list']:
    all_genres.update(genre_list)
top_genres = [name for name, _count in all_genres.most_common(20)]

# One 0/1 column per top genre.
mlb = MultiLabelBinarizer(classes=top_genres)
genre_bin = mlb.fit_transform(sample['genres_list'])

# Final similarity matrix input: numeric block followed by the genre block
# (genre block alone when the numeric block is empty).
if X_num.size:
    X_sim = np.hstack([X_num, genre_bin])
else:
    X_sim = genre_bin

print("Numeric + Genre Matrix Shape:", X_sim.shape)

# ------------------------------------------------
# 3. Pairwise Euclidean, Manhattan, Cosine
# ------------------------------------------------
# Only the first few rows are compared so the printed matrices stay readable.
n_show = min(6, X_sim.shape[0])
X_head = X_sim[:n_show]

pair_eu = pairwise_distances(X_head, metric="euclidean")
pair_man = pairwise_distances(X_head, metric="manhattan")
# Cosine distance converted to similarity.
pair_cos = 1 - pairwise_distances(X_head, metric="cosine")

for heading, matrix in (
    ("Euclidean Distance", pair_eu),
    ("Manhattan Distance", pair_man),
    ("Cosine Similarity", pair_cos),
):
    print(f"\n{heading}:\n", np.round(matrix, 3))

# ------------------------------------------------
# 4. Jaccard & SMC (on genre binary only)
# ------------------------------------------------
def jaccard_matrix(M):
    """Return the pairwise Jaccard similarity between the rows of ``M``.

    Any non-zero entry counts as "present". For rows i and j the result is
    |i AND j| / |i OR j|, or 0.0 when both rows are entirely empty.

    Parameters
    ----------
    M : numpy.ndarray
        Array whose first axis indexes the items to compare.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n, n) with n = M.shape[0].
    """
    n = M.shape[0]
    J = np.zeros((n, n))
    if n == 0:
        return J

    # One flattened boolean row per item, stored as integers so the matrix
    # product below counts shared entries instead of OR-ing them.
    present = np.asarray(M).astype(bool)
    present = present.reshape(n, present.size // n).astype(np.int64)

    inter = present @ present.T                         # |i AND j| for all pairs
    sizes = present.sum(axis=1)                         # |i|
    union = sizes[:, None] + sizes[None, :] - inter     # |i OR j|

    # Divide only where the union is non-empty; the other cells stay 0.0.
    np.divide(inter, union, out=J, where=union > 0)
    return J

def smc_matrix(M):
    """Return the pairwise Simple Matching Coefficient between rows of ``M``.

    For rows a and b the coefficient is the number of positions holding equal
    values divided by the row length.

    Returns a float array of shape (n, n) with n = M.shape[0].
    """
    size = M.shape[0]
    S = np.zeros((size, size))
    # Visit every ordered pair of row indices.
    for a, b in np.ndindex(size, size):
        matches = (M[a] == M[b]).sum()
        S[a, b] = matches / len(M[a])
    return S

# Jaccard and SMC on the genre indicator block, for the first n_show rows
if genre_bin.size:
    genre_head = genre_bin[:n_show]
    jaccard_head = jaccard_matrix(genre_head)
    smc_head = smc_matrix(genre_head)
    print("\nJaccard Similarity (Genres):\n", np.round(jaccard_head, 3))
    print("\nSMC (Genres):\n", np.round(smc_head, 3))

# ------------------------------------------------
# 5. Set Difference (Example between first 2 movies)
# ------------------------------------------------
# Genres the first sampled movie has that the second does not.
if len(sample) >= 2:
    g0 = set(sample['genres_list'].iloc[0])
    g1 = set(sample['genres_list'].iloc[1])
    print("\nSet Difference (Movie0 - Movie1):", g0.difference(g1))


In [None]:
# ================================================================
# CELL 5 — Classification (Corrected & Dataset-Safe)
# ================================================================

clf_sample = df_sample.copy()

# ------------------------------------------------
# 1. Feature Construction (numeric_scaled + genre binary)
# ------------------------------------------------

# Scaled numeric columns from the preprocessing cell.
# 'rating_scaled' is excluded on purpose: the label below ('rating_bin') is a
# binning of 'rating', so keeping a rescaled copy of rating as a feature would
# leak the target and make every accuracy figure meaningless.
feat_num = [
    c for c in clf_sample.columns
    if c.endswith("_scaled") and c != "rating_scaled"
]

# Top-20 genres by frequency, counted from this cell's own data so the cell
# does not depend on a Counter left behind by another cell.
clf_genre_counts = Counter(
    g for genres in clf_sample["genres_list"] for g in genres
)
top_gen = [g for g, _ in clf_genre_counts.most_common(20)]
mlb = MultiLabelBinarizer(classes=top_gen)
genre_bin = mlb.fit_transform(clf_sample["genres_list"])

# Add one 0/1 column per genre
for i, g in enumerate(top_gen):
    clf_sample[f"genre_{g}"] = genre_bin[:, i]

feat_cols = feat_num + [f"genre_{g}" for g in top_gen]

print("Total feature columns:", len(feat_cols))


X = clf_sample[feat_cols].fillna(0)

# Convert rating_bin to integer category codes (a missing bin becomes -1)
y = clf_sample["rating_bin"].astype("category").cat.codes


# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5-fold stratified CV splitter, reused by the tree models below
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


# Distance-weighted k-NN: cross-validate on the training split, then refit on
# the whole training split and score on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=5, weights="distance")
knn_cv_scores = cross_val_score(knn, X_train, y_train, cv=cv, scoring="accuracy")

y_pred_knn = knn.fit(X_train, y_train).predict(X_test)
knn_test_accuracy = accuracy_score(y_test, y_pred_knn)
knn_confusion = confusion_matrix(y_test, y_pred_knn)

print("\n===== KNN Weighted Classification =====")
print("KNN CV Mean Accuracy:", knn_cv_scores.mean())
print("KNN Test Accuracy:", knn_test_accuracy)
print("KNN Confusion Matrix:\n", knn_confusion)


# Depth-limited decision trees, one per split criterion
tree_settings = {"max_depth": 6, "min_samples_leaf": 10, "random_state": 42}

for crit in ("gini", "entropy"):
    dt = DecisionTreeClassifier(criterion=crit, **tree_settings)

    # Cross-validate on the training split, then refit and predict the test split.
    dt_cv_scores = cross_val_score(dt, X_train, y_train, cv=cv, scoring="accuracy")
    preds_dt = dt.fit(X_train, y_train).predict(X_test)

    print(f"\n===== Decision Tree ({crit.upper()}) =====")
    print("CV Mean Accuracy:", dt_cv_scores.mean())
    print("Test Accuracy:", accuracy_score(y_test, preds_dt))
    print("Confusion Matrix:\n", confusion_matrix(y_test, preds_dt))


# Tree with no depth or leaf-size limit, used to compare train vs test accuracy
dt_deep = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

deep_train_pred = dt_deep.predict(X_train)
deep_test_pred = dt_deep.predict(X_test)
train_acc = accuracy_score(y_train, deep_train_pred)
test_acc = accuracy_score(y_test, deep_test_pred)

print("\n===== Overfitting Check =====")
print("Deep Tree Train Accuracy:", train_acc)
print("Deep Tree Test Accuracy:", test_acc)
print("(Large difference = Overfitting)")


In [None]:
# ================================================================
# CELL 6 — Association Rule Mining (Apriori)
# ================================================================

# One transaction per sampled movie with a non-empty genre list:
# the movie's distinct genres as a sorted tuple.
transactions = []
for genre_items in df_sample["genres_list"]:
    if genre_items:
        transactions.append(tuple(sorted(set(genre_items))))

def apriori(transactions, min_support=0.02):
    """Mine frequent itemsets with the level-wise Apriori algorithm.

    Parameters
    ----------
    transactions : sequence of iterables
        Each element is the collection of items in one transaction.
    min_support : float, default 0.02
        Minimum fraction of transactions an itemset must appear in.

    Returns
    -------
    dict
        Maps each frequent itemset (a sorted tuple of items) to its support,
        i.e. the fraction of transactions containing every item in it.
    """
    n = len(transactions)
    if n == 0:
        return {}

    # Convert each transaction to a set once; the support loop below tests
    # membership against these for every candidate.
    transaction_sets = [frozenset(t) for t in transactions]

    L = {}

    # Frequent 1-itemsets. An item is counted once per transaction
    # (dict.fromkeys de-duplicates while keeping first-seen order), so a
    # support can never exceed 1.
    item_counts = Counter(
        item for t in transactions for item in dict.fromkeys(t)
    )
    L1 = {(i,): c / n for i, c in item_counts.items() if c / n >= min_support}
    L.update(L1)

    current_L = set(L1.keys())
    k = 1

    while current_L:
        # Join step: merge two frequent k-itemsets into a (k+1)-itemset.
        # Prune step: keep it only if every k-subset is itself frequent.
        candidates = set()
        for a, b in itertools.combinations(sorted(current_L), 2):
            union = tuple(sorted(set(a) | set(b)))
            if len(union) == k + 1 and all(
                sub in current_L for sub in itertools.combinations(union, k)
            ):
                candidates.add(union)

        # Count support; sorted() makes the insertion order of the result
        # independent of set iteration order.
        cand_support = {}
        for cand in sorted(candidates):
            cand_set = frozenset(cand)
            cnt = sum(1 for t in transaction_sets if cand_set <= t)
            sup = cnt / n
            if sup >= min_support:
                cand_support[cand] = sup

        L.update(cand_support)
        current_L = set(cand_support.keys())
        k += 1

    return L


# Mine frequent genre itemsets (minimum support 2%)
freq_itemsets = apriori(transactions, min_support=0.02)

# Tabulate as (itemset, support), highest support first
freq_df = pd.DataFrame(list(freq_itemsets.items()), columns=["itemset", "support"])
freq_df = freq_df.sort_values("support", ascending=False)

print("Frequent itemsets:")
display(freq_df.head(20))


# =====================
# Create Association Rules
# =====================
def generate_rules(freq_itemsets, transactions, min_conf=0.3):
    """Derive association rules A -> B from frequent itemsets.

    For every frequent itemset with at least two items, each non-empty proper
    subset A is tried as antecedent with B = itemset - A as consequent.

    Parameters
    ----------
    freq_itemsets : dict
        Maps itemsets (sorted tuples) to their support.
    transactions : sequence of iterables
        Only scanned when the support of a subset is missing from
        ``freq_itemsets``.
    min_conf : float, default 0.3
        Minimum confidence support(A u B) / support(A) for a rule to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns: antecedent, consequent, support, confidence, lift; sorted by
        lift then confidence, both descending. Empty (same columns) when no
        rule qualifies.
    """
    n = len(transactions)
    columns = ["antecedent", "consequent", "support", "confidence", "lift"]
    scanned = {}        # supports obtained by scanning, keyed by itemset
    tx_sets = []        # transactions as frozensets, built on first scan only

    def _support(items):
        """Return the mined support of ``items``, scanning only if unknown."""
        if items in freq_itemsets:
            return freq_itemsets[items]
        if items not in scanned:
            if n == 0:
                scanned[items] = 0.0
            else:
                if not tx_sets:
                    tx_sets.extend(frozenset(t) for t in transactions)
                wanted = frozenset(items)
                scanned[items] = sum(1 for t in tx_sets if wanted <= t) / n
        return scanned[items]

    rules = []
    for itemset in freq_itemsets:
        if len(itemset) < 2:
            continue

        sup_item = freq_itemsets[itemset]

        # All non-empty proper subsets of the itemset.
        subsets = [
            tuple(sorted(s))
            for i in range(1, len(itemset))
            for s in itertools.combinations(itemset, i)
        ]

        for A in subsets:
            B = tuple(sorted(set(itemset) - set(A)))
            sup_A = _support(A)
            if sup_A == 0:
                continue

            conf = sup_item / sup_A
            if conf < min_conf:
                continue

            sup_B = _support(B)
            # lift = confidence / support(B); undefined when support(B) is 0.
            lift = conf / sup_B if sup_B else np.nan

            rules.append({
                "antecedent": A,
                "consequent": B,
                "support": sup_item,
                "confidence": conf,
                "lift": lift
            })

    if rules:
        return pd.DataFrame(rules).sort_values(["lift", "confidence"], ascending=False)
    return pd.DataFrame(columns=columns)


# Build rules with at least 30% confidence and show the first 20 rows
rules_df = generate_rules(freq_itemsets, transactions, min_conf=0.3)

print("Top Association Rules:")
top_rules = rules_df.head(20)
display(top_rules)


# Closed and maximal itemsets
#   closed  : no different frequent superset has exactly the same support
#   maximal : no different frequent superset exists at all
itemset_as_set = {items: set(items) for items in freq_itemsets}

closed = []
maximal = []

for s, s_set in itemset_as_set.items():
    # Frequent itemsets that contain s and differ from it.
    supersets = [
        other for other, other_set in itemset_as_set.items()
        if other != s and s_set.issubset(other_set)
    ]
    is_closed = all(freq_itemsets[other] != freq_itemsets[s] for other in supersets)
    is_max = not supersets
    if is_closed:
        closed.append(s)
    if is_max:
        maximal.append(s)

print("Closed frequent sets (sample):", closed[:10])
print("Maximal frequent sets (sample):", maximal[:10])


In [None]:
# ================================================================
# CELL 7 — Clustering: KMeans (Elbow, Silhouette) + DBSCAN
# ================================================================


# Numeric part: every '*_scaled' column of the sample
cluster_num_cols = [c for c in df_sample.columns if c.endswith("_scaled")]


# Genre part: indicator columns for the 20 most frequent genres. The counts
# are computed here from df_sample, so the cell neither depends on a Counter
# left behind by another cell nor takes the first 20 keys in first-seen order.
cluster_genre_counts = Counter(
    g for genres in df_sample["genres_list"] for g in genres
)
top_gen_cluster = [g for g, _ in cluster_genre_counts.most_common(20)]
mlb_cluster = MultiLabelBinarizer(classes=top_gen_cluster)
genre_bin_cluster = mlb_cluster.fit_transform(df_sample["genres_list"])


# Rows follow df_sample order: numeric block first, then the genre block
X_cluster = np.hstack([
    df_sample[cluster_num_cols].fillna(0).to_numpy(),
    genre_bin_cluster
])

print("Clustering matrix shape:", X_cluster.shape)


# ---------------------------------------------------------------
# 1. Elbow Method for KMeans
# ---------------------------------------------------------------
# Fit KMeans for k = 2..8 and record the inertia of each fit.
K_range = range(2, 9)
inertia = []

for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_cluster)
    inertia.append(km.inertia_)

fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(list(K_range), inertia, marker="o")
ax.set_title("Elbow Method for KMeans")
ax.set_xlabel("k")
ax.set_ylabel("Inertia")
ax.grid(True)
plt.show()


# ---------------------------------------------------------------
# 2. Silhouette Score for a chosen k
# ---------------------------------------------------------------
# k = 4 when there are at least four rows, otherwise 2.
k_choice = 4 if X_cluster.shape[0] >= 4 else 2

km = KMeans(n_clusters=k_choice, random_state=42, n_init=10)
labels_km = km.fit_predict(X_cluster)

# The silhouette score needs at least two distinct cluster labels.
if len(set(labels_km)) <= 1:
    print("Silhouette cannot be computed (only one cluster found).")
else:
    sil = silhouette_score(X_cluster, labels_km)
    print(f"Silhouette Score (k={k_choice}):", sil)


# ---------------------------------------------------------------
# 3. DBSCAN clustering
# ---------------------------------------------------------------
db = DBSCAN(eps=0.5, min_samples=5)
labels_db = db.fit_predict(X_cluster)

# Label -1 marks noise, so it is not counted as a cluster.
distinct_labels = set(labels_db)
n_clusters = len(distinct_labels) - int(-1 in distinct_labels)
n_noise = int(np.sum(labels_db == -1))

print(f"DBSCAN: {n_clusters} clusters found")
print(f"DBSCAN: {n_noise} noise points detected")


In [None]:
# CELL 8: Anomaly Detection (3-sigma, KNN proximity-based, DBSCAN noise). Evaluation template (confusion matrix)


# 3-sigma rule on a single monetary column.
# Priority: 'revenue', then 'budget', then 'budget_x' (the budget column name
# the preprocessing cell works with).
an_col = next(
    (c for c in ('revenue', 'budget', 'budget_x') if c in df.columns),
    None,
)
if an_col:
    # Coerce to numbers first; unparseable or missing values count as 0.
    s = pd.to_numeric(df[an_col], errors='coerce').fillna(0)
    mu, sigma = s.mean(), s.std()
    ultra_upper = mu + 3*sigma
    ultra_lower = mu - 3*sigma
    # Rows more than three standard deviations away from the mean.
    an3_idx = df.index[(s > ultra_upper) | (s < ultra_lower)]
    print(f"3-sigma anomalies on {an_col}: {len(an3_idx)} rows (sample):")
    # Show the title column only when the frame actually has one.
    show_cols = [c for c in ('names', an_col) if c in df.columns]
    display(df.loc[an3_idx, show_cols].head())

# Proximity-based: mean distance to the k nearest neighbours as anomaly score.
# X_cluster comes from the clustering cell; its rows follow df_sample order.
# kneighbors() is called without a query matrix so that a point is not
# returned as its own zero-distance neighbour, which would shrink every score.
knn_k = min(5, X_cluster.shape[0] - 1)   # cannot ask for more neighbours than other points
if knn_k >= 1:
    nbrs = NearestNeighbors(n_neighbors=knn_k).fit(X_cluster)
    dists, idxs = nbrs.kneighbors()
    anom_scores = dists.mean(axis=1)
    # Flag the rows whose score lies above the 95th percentile.
    threshold = np.percentile(anom_scores, 95)
    knn_anom_idx = np.where(anom_scores > threshold)[0]
    print("KNN proximity anomalies (rows above the 95th percentile):", len(knn_anom_idx))
    # Five highest-scoring rows, as positions in df_sample.
    top5_pos = np.argsort(anom_scores)[::-1][:5]
    print("Highest anomaly scores (df_sample row positions):", top5_pos)
    print("Their anomaly scores:", np.round(anom_scores[top5_pos], 4))
else:
    print("KNN proximity anomalies skipped: fewer than two rows in X_cluster.")

# Clustering-based: rows DBSCAN labelled as noise (-1).
# Runs only if the clustering cell has already defined labels_db.
if 'labels_db' in globals():
    noise_mask = labels_db == -1
    db_noise_idx = np.where(noise_mask)[0]
    print("DBSCAN noise points count:", len(db_noise_idx))

# Evaluation note: if ground-truth anomaly labels existed (y_true binary), compute:
# from sklearn.metrics import confusion_matrix
# cm = confusion_matrix(y_true, y_pred); print(cm)


In [None]:
# CELL 9: Director helper (dataset-safe) + save cleaned datasets

import matplotlib.pyplot as plt
from IPython.display import display

def show_top3_director(director_name, df_full=None):
    """Show and plot the three highest-rated movies of one director.

    Matching on ``director_clean`` is case-insensitive and ignores surrounding
    whitespace. Movies are ordered by rating (descending) and, when a release
    column exists, by that column (descending) as a tie-break.

    Parameters
    ----------
    director_name : str
        Director to look up.
    df_full : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``df`` as it exists
        when the function is called, so a later re-assignment of ``df`` is
        picked up.

    Returns
    -------
    None
        A table and a chart are displayed. A message is printed instead when
        the frame lacks ``director_clean`` or ``rating``, or when no movie
        matches.
    """
    if df_full is None:
        df_full = df

    if 'director_clean' not in df_full.columns:
        print("No 'director_clean' column in dataset.")
        return

    director_mask = (
        df_full['director_clean'].astype(str).str.strip().str.lower()
        == director_name.strip().lower()
    )
    sub = df_full[director_mask].copy()

    if sub.empty:
        print("No movies for director:", director_name)
        return

    # First available release column / runtime column, in priority order
    release_col = next(
        (c for c in ('release_year', 'release_date', 'date_x', 'year') if c in sub.columns),
        None,
    )
    runtime_col = next(
        (c for c in ('runtime_min', 'runtime') if c in sub.columns),
        None,
    )

    if 'rating' not in sub.columns:
        print("No 'rating' column found to sort by.")
        return

    # Sort by rating (desc), then by the release column (desc) as tie-break
    sort_cols = ['rating'] + ([release_col] if release_col else [])
    sub = sub.sort_values(by=sort_cols, ascending=[False] * len(sort_cols))

    top3 = sub.head(3).copy()

    # Table: only the columns that exist
    display_cols = []
    if 'names' in top3.columns:
        display_cols.append('names')
    if release_col:
        display_cols.append(release_col)
    display_cols.append('rating')
    if runtime_col:
        display_cols.append(runtime_col)

    print(f"Top {len(top3)} movies for director: {director_name}")
    display(top3[display_cols])

    # Tick labels: movie titles when available, otherwise the row labels
    if 'names' in top3.columns:
        names = top3['names'].astype(str).tolist()
    else:
        names = [str(label) for label in top3.index]
    # Numeric x positions keep two movies with the same title as separate bars
    positions = list(range(len(top3)))

    # Plot: bars for rating, line for runtime (if a runtime column exists)
    fig, ax1 = plt.subplots(figsize=(9,5))
    ax1.bar(positions, top3['rating'].astype(float), alpha=0.7)
    ax1.set_ylabel('Rating')
    # NOTE(review): 10 is assumed as the minimum axis height; the 1.1 factor
    # leaves headroom when the ratings are larger than that.
    ax1.set_ylim(0, max(10, top3['rating'].max() * 1.1))
    # Set and rotate the labels on ax1 itself: after twinx() the current axes
    # is the twin, whose x-axis is hidden, so plt.xticks() would have no
    # visible effect.
    ax1.set_xticks(positions)
    ax1.set_xticklabels(names, rotation=30, ha='right')

    # Runtime on a secondary y axis; best-effort, but failures are reported
    if runtime_col:
        try:
            runtimes = pd.to_numeric(top3[runtime_col], errors='coerce')
            ax2 = ax1.twinx()
            ax2.plot(positions, runtimes, color='C1', marker='o', linewidth=2)
            ax2.set_ylabel('Runtime (min)')
        except (TypeError, ValueError) as exc:
            print(f"Runtime overlay skipped: {exc}")

    ax1.set_title(f"Top {len(top3)} Movies for {director_name} — Rating & Runtime")
    plt.tight_layout()
    plt.show()


# Save cleaned datasets (full and sampled) to the Kaggle working directory
full_path = "/kaggle/working/imdb_cleaned_full.csv"
sample_path = "/kaggle/working/imdb_cleaned_sample.csv"

# Each frame is written only if a global with that name exists; otherwise a
# warning is printed and the notebook carries on.
for frame_name, out_path, label in (
    ("df", full_path, "full"),
    ("df_sample", sample_path, "sample"),
):
    if frame_name in globals():
        globals()[frame_name].to_csv(out_path, index=False)
        print(f"Saved cleaned {label} dataset to: {out_path}")
    else:
        print(f"Warning: '{frame_name}' not found, {label} dataset not saved.")
