In [5]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Load your dataset
df = pd.read_csv('final_dataset.csv')

# --- Feature Engineering ---

# Convert release_date to datetime and calculate recency (in days)
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
today = pd.to_datetime('2025-06-08')
df['recency_days'] = (today - df['release_date']).dt.days
df['recency_days'] = df['recency_days'].fillna(df['recency_days'].max())
df['recency_norm'] = 1 - (df['recency_days'] / df['recency_days'].max())

# Log-transform budget and revenue for scaling
df['budget_log'] = np.log1p(df['budget'])
df['revenue_log'] = np.log1p(df['revenue'])

# Normalize popularity
df['popularity_norm'] = (df['popularity'] - df['popularity'].min()) / (df['popularity'].max() - df['popularity'].min())

# TF-IDF on keywords (simple sum as a proxy for 'appealing' keywords)
tfidf = TfidfVectorizer(max_features=30)
tfidf_matrix = tfidf.fit_transform(df['keywords'].fillna(''))
df['keywords_score'] = tfidf_matrix.sum(axis=1).A1
df['keywords_norm'] = (df['keywords_score'] - df['keywords_score'].min()) / (df['keywords_score'].max() - df['keywords_score'].min())

# --- Synthetic Piracy Risk Target ---
# (Weighted sum as a proxy target for ML)
df['piracy_risk_proxy'] = (
    0.3 * df['budget_log'] +
    0.25 * df['revenue_log'] +
    0.25 * df['popularity_norm'] +
    0.15 * df['recency_norm'] +
    0.05 * df['keywords_norm']
)
df['piracy_risk_proxy'] = (df['piracy_risk_proxy'] - df['piracy_risk_proxy'].min()) / (df['piracy_risk_proxy'].max() - df['piracy_risk_proxy'].min())

# --- ML Model Training ---

feature_cols = ['budget_log', 'revenue_log', 'popularity_norm', 'recency_norm', 'keywords_norm']
X = df[feature_cols]
y = df['piracy_risk_proxy']

# Split for demonstration (in practice, use all data for prediction)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict piracy risk for all movies
df['piracy_risk_pred'] = model.predict(X)

# --- Filter and Save High-Risk Movies ---
high_risk = df[df['piracy_risk_pred'] > 0.8].sort_values('piracy_risk_pred', ascending=False)
high_risk.to_csv('high_piracy_risk_ml_movies.csv', index=False)

print(f"High-risk movies found: {len(high_risk)}")
print(high_risk[['title', 'piracy_risk_pred', 'budget', 'revenue', 'popularity', 'release_date']].head(10))
high_risk.to_csv('high_piracy_risk_ml_movies.csv', index=False)
print("Saved high_piracy_risk_ml_movies.csv")

High-risk movies found: 427
                                          title  piracy_risk_pred     budget  \
0                      Avatar: The Way of Water          0.991149  460000000   
4                             Avengers: Endgame          0.981311  356000000   
7                             Star Light Exodus          0.971110  312000000   
8                        Avengers: Infinity War          0.963720  300000000   
3                       Avengers: Age of Ultron          0.954820  365000000   
5                       Denis Villeneuve's Dune          0.951412  355000000   
14                                The Lion King          0.946924  260000000   
6                                        Fast X          0.942153  340000000   
2   Pirates of the Caribbean: On Stranger Tides          0.942096  379000000   
67                      Spider-Man: No Way Home          0.940921  200000000   

       revenue  popularity release_date  
0   2320250281    241.2850   2022-12-14  
4   280