# Movie Success Prediction

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
import warnings
import pickle

warnings.filterwarnings('ignore')

## 2. Load Data

In [None]:
# Read the raw movie metadata from the working directory, report its
# (rows, columns) shape and preview the first five records.
df = pd.read_csv(filepath_or_buffer='movie_metadata.csv')
print(df.shape)
df.head(n=5)

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Missing-value overview: a heatmap of the null mask followed by the
# per-column count of missing entries.
missing_mask = df.isnull()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(missing_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

print(missing_mask.sum())

In [None]:
# Distribution of the IMDB score (the column the target label is derived
# from): 20-bin histogram with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='imdb_score', bins=20, kde=True, ax=ax)
ax.set_title("Distribution of IMDB Scores")
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap.
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

## 4. Data Preprocessing & Feature Engineering

In [None]:
# 1. Drop duplicates
df.drop_duplicates(inplace=True)

# 2. Handle Missing Values
# Rows without an IMDB score have no ground-truth label. The median imputer
# below runs over every numeric column, so it would otherwise invent a score
# (and therefore a class) for them. Drop those rows instead of imputing the
# target.
df.dropna(subset=['imdb_score'], inplace=True)

# Numeric: Median imputation
num_cols = df.select_dtypes(include=[np.number]).columns
imputer_num = SimpleImputer(strategy='median')
df[num_cols] = imputer_num.fit_transform(df[num_cols])

# Categorical: Mode imputation, applied to every object-dtype column
cat_cols = df.select_dtypes(include=['object']).columns
imputer_cat = SimpleImputer(strategy='most_frequent')
df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

# 3. Create Target Variable 'Classify'
# score < 3 -> Flop, 3 <= score < 6 -> Average, everything else -> Hit
def classify_movie(score):
    """Map a numeric IMDB score to a 'Flop', 'Average' or 'Hit' label.

    The thresholds are exclusive upper bounds checked in ascending order:
    a score below 3 is a 'Flop', a score below 6 is 'Average', and any
    score that is below neither bound is a 'Hit'. NaN compares false
    against both bounds and therefore also yields 'Hit'.
    """
    for upper_bound, label in ((3, 'Flop'), (6, 'Average')):
        if score < upper_bound:
            return label
    return 'Hit'

# Derive the class label for every movie from its IMDB score.
df['Classify'] = df['imdb_score'].apply(classify_movie)

print(df['Classify'].value_counts())

# 4. Feature Selection
# User requested specific inputs for app: genre, imdb_score, fb_likes, budget, year
# We will train the model using: 'genres', 'cast_total_facebook_likes', 'budget', 'title_year'
# Note: 'imdb_score' is left out of X because y is computed from it (leakage)

# Simplify Genre: keep only the first entry of the '|'-separated genre list;
# non-string values are passed through unchanged.
df['main_genre'] = df['genres'].apply(lambda x: x.split('|')[0] if isinstance(x, str) else x)

# Select Features
features = ['main_genre', 'cast_total_facebook_likes', 'budget', 'title_year']
# Take an explicit copy: the encoding step below assigns a column on X, and
# assigning into a selection of df raises pandas' SettingWithCopyWarning.
X = df[features].copy()
y = df['Classify']

# 5. Encoding
# Replace each genre string with its integer code so the models get numeric input.
le_genre = LabelEncoder()
X['main_genre'] = le_genre.fit_transform(X['main_genre'])

# Save LabelEncoder for App
with open('label_encoder_genre.pkl', 'wb') as f:
    pickle.dump(le_genre, f)

# 6. Scaling (Optional for RF but good for others)
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in this file, so test-set statistics influence the
# scaling. Kept as-is because the fitted scaler is also saved to scaler.pkl.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save Scaler for App
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

## 5. Model Development

In [None]:
# Split Data
# The label is derived from score thresholds, so the classes need not be
# balanced. Stratify the split so train and test keep the same class
# proportions. Stratification needs at least two samples per class, so fall
# back to a plain random split when that is not the case.
can_stratify = y.value_counts().min() >= 2
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y if can_stratify else None,
)

# Initialize Models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42)
}

# Test-set accuracy per model name, filled in by the loop below.
results = {}

# Train and Evaluate: fit each model on the training split, then report
# accuracy and the per-class classification report on the held-out split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"--- {name} ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print("-"*30)

# Visualize Results
# Materialise the dict views as lists so matplotlib receives plain sequences.
plt.figure(figsize=(8,5))
plt.bar(list(results.keys()), list(results.values()), color=['blue', 'green', 'orange'])
plt.title("Model Comparison - Accuracy")
plt.ylabel("Accuracy")
plt.show()

## 6. Final Model Saving

In [None]:
# The Random Forest is the model the app loads, so pick it out of the
# trained candidates and serialise it to model.pkl.
final_model = models['Random Forest']
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(final_model, model_file)
print("Model saved as model.pkl")