<a href="https://colab.research.google.com/github/AayushMandavia/Mavon/blob/main/Movie_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, XGBRegressor
import joblib

In [35]:
movies = pd.read_csv('/content/drive/MyDrive/Machine Learning/movies.csv')
movies.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,"runtime,,"
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000,46998772.0,Warner Bros.,"146.0,"
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000,58853106.0,Columbia Pictures,"104.0,"
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000,538375067.0,Lucasfilm,"124.0,"
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000,83453539.0,Paramount Pictures,"88.0,"
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000,39846344.0,Orion Pictures,"98.0,"


In [36]:
movies.columns = movies.columns.str.strip().str.lower()
if 'runtime,,' in movies.columns:
    movies.rename(columns={'runtime,,':'runtime'}, inplace=True)
movies.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000,46998772.0,Warner Bros.,"146.0,"
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000,58853106.0,Columbia Pictures,"104.0,"
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000,538375067.0,Lucasfilm,"124.0,"
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000,83453539.0,Paramount Pictures,"88.0,"
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000,39846344.0,Orion Pictures,"98.0,"


In [37]:
movies.dtypes

Unnamed: 0,0
name,object
rating,object
genre,object
year,int64
released,object
score,float64
votes,int64
director,object
writer,object
star,object


In [38]:
movies['budget'] = pd.to_numeric(movies['budget'], errors='coerce')
movies['gross'] = pd.to_numeric(movies['gross'], errors='coerce')
movies['votes'] = pd.to_numeric(movies['votes'], errors='coerce')
movies['runtime'] = pd.to_numeric(movies['runtime'], errors='coerce')
movies['score'] = pd.to_numeric(movies['score'], errors='coerce')
movies['released'] = pd.to_datetime(movies['released'], errors='coerce')

  movies['released'] = pd.to_datetime(movies['released'], errors='coerce')


In [39]:
movies = movies.dropna(subset=['budget', 'gross'])

In [40]:
movies['release_year'] = movies['released'].dt.year
movies['release_month'] = movies['released'].dt.month
movies['release_weekday'] = movies['released'].dt.weekday
movies['title_length'] = movies['name'].astype(str).apply(len)
movies['profit_ratio'] = movies['gross'] / (movies['budget'] + 1)

In [41]:
def classify(row):
    if row['gross'] >= 2 * row['budget']:
        return 'Hit'
    elif row['gross'] < 0.5 * row['budget']:
        return 'Flop'
    else:
        return 'Average'

movies['success'] = movies.apply(classify, axis=1)

In [42]:
movies.head()
movies['success'].value_counts()
movies.isna().sum()

Unnamed: 0,0
name,0
rating,27
genre,0
year,0
released,3831
score,0
votes,0
director,0
writer,1
star,0


In [43]:
y_reg = movies['gross']

In [44]:
y_clf = movies['success']

In [45]:
features = [
    'budget',
    'score',
    'votes',
    'runtime',
    'release_year',
    'release_month',
    'title_length'
]

X = movies[features]

In [46]:
from sklearn.model_selection import train_test_split
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)

X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

In [47]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
reg_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42
)
reg_model.fit(X_train_reg, y_train_reg)

In [48]:
y_pred_reg = reg_model.predict(X_test_reg)
rmse = mean_squared_error(y_test_reg, y_pred_reg) ** 0.5
r2 = r2_score(y_test_reg, y_pred_reg)
print("RMSE:", rmse)
print("R² Score:", r2)


RMSE: 80353888.43751803
R² Score: 0.4883328944205123


In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

clf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42
)

clf_model.fit(X_train_clf, y_train_clf)

In [50]:
y_pred_clf = clf_model.predict(X_test_clf)
print(classification_report(y_test_clf, y_pred_clf))
print(confusion_matrix(y_test_clf, y_pred_clf))

              precision    recall  f1-score   support

     Average       0.52      0.48      0.50       185
        Flop       0.61      0.53      0.57       131
         Hit       0.83      0.89      0.86       451

    accuracy                           0.73       767
   macro avg       0.65      0.63      0.64       767
weighted avg       0.72      0.73      0.72       767

[[ 88  32  65]
 [ 44  69  18]
 [ 36  12 403]]


In [51]:
import joblib
import pandas as pd
reg_model = joblib.load("/content/drive/MyDrive/Movie Prediction/revenue_model.pkl")
clf_model = joblib.load("/content/drive/MyDrive/Movie Prediction/success_model.pkl")
def predict_movie_success(budget, score, votes, runtime, release_year, release_month, title):
    df = pd.DataFrame([{
        'budget': budget,
        'score': score,
        'votes': votes,
        'runtime': runtime,
        'release_year': release_year,
        'release_month': release_month,
        'title_length': len(title)
    }])
    predicted_gross = reg_model.predict(df)[0]
    predicted_class = clf_model.predict(df)[0]
    return predicted_gross, predicted_class

In [52]:
gross, label = predict_movie_success(
    budget = 150000000,
    score = 7.8,
    votes = 250000,
    runtime = 140,
    release_year = 2024,
    release_month = 5,
    title = "Future Action Movie"
)
print("Predicted Gross:", gross)
print("Predicted Success:", label)

Predicted Gross: 264499410.0
Predicted Success: Hit


In [10]:
import joblib
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
import time

reg_model = joblib.load("/content/drive/MyDrive/Movie Prediction/revenue_model.pkl")
clf_model = joblib.load("/content/drive/MyDrive/Movie Prediction/success_model.pkl")

def predict_movie(budget, score, votes, runtime, release_year, release_month, title):
    df = pd.DataFrame([{
        "budget": float(budget),
        "score": max(0, min(10, float(score))),
        "votes": max(0, min(100, int(votes))),
        "runtime": int(runtime),
        "release_year": int(release_year),
        "release_month": int(release_month),
        "title_length": len(title)
    }])

    predicted_gross = reg_model.predict(df)[0]
    predicted_class = clf_model.predict(df)[0]

    return predicted_gross, predicted_class

title = widgets.Text(description="Title")
budget = widgets.FloatText(description="Budget")
score = widgets.FloatSlider(description="Score", min=0, max=10, step=0.1)
votes = widgets.IntSlider(description="Votes", min=0, max=100)
runtime = widgets.IntText(description="Runtime")
release_year = widgets.IntText(description="Year")
release_month = widgets.IntSlider(description="Month", min=1, max=12)

predict_btn = widgets.Button(description="Predict", button_style='danger')
output = widgets.HTML()

def on_predict_clicked(b):
    output.value = "<p>Predicting...</p>"
    time.sleep(1)

    gross, cls = predict_movie(
        budget.value, score.value, votes.value,
        runtime.value, release_year.value, release_month.value, title.value
    )

    output.value = f"""
    <h3>Prediction Result</h3>
    <p><b>Predicted Gross:</b> ${gross:,.2f}</p>
    <p><b>Success Class:</b> {cls}</p>
    """

predict_btn.on_click(on_predict_clicked)

display(
    widgets.VBox([
        title, budget, score, votes, runtime, release_year, release_month,
        predict_btn, output
    ])
)

VBox(children=(Text(value='', description='Title'), FloatText(value=0.0, description='Budget'), FloatSlider(va…