In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from ipywidgets import interact, widgets

In [5]:
# Load the pre-cleaned movie dataset; the path is relative to the notebook's
# current working directory.
DATA_PATH = 'final_cleaned_data.csv'
final_cleaned_data = pd.read_csv(DATA_PATH)

In [6]:
# Coerce the numeric columns (Budget, imdbRating, BoxOffice) to float, with
# missing entries mapped to 0.
#
# NOTE: with pandas' default NA parsing, read_csv already turns the literal text
# 'N/A' into NaN, so replacing only the string 'N/A' would leave those NaNs in
# place (and NaN inputs/targets make LinearRegression.fit fail). The helper
# below therefore treats both real NaN and the textual markers as missing.
def _to_float_or_zero(values, decimal_comma=False):
    """Return ``values`` as a float Series with missing entries replaced by 0.

    Parameters
    ----------
    values : pandas.Series
        Column to convert; may hold numbers, numeric strings, NaN or 'N/A'.
    decimal_comma : bool, default False
        When true, the values are first converted to text and every comma is
        replaced by a dot (e.g. '7,5' -> 7.5) before parsing.

    Raises
    ------
    ValueError
        If an entry is neither missing nor parseable as a number, so malformed
        data is surfaced instead of being silently turned into 0.
    """
    if decimal_comma:
        # astype(str) renders NaN as the text 'nan'; it is masked out again below.
        values = values.astype(str).str.replace(',', '.', regex=False)
    values = values.mask(values.isin(['N/A', 'nan']))
    return pd.to_numeric(values).fillna(0).astype(float)


final_cleaned_data['Budget'] = _to_float_or_zero(final_cleaned_data['Budget'])
final_cleaned_data['imdbRating'] = _to_float_or_zero(final_cleaned_data['imdbRating'], decimal_comma=True)
final_cleaned_data['BoxOffice'] = _to_float_or_zero(final_cleaned_data['BoxOffice'])

# Feature transformation: model inputs and the two regression targets.
features = final_cleaned_data[['Budget', 'Director', 'Actors', 'Genre']]
target = final_cleaned_data[['imdbRating', 'BoxOffice']]


def build_regression_pipeline():
    """Return a new, unfitted preprocessing + linear-regression pipeline.

    The categorical columns ('Director', 'Actors', 'Genre') are one-hot encoded
    with unknown categories ignored at prediction time, and 'Budget' is
    standardised. Every call creates its own ColumnTransformer: Pipeline.fit
    fits its steps in place, so sharing one transformer instance between two
    pipelines would make the second fit overwrite the first one's preprocessor.
    """
    preprocessor = ColumnTransformer(transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Director', 'Actors', 'Genre']),
        ('num', StandardScaler(), ['Budget'])
    ])
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ])


# Model for IMDb Rating
imdb_pipeline = build_regression_pipeline()
imdb_pipeline.fit(features, target['imdbRating'])

# Model for BoxOffice
box_office_pipeline = build_regression_pipeline()
box_office_pipeline.fit(features, target['BoxOffice'])

# Kept so that any other cell referring to `column_transformer` still finds a
# fitted preprocessor under that name.
column_transformer = imdb_pipeline.named_steps['preprocessor']

# Widgets for interactive input.
# continuous_update=False makes each widget report its value only once the edit
# is committed (slider released / text submitted), so a callback attached via
# `interact` is not re-run on every keystroke or intermediate slider position.
budget_widget = widgets.FloatSlider(
    description='Budget',
    min=100000,
    max=1000000000,
    step=50000,
    continuous_update=False,
)
director_widget = widgets.Text(description='Director', continuous_update=False)
actors_widget = widgets.Text(description='Actors', continuous_update=False)
genre_widget = widgets.Text(description='Genre', continuous_update=False)

# Interactive prediction callback and its plotting helper
def _plot_budget_panel(ax, column, budget, prediction, title, ylabel):
    """Draw one 'actual data vs. prediction' scatter panel against Budget on ``ax``.

    ``column`` is the column of ``final_cleaned_data`` plotted on the y-axis;
    ``budget`` / ``prediction`` are the coordinates of the highlighted point.
    """
    ax.scatter(final_cleaned_data['Budget'], final_cleaned_data[column], alpha=0.5, label='Actual Data')
    ax.scatter(budget, prediction, color='red', label='Prediction', s=100)
    ax.set_title(title)
    ax.set_xlabel('Budget ($)')
    ax.set_ylabel(ylabel)
    ax.legend()


def predict_and_visualize(Budget, Director, Actors, Genre):
    """Predict IMDb rating and box-office revenue for one film and plot the result.

    The four arguments are assembled into a single-row DataFrame and passed to
    ``imdb_pipeline`` and ``box_office_pipeline``. Both predictions are printed,
    then shown as red markers on top of scatter plots of the dataset
    (IMDb rating vs. budget, box office vs. budget).

    Parameters
    ----------
    Budget : float
        Production budget, used as the x-coordinate of the prediction marker.
    Director, Actors, Genre : str
        Categorical inputs, passed to the pipelines as given.

    Returns
    -------
    None
        Output is produced via ``print`` and ``plt.show()`` only.
    """
    # Single-row frame with the same column names the pipelines were fitted on.
    input_data = pd.DataFrame({'Budget': [Budget], 'Director': [Director], 'Actors': [Actors], 'Genre': [Genre]})

    # predict() returns a one-element array for the one-row input; keep the scalar.
    imdb_pred = imdb_pipeline.predict(input_data)[0]
    boxoffice_pred = box_office_pipeline.predict(input_data)[0]

    # Output predictions
    print(f"Predicted IMDb Rating: {imdb_pred:.2f}")
    print(f"Predicted Box Office Revenue: ${boxoffice_pred:,.2f}")

    # Two side-by-side panels: dataset values vs. the predicted point.
    fig, (imdb_ax, boxoffice_ax) = plt.subplots(1, 2, figsize=(14, 6))
    _plot_budget_panel(imdb_ax, 'imdbRating', Budget, imdb_pred,
                       'IMDb Rating vs Budget', 'IMDb Rating')
    _plot_budget_panel(boxoffice_ax, 'BoxOffice', Budget, boxoffice_pred,
                       'Box Office vs Budget', 'Box Office Revenue ($)')

    fig.tight_layout()
    plt.show()

# Bind each input widget to the keyword argument of predict_and_visualize with
# the same name; the trailing semicolon suppresses the cell's return-value repr.
input_controls = {
    'Budget': budget_widget,
    'Director': director_widget,
    'Actors': actors_widget,
    'Genre': genre_widget,
}
interact(predict_and_visualize, **input_controls);


interactive(children=(FloatSlider(value=100000.0, description='Budget', max=1000000000.0, min=100000.0, step=5…