In [None]:
import pandas as pd
import utils.fetcher_utils as fetcher
import utils.pipeline_util as pu
import utils.transformer_util as tu
from utils.prediction_builder import PredictionDFBuilder
import numpy as np
import IMDb_Predictor_GUI_II as gui
import tkinter as tk
from tkinter import ttk

In [None]:
# Enable IPython's Tk event-loop integration; the final cell of this notebook opens a tkinter window.
%gui tk

In [None]:
# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

# Data Model Implementation
## Data Extraction
### Source: Kaggle: [5000 Movies Database](https://www.kaggle.com/datasets/carolzhangdc/imdb-5000-movie-dataset)

In [None]:
# Load the raw movie data through the project's fetcher helper. Per the section
# header above, the source is the Kaggle IMDb 5000-movie dataset.
# NOTE(review): presumably returns a pandas DataFrame (`.head()` is called on it) — confirm.
df = fetcher.aquireIMDbDataFrame()
# Preview the result as the cell output.
df.head()

## Cleaning
- Data cleaning for missing values is implemented in `utils.preprocess_util.py`
- Data transformation, such as column aggregation and encoding, is implemented in `utils.preprocess_util.py`

## Export
- Preprocessed data is persisted at `resources/preprocessed_df.csv`

## Training Models
- Training of multiple models is implemented in `utils.trainer_util.py`
- Model score evaluation and data splitting are implemented in `utils.pipeline_util.py`
- Pipelines are designed to select the best model based on the adjusted R-squared value
- Further analysis of model performance is implemented in the `model_performance_analysis.ipynb` notebook

In [None]:
# Run the pipeline on the raw frame to get the selected model and its predictions.
# The pipeline takes care of cleaning, transforming, and splitting the data;
# the cleaned data is returned as a DataFrame.
model, best_y_pred, preprocessed_df = pu.run_pipeline(data=df, use_PCA=False, debug=False)

# Persist the preprocessed data for further analysis.
# `to_csv` is a DataFrame method; pandas has no top-level `pd.to_csv` function,
# so calling it through `pd` raises AttributeError.
preprocessed_df.to_csv('../resources/preprocessed_df.csv')

## Prediction Scores
- Based on the adjusted R-squared score, this program primarily uses `GradientBoostRegressor`
- Initially, the adjusted R-squared value was observed at `~0.56`, as shown in [this chart](../resources/img/model_numeric.png)
- After optimizations, the adjusted R-squared value was observed at `~0.60`, as shown in [this chart](../resources/img/model_preprocessed.png)

In [None]:
# Summary statistics of the best model's predictions, shown as the cell output.
(
    pd.Series(best_y_pred)
    .describe()
)

In [None]:
# Assemble a single hand-picked movie description to test the model with.
# Each builder call is made on the value returned by the previous call,
# exactly as in one long fluent chain.
builder = PredictionDFBuilder(df)

with_cast = (
    builder
    .add_actor_1("Clint Eastwood")
    .add_actor_2("Meryl Streep")
    .add_actor_3("Tom Hanks")
)
with_details = (
    with_cast
    .add_director("Gore Verbinski")
    .add_rating("PG-13")
    .add_genre("Thriller")
)
prediction_df = with_details.build()

# Display the resulting frame as the cell output.
prediction_df


In [None]:
# Score the hand-built test frame directly with the model (no GUI involved)
# and report the first (only expected) prediction.
prediction = model.predict(prediction_df)
predicted_score = prediction[0]
print(f"Predicted IMDb Score: {predicted_score}")

In [None]:
# Launch the Tk GUI, which collects user input and displays the prediction.
root = tk.Tk()
# Bind the GUI instance to its own name: assigning it to `gui` would shadow the
# imported `gui` module, making this cell fail with AttributeError on re-run.
predictor_app = gui.PredictionDFBuilderGUI(root, df, model)
# Blocks this cell until the window is closed.
root.mainloop()
