In [None]:
import sys
from pathlib import Path
# Make the parent of the current working directory importable (presumably
# where the local `utils` package imported below lives - confirm layout).
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import pandas as pd
import utils.fetcher_utils as fetcher
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_text, plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Display every column when a DataFrame is rendered (None disables truncation).
pd.options.display.max_columns = None


In [None]:
# Load the dataset through the project's fetcher helper and preview the first
# rows. The argument is presumably the resources directory - confirm against
# utils.fetcher_utils.
resources_dir = "../../resources"
df = fetcher.aquireIMDbDataFrame(resources_dir)
df.head()


In [None]:
# Columns that are not useful as model inputs are removed up front.
# `df` itself is left untouched; the result is a new frame.
drop_columns = ['movie_imdb_link']
df_filtered = df.drop(drop_columns, axis='columns')

In [None]:
# Discard rows whose 'title_year' is missing. Missing values in any other
# column are NOT removed by this step.
required_columns = ['title_year']
df_filtered = df_filtered.dropna(axis='index', subset=required_columns)
df_filtered.head()

In [None]:
# One-hot encode the pipe-separated 'genres' column of df_filtered.
#
# `Series.str.get_dummies(sep='|')` splits every string on '|' and returns one
# binary column per distinct genre (1 = the movie has the genre, 0 = it does
# not), aligned to the original row index. A row with a missing genre string
# yields all zeros instead of a spurious 'nan' genre column.
#
# df_filtered is never modified in place: the dummies are built from a read of
# the column, and a new frame is assembled from the remaining columns plus the
# genre indicator columns.
genre_dummies = df_filtered['genres'].str.get_dummies(sep='|')
df_filtered = pd.concat([df_filtered.drop(columns=['genres']), genre_dummies], axis=1)
df_filtered.head()

In [None]:
# One-hot encode director names for inspection: one binary column per director.
# The result is only previewed here; it is not merged into df_filtered in this cell.
director_names = df_filtered['director_name']
director_dummies = director_names.str.get_dummies()
director_dummies.head()

In [None]:
# One-hot encode the 'content_rating' column of df_filtered.
#
# Each distinct rating becomes its own binary column:
#   - 1 -> the row carries that content rating
#   - 0 -> it does not
#
# The indicator columns are held in `content_rating` and then appended
# column-wise to df_filtered; the original text column is kept alongside them.
rating_labels = df_filtered['content_rating']
content_rating = rating_labels.str.get_dummies()
df_filtered = pd.concat([df_filtered, content_rating], axis='columns')
df_filtered.head()

In [None]:
# Derive a 'director_experience' feature: the number of rows (movies) in
# df_filtered attributed to each director.
#
# 1. Count occurrences per director name. `director_experience` is a Series
#    indexed by director name whose values are the movie counts.
# 2. Map those counts back onto every row through its 'director_name', which
#    yields the new 'director_experience' column. Rows without a director name
#    have no matching count and therefore receive NaN.
director_column = df_filtered['director_name']
director_experience = director_column.groupby(director_column).count()
df_filtered = df_filtered.assign(
    director_experience=director_column.map(director_experience)
)
df_filtered.head()

In [None]:
# Keep numeric columns only; every non-numeric column is discarded here.
df_filtered = df_filtered.select_dtypes(include='number')



In [None]:
# Split the data into features (X) and target (y).
#
# df_filtered is deliberately left as an unscaled DataFrame. Scaling the whole
# frame here would also scale the target, would use statistics from rows that
# later land in the test split, and would replace df_filtered with a NumPy
# array, making this cell fail when re-run.
target_column = 'imdb_score'
X = df_filtered.drop(columns=[target_column])
y = df_filtered[target_column]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Fit a random forest regressor on the (unscaled) training features.
# The seed makes the fitted forest reproducible between runs.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict on both splits: the training predictions are used by the diagnostic
# plots below, the test predictions by the evaluation metrics.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [None]:
# Score the held-out predictions: mean squared error and R-squared on the test split.
r2 = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)

print(f"Mean Squared Error: {mse}", f"R-squared: {r2}", sep="\n")


In [None]:


# Plot feature importance as a horizontal bar chart, one bar per feature.
importance = model.feature_importances_
feature_names = X.columns

# The figure must be created BEFORE drawing. Calling plt.figure() after barh()
# opens a second, empty figure and leaves the chart at the default size.
fig, ax = plt.subplots(figsize=(8, 10))
ax.barh(feature_names, importance)
ax.set_xlabel("Feature Importance")
ax.set_title("Feature Importance in Predicting IMDb Score")
ax.tick_params(axis='y', labelsize=6)  # many features: shrink the tick labels
fig.tight_layout()
plt.show()

print(feature_names)


In [None]:
# Actual vs. Predicted (Training vs. Testing)
#
# Scatter of true scores against predicted scores for both splits, showing how
# the model behaves on data it has seen (training) versus unseen data (testing).
# The dashed red diagonal marks perfect predictions over the training target range.
ax = plt.gca()
for actual, predicted, split_label in (
    (y_train, y_train_pred, "Train Data"),
    (y_test, y_test_pred, "Test Data"),
):
    ax.scatter(actual, predicted, label=split_label, alpha=0.7)

low, high = min(y_train), max(y_train)
ax.plot([low, high], [low, high], color='red', linestyle='--')
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted (Training & Testing)")
ax.legend()
plt.tight_layout()
plt.show()


In [None]:
# Residual Plot
# A residual is the actual value minus the predicted value. Plotting residuals
# against predictions helps reveal patterns and potential issues in the model.
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred

fig, ax = plt.subplots(figsize=(10, 5))

# One scatter series per split
ax.scatter(y_train_pred, train_residuals, label="Train Residuals", alpha=0.7)
ax.scatter(y_test_pred, test_residuals, label="Test Residuals", alpha=0.7)

# Zero line: points on it are perfect predictions
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel("Predicted Values")
ax.set_ylabel("Residuals")
ax.set_title("Residual Plot (Training & Testing)")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Error Distribution (Training vs. Testing)
# Overlaid histograms of the residuals for both splits; the shape of the
# distributions can indicate bias or variance problems in the model.
plt.figure(figsize=(10, 5))

for residuals, series_label in (
    (train_residuals, "Train Residuals"),
    (test_residuals, "Test Residuals"),
):
    plt.hist(residuals, bins=15, alpha=0.7, label=series_label)

plt.xlabel("Prediction Error (Residuals)")
plt.ylabel("Frequency")
plt.title("Error Distribution (Training & Testing)")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Predict the IMDb score of one hand-specified movie.
#
# Keys must match the feature columns the model was trained on (the columns of
# X): raw numeric fields, genre flags, content-rating flags, and
# 'director_experience'. Each value is a one-element list so the dict becomes a
# single-row DataFrame.
#
# The dict is named `movie_features` so that it does not shadow the builtin
# `input`.
movie_features = {
    'num_critic_for_reviews' : [2220],
    'duration' : [120],
    'director_facebook_likes' : [234],
    'actor_3_facebook_likes' : [324243],
    'actor_1_facebook_likes' : [2000],
    'gross' : [21341234],
    'num_voted_users' : [0],
    'cast_total_facebook_likes' : [1234],
    'facenumber_in_poster' : [0],
    'num_user_for_reviews' : [0],
    'budget' : [1234],
    'title_year' : [2000],
    'actor_2_facebook_likes' : [0],
    'aspect_ratio' : [0],
    'movie_facebook_likes' : [100000],
    # Genre flags
    'Action' : [1],
    'Adventure' : [0],
    'Animation' : [0],
    'Biography' : [0],
    'Comedy' : [0],
    'Crime' : [0],
    'Documentary' : [0],
    'Drama' : [0],
    'Family' : [1],
    'Fantasy' : [1],
    'Film-Noir' : [0],
    'History' : [0],
    'Horror' : [0],
    'Music' : [0],
    'Musical' : [0],
    'Mystery' : [0],
    'News' : [0],
    'Romance' : [0],
    'Sci-Fi' : [0],
    'Short' : [0],
    'Sport' : [0],
    'Thriller' : [0],
    'War' : [0],
    'Western' : [0],
    # Content-rating flags
    'Approved' : [0],
    'G' : [0],
    'GP' : [0],
    'M' : [0],
    'NC-17' : [0],
    'Not Rated' : [0],
    'PG' : [0],
    'PG-13' : [0],
    'Passed' : [0],
    'R' : [0],
    'TV-14' : [0],
    'TV-G' : [0],
    'TV-PG' : [1],
    'Unrated' : [0],
    'X' : [0],
    'director_experience' : [50]
    }


single = pd.DataFrame(movie_features)
# The forest was fitted on UNSCALED features, so the sample must be passed in
# raw units as well. Standardising it first would feed z-scores to split
# thresholds learned on raw values and produce a meaningless prediction.
prediction = model.predict(single)
print(f"Predicted IMDb Score: {prediction[0]}")

In [None]:
# List the genre indicator column names produced by the one-hot encoding.
list(genre_dummies.columns)