In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import utils.pipeline_util as pipe
import utils.transformer_util as tu
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
import matplotlib.pyplot as plt

In [None]:
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [None]:
# Read the movie metadata CSV into a DataFrame and preview its first five rows
imdb_df = pd.read_csv(filepath_or_buffer='../resources/movie_metadata.csv')
imdb_df.head(n=5)

In [None]:
# Inspect the raw frame: info() prints its summary as a side effect, while
# describe() is the final expression, so its table becomes the cell output.
imdb_df.info()
imdb_df.describe()

In [None]:
# Restrict the raw frame to its numeric columns
imdb_df_numeric = imdb_df.select_dtypes(include=['number'])

# Correlation matrix -> keep the imdb_score column -> highest correlation first
correlation = (
    imdb_df_numeric
    .corr()
    .loc[:, 'imdb_score']
    .sort_values(ascending=False)
)
correlation

In [None]:
# function to plot model scores
def plot_model_scores(data: pd.DataFrame, title='Model Comparison'):
    """Fit a fixed set of regressors on ``data`` and chart their test metrics.

    ``data`` is split 80/20 into train and test partitions with a fixed seed.
    Every model is fitted on the training partition and scored on the test
    partition through ``pipe.check_metrics``. The per-model metrics are shown
    as a table, and the 'Mean Squared Error' and 'Adjusted R-squared' columns
    are drawn as an annotated line chart.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the target column 'imdb_score'; every other column is
        used as a feature. The features are handed directly to the
        scikit-learn estimators, so they presumably need to be numeric and
        free of missing values -- confirm against the callers.
    title : str, default 'Model Comparison'
        Title placed on the chart.

    Returns
    -------
    None
        The table and the chart are displayed as side effects.

    Notes
    -----
    ``pipe.check_metrics`` must return a pair whose second element supports
    item assignment and provides the 'Mean Squared Error' and
    'Adjusted R-squared' entries that are plotted below.
    ``display`` is the IPython/Jupyter built-in; it is not imported in the
    visible cells of this notebook.
    """
    # One seed for the split and for the stochastic ensemble models, so that
    # repeated runs of the notebook report the same scores.
    seed = 42

    models = {
        "Linear Regression": LinearRegression(),
        "Lasso Regression": Lasso(),
        "Ridge Regression": Ridge(),
        "Support Vector Machine": SVR(),
        "Random Forest": RandomForestRegressor(n_estimators=128, random_state=seed),
        "Gradient Boosting": GradientBoostingRegressor(n_estimators=128, random_state=seed),
    }

    features = data.drop(columns=['imdb_score'])
    target = data['imdb_score']

    # 80% training and 20% test
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=seed
    )

    scores = []
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        # The first returned value (presumably the predictions) is not needed here.
        _, metrics = pipe.check_metrics(X_test, y_test, model)
        metrics['model'] = model_name
        scores.append(metrics)

    # Build the table once, after every model has been scored.
    scores_df = pd.DataFrame(scores)
    display(scores_df)

    # Line chart of the two plotted metrics, one x position per model
    ax = scores_df.plot(
        x='model',
        y=['Mean Squared Error', 'Adjusted R-squared'],
        kind='line',
        figsize=(10, 6),
        title=title,
        rot=45,
        grid=True,
        legend=True,
        marker='o'
    )

    # Label every data point with its value, placed just above the marker
    for line in ax.get_lines():
        for x_val, y_val in zip(line.get_xdata(), line.get_ydata()):
            ax.annotate(
                f'{y_val:.2f}',
                xy=(x_val, y_val),
                xytext=(0, 5),
                textcoords='offset points',
                ha='center',
                va='bottom',
            )

    # Show the plot
    plt.tight_layout()
    plt.show()

In [None]:
# Count the missing values in each numeric column, largest count first
missing_values = (
    imdb_df_numeric
    .isna()
    .sum()
    .sort_values(ascending=False)
)
missing_values

In [None]:
# Keep only the rows with no missing value in any column, then report (rows, columns)
imdb_df_numeric = imdb_df_numeric.dropna(axis=0, how='any')
imdb_df_numeric.shape


In [None]:
# Baseline comparison: the numeric columns only, after the rows with missing values were dropped
plot_model_scores(
    data=imdb_df_numeric,
    title='Model Comparison with Numeric Columns (No Scaling/Encoding)',
)

In [None]:
# Run the raw frame through the project pipeline with PCA and debug output switched off.
# Of the three returned values only preprocessed_df is used in this cell.
model, best_y_pred, preprocessed_df = pipe.run_pipeline(
    data=imdb_df,
    use_PCA=False,
    debug=False,
)

# Encode the preprocessed frame with the project helper, then compare the models on it
preprocessed_df = tu.encode_data(preprocessed_df)
plot_model_scores(
    data=preprocessed_df,
    title='Model Comparison with Preprocessed Data',
)

In [None]:
# Prepare data for PCA: imdb_score is the target, every other numeric column is a feature
y = imdb_df_numeric['imdb_score']
X = imdb_df_numeric.drop(columns='imdb_score')

# Hold out 20% of the rows as a test set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create a PCA object with 3 components
# NOTE(review): the features reach PCA without scaling, so columns with large
# numeric ranges may dominate the components -- confirm this is intended.
pca = PCA(n_components=3)

# Fit PCA on the training data only, so the test rows do not influence the components
pca.fit(X_train)

# Project the training and test data onto the fitted components
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# Derive the column labels from the transformed width instead of hard-coding
# three names, so changing n_components above cannot cause a shape mismatch.
pca_columns = [f'PCA{i}' for i in range(1, X_train_pca.shape[1] + 1)]

# Wrap the projections in DataFrames (these get a fresh default index)
X_train_pca_df = pd.DataFrame(X_train_pca, columns=pca_columns)
X_test_pca_df = pd.DataFrame(X_test_pca, columns=pca_columns)

# Share of the total variance captured by each component
explained_variance = pca.explained_variance_ratio_
explained_variance

In [None]:
# Attach the target to both PCA frames. The PCA frames were built from plain arrays,
# so the target is assigned positionally as a raw array rather than aligned on index.
X_test_pca_df['imdb_score'] = y_test.to_numpy()
X_train_pca_df['imdb_score'] = y_train.to_numpy()

# Correlation of every training-frame column with imdb_score, highest first
correlation_pca = (
    X_train_pca_df
    .corr()
    .loc[:, 'imdb_score']
    .sort_values(ascending=False)
)
correlation_pca

In [None]:
# Bar chart of each PCA component's correlation with imdb_score
# (the imdb_score entry itself is removed before plotting)
correlation_pca.drop(labels='imdb_score').plot.bar(
    figsize=(10, 6),
    title='Correlation between PCA components and imdb_score',
)

In [None]:
# Component weights as a table: one row per PCA component, one column per original feature
pca_components = pd.DataFrame(data=pca.components_, columns=X.columns)
pca_components

In [None]:
# Compare the models on the PCA-transformed training frame.
# NOTE(review): plot_model_scores performs its own 80/20 split, so only rows of
# X_train_pca_df are used here and X_test_pca_df is not evaluated in this cell --
# confirm this is intended.
plot_model_scores(
    data=X_train_pca_df,
    title='Model Comparison with PCA Data',
)