In [67]:
from support.df_init import init

train = init()
test = init('pp_test.csv')

In [68]:
from support.transformations import apply_transformations
train, test = apply_transformations(train, test)

In [69]:
# Get list of numerical features
numerical_feats = train.select_dtypes(include=['number']).columns
print("Numerical features:", list(numerical_feats))

# Calculate correlations with 'criticReviewsTotal' and sort
correlations = train[numerical_feats].corr()['criticReviewsTotal'].drop('criticReviewsTotal').sort_values(ascending=False)
correlations

Numerical features: ['rating', 'startYear', 'endYear', 'runtimeMinutes', 'awardWins', 'numVotes', 'totalImages', 'totalVideos', 'totalCredits', 'criticReviewsTotal', 'awardNominationsExcludeWins', 'numRegions', 'userReviewsTotal', 'is_Documentary', 'is_History', 'is_Adventure', 'is_Thriller', 'is_Game-Show', 'is_Comedy', 'is_Sci-Fi', 'is_Romance', 'is_Biography', 'is_Musical', 'is_Western', 'is_Music', 'is_Film-Noir', 'is_Adult', 'is_Reality-TV', 'is_News', 'is_Action', 'is_Crime', 'is_Short', 'is_Fantasy', 'is_Family', 'is_Mystery', 'is_Talk-Show', 'is_Drama', 'is_Sport', 'is_War', 'is_Horror', 'is_Animation', 'is_from_Oceania', 'is_from_North America', 'is_from_South America', 'is_from_Asia', 'is_from_Africa', 'is_from_Europe', 'countryOfOrigin_freq_enc', 'countryOfOrigin_NA', 'countryOfOrigin_AF', 'countryOfOrigin_AS', 'countryOfOrigin_EU', 'countryOfOrigin_OC', 'countryOfOrigin_SA', 'countryOfOrigin_UNK', 'fill_runtimeMinutes_Bruno', 'totalNominations', 'totalMedia', 'runtimeMinute

userReviewsTotal               0.737432
numVotes                       0.708518
numRegions                     0.622410
totalMedia                     0.534560
totalNominations               0.407327
totalCredits                   0.370132
awardNominationsExcludeWins    0.309070
awardWins                      0.248336
fill_runtimeMinutes_Bruno      0.232030
runtimeMinutes                 0.223579
runtimeMinutes_notitletype     0.211942
totalImages                    0.211628
totalVideos                    0.189170
is_Horror                      0.183451
is_Thriller                    0.156148
countryOfOrigin_freq_enc       0.133397
is_Drama                       0.131762
countryOfOrigin_NA             0.116115
is_from_North America          0.109537
is_Action                      0.101086
is_Film-Noir                   0.097147
is_Sci-Fi                      0.093616
is_Crime                       0.089031
is_Mystery                     0.078783
is_Fantasy                     0.059501


In [70]:
train = train.loc[~train['outlier_w_type']]
test = test.loc[~test['outlier_w_type']]

In [71]:
import pandas as pd

train = pd.get_dummies(train, columns=['titleType'], prefix='titleType')
test = pd.get_dummies(test, columns=['titleType'], prefix='titleType')

In [72]:
train.columns

Index(['originalTitle', 'rating', 'startYear', 'endYear', 'runtimeMinutes',
       'awardWins', 'numVotes', 'totalImages', 'totalVideos', 'totalCredits',
       'criticReviewsTotal', 'awardNominationsExcludeWins', 'canHaveEpisodes',
       'numRegions', 'userReviewsTotal', 'is_Documentary', 'is_History',
       'is_Adventure', 'is_Thriller', 'is_Game-Show', 'is_Comedy', 'is_Sci-Fi',
       'is_Romance', 'is_Biography', 'is_Musical', 'is_Western', 'is_Music',
       'is_Film-Noir', 'is_Adult', 'is_Reality-TV', 'is_News', 'is_Action',
       'is_Crime', 'is_Short', 'is_Fantasy', 'is_Family', 'is_Mystery',
       'is_Talk-Show', 'is_Drama', 'is_Sport', 'is_War', 'is_Horror',
       'is_Animation', 'is_from_Oceania', 'is_from_North America',
       'is_from_South America', 'is_from_Asia', 'is_from_Africa',
       'is_from_Europe', 'countryOfOrigin', 'countryOfOrigin_freq_enc',
       'countryOfOrigin_NA', 'countryOfOrigin_AF', 'countryOfOrigin_AS',
       'countryOfOrigin_EU', 'countryOfOr

In [73]:
# userReviewsTotal               0.732326
# numVotes                       0.698266
# numRegions                     0.609967
# totalMedia                     0.599626

feats = [
    'userReviewsTotal',
    'numVotes',
    'numRegions',
    'totalMedia',
]

In [74]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train[feats + ['criticReviewsTotal']] = scaler.fit_transform(train[feats + ['criticReviewsTotal']])
test[feats + ['criticReviewsTotal']] = scaler.transform(test[feats + ['criticReviewsTotal']])

In [75]:
from support.transformations import to_log, apply_transformations

y_train = train['criticReviewsTotal']

y_test = test['criticReviewsTotal']

# X_train, X_test = apply_transformations(
#     train, test
# )

X_train = train[feats]
X_test = test[feats]

In [76]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

def train_regression_model(
    model_class,
    X_train, y_train,
    X_test, y_test,
    param_distributions=None,
    **model_params):
    """
    Trains a regression model and evaluates its performance.

    Parameters:
        model_class: The regression model class to be trained (e.g., sklearn.linear_model.LinearRegression).
        X_train: Training feature DataFrame.
        y_train: Training target Series.
        X_test: Testing feature DataFrame.
        y_test: Testing target Series.
        model_params: Additional parameters to pass to the model.

    Returns:
        A dictionary containing relative MAE, MSE, and R-squared for train and test sets.
    """
    # Initialize and train the model with additional parameters
    if param_distributions:
        model = RandomizedSearchCV(
            estimator=model_class(),
            param_distributions=param_distributions,
            n_iter=100,
            cv=5,
            random_state=42,
            n_jobs=-1,
            scoring='neg_mean_squared_error',
        )
        model.fit(X_train, y_train)
        model = model.best_estimator_
    else:
        model = model_class(**model_params)
        model.fit(X_train, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate metrics
    metrics = {
        'train': {
            'mae': mean_absolute_error(y_train, y_train_pred),
            'mse': mean_squared_error(y_train, y_train_pred),
            'r2': r2_score(y_train, y_train_pred)
        },
        'test': {
            'mae': mean_absolute_error(y_test, y_test_pred),
            'mse': mean_squared_error(y_test, y_test_pred),
            'r2': r2_score(y_test, y_test_pred)
        }
    }
    
    return metrics, model

# Linear regressor

In [77]:
from sklearn.linear_model import LinearRegression

metrics, model = train_regression_model(
    LinearRegression,
    X_train, y_train,
    X_test, y_test
)
print("Train Metrics:")
print(f"MAE: {metrics['train']['mae']}")
print(f"MSE: {metrics['train']['mse']}")
print(f"R-squared: {metrics['train']['r2']}")
print("\nTest Metrics:")
print(f"MAE: {metrics['test']['mae']}")
print(f"MSE: {metrics['test']['mse']}")
print(f"R-squared: {metrics['test']['r2']}")

Train Metrics:
MAE: 0.4293261337134901
MSE: 0.3901020059139119
R-squared: 0.609897994086088

Test Metrics:
MAE: 0.43618482817233767
MSE: 0.4047596026032699
R-squared: 0.6283215918237665


In [78]:
model.coef_

array([0.36345773, 0.25996899, 0.22069283, 0.05235428])

# Ridge

In [79]:
from sklearn.linear_model import Ridge

metrics, ridge_model = train_regression_model(
    Ridge,
    X_train, y_train,
    X_test, y_test,
    param_distributions={'alpha': np.logspace(-4, 4, 100)},
)

print("Train Metrics:")
print(f"MAE: {metrics['train']['mae']}")
print(f"MSE: {metrics['train']['mse']}")
print(f"R-squared: {metrics['train']['r2']}")
print("\nTest Metrics:")
print(f"MAE: {metrics['test']['mae']}")
print(f"MSE: {metrics['test']['mse']}")
print(f"R-squared: {metrics['test']['r2']}")
ridge_model

Train Metrics:
MAE: 0.42931631217711175
MSE: 0.3901030768895483
R-squared: 0.6098969231104515

Test Metrics:
MAE: 0.43620274461198116
MSE: 0.4048222139809378
R-squared: 0.6282640977037118


# Lasso

In [80]:
from sklearn.linear_model import Lasso

metrics, lasso_model = train_regression_model(
    Lasso,
    X_train, y_train,
    X_test, y_test,
    param_distributions={
        'alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]
    }
)

print("Train Metrics:")
print(f"MAE: {metrics['train']['mae']}")
print(f"MSE: {metrics['train']['mse']}")
print(f"R-squared: {metrics['train']['r2']}")
print("\nTest Metrics:")
print(f"MAE: {metrics['test']['mae']}")
print(f"MSE: {metrics['test']['mse']}")
print(f"R-squared: {metrics['test']['r2']}")
lasso_model



Train Metrics:
MAE: 0.42920679803709977
MSE: 0.4045856282742453
R-squared: 0.5954143717257545

Test Metrics:
MAE: 0.4378859237965068
MSE: 0.4265639602963446
R-squared: 0.6082993146336917


# KNN

In [81]:
from sklearn.neighbors import KNeighborsRegressor

metrics, knn_model = train_regression_model(
    KNeighborsRegressor,
    X_train[feats], y_train,
    X_test[feats], y_test,
    param_distributions={
        'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'weights': ['uniform', 'distance'],
        'p': [1, 2]
    }
)

print("Train Metrics:")
print(f"MAE: {metrics['train']['mae']}")
print(f"MSE: {metrics['train']['mse']}")
print(f"R-squared: {metrics['train']['r2']}")
print("\nTest Metrics:")
print(f"MAE: {metrics['test']['mae']}")
print(f"MSE: {metrics['test']['mse']}")
print(f"R-squared: {metrics['test']['r2']}")
knn_model



Train Metrics:
MAE: 0.3572697000235424
MSE: 0.312744389043866
R-squared: 0.6872556109561339

Test Metrics:
MAE: 0.3829448485792352
MSE: 0.3623199465251996
R-squared: 0.6672926346679432


# DT

In [82]:
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint, uniform

# Define parameter distribution for RandomizedSearchCV
param_distributions = {
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    # 'max_features': [None, 'sqrt', 'log2'],
    # 'max_leaf_nodes': randint(2, 20),
    'ccp_alpha': uniform(0.0, 0.1),
    'criterion': ['friedman_mse'],
}

# Train the Decision Tree model
dt_metrics, dt_model = train_regression_model(
    DecisionTreeRegressor,
    X_train[feats], y_train,
    X_test[feats], y_test,
    param_distributions=param_distributions
)

# Print metrics
print("Train Metrics:")
print(f"MAE: {dt_metrics['train']['mae']}")
print(f"MSE: {dt_metrics['train']['mse']}")
print(f"R-squared: {dt_metrics['train']['r2']}")
print("\nTest Metrics:")
print(f"MAE: {dt_metrics['test']['mae']}")
print(f"MSE: {dt_metrics['test']['mse']}")
print(f"R-squared: {dt_metrics['test']['r2']}")
dt_model

Train Metrics:
MAE: 0.3842232272209877
MSE: 0.3411879108303875
R-squared: 0.6588120891696123

Test Metrics:
MAE: 0.3948828878470653
MSE: 0.37093561054100205
R-squared: 0.6593811329613032


In [83]:
dt_model.feature_importances_

array([0.75062641, 0.17635008, 0.06922133, 0.00380218])