In [83]:
from support.df_init import init

train = init()
test = init('pp_test.csv')

In [84]:
feats = [
    'startYear', 'totalCredits',
    'numRegions', 'rating',
    'countryOfOrigin_freq_enc', 'countryOfOrigin_NA', 'countryOfOrigin_AF',
    'countryOfOrigin_AS', 'countryOfOrigin_EU', 'countryOfOrigin_OC', 'countryOfOrigin_SA',
    'countryOfOrigin_UNK', 'fill_runtimeMinutes_Bruno', 'totalNominations',
    'totalMedia', 'titleType_movie', 'titleType_short',
    'titleType_tvEpisode', 'titleType_tvMiniSeries', 'titleType_tvMovie',
    'titleType_tvSeries', 'titleType_tvShort', 'titleType_tvSpecial',
    'titleType_video', 'numVotes',
    # 'is_Documentary', 'is_History', 'is_Adventure', 'is_Thriller',
    # 'is_Game-Show', 'is_Comedy', 'is_Sci-Fi', 'is_Romance', 'is_Biography',
    # 'is_Musical', 'is_Western', 'is_Music', 'is_Film-Noir', 'is_Adult',
    # 'is_Reality-TV', 'is_News', 'is_Action', 'is_Crime', 'is_Short',
    # 'is_Fantasy', 'is_Family', 'is_Mystery', 'is_Talk-Show', 'is_Drama',
    # 'is_Sport', 'is_War', 'is_Horror', 'is_Animation',
    # 'runtimeMinutes_notitletype', 'awardNominationsExcludeWins',
    # 'awardWins', 'fill_runtimeMinutes', 'totalImages', 'totalVideos',
    # 'is_from_Oceania', 'is_from_North America', 'is_from_South America',
    # 'is_from_Asia', 'is_from_Africa', 'is_from_Europe',
]

targets = ['userReviewsTotal', 'criticReviewsTotal']

In [85]:
import pandas as pd

train = pd.get_dummies(train, columns=['titleType'], prefix='titleType')
test = pd.get_dummies(test, columns=['titleType'], prefix='titleType')

In [86]:
from support.transformations import apply_transformations

X_train = train[feats]
y_train = train[targets]

X_test = test[feats]
y_test = test[targets]

X_train, X_test = apply_transformations(
    X_train, X_test
)

In [87]:
from sklearn.preprocessing import StandardScaler

# Scale X_train and X_test
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Scale y_train and y_test in a reversible way
y_train_scaled = pd.DataFrame()
y_test_scaled = pd.DataFrame()
target_scaler = StandardScaler()
y_train_scaled[targets] = target_scaler.fit_transform(y_train)
y_test_scaled[targets] = target_scaler.transform(y_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

def train_and_evaluate_model(ModelClass, X_train, y_train, X_test, y_test, params_distribution=None, **model_params):
    """
    Trains a model and evaluates it on train and test data.

    Parameters:
    - ModelClass: The class of the model to be trained (e.g., sklearn.linear_model.LinearRegression).
    - train: Training DataFrame.
    - test: Testing DataFrame.
    - feats: List of feature column names.
    - targets: List of target column names.
    - model_params: Additional parameters to initialize the model.

    Returns:
    - model: Trained model.
    - metrics: Dictionary containing MAE, MSE, and R^2 for train and test.
    """
    if params_distribution:
        # If a distribution is provided, run random search
        model = RandomizedSearchCV(
            ModelClass(**model_params),
            param_distributions=params_distribution,
            n_iter=50,
            cv=5,
            verbose=1,
            n_jobs=-1
        )
        model.fit(X_train, y_train)
        # Get the best model
        model = model.best_estimator_
    else:
        model = ModelClass(**model_params)
        model.fit(X_train, y_train)

    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate metrics
    metrics = {
        'train': {
            'per_feature': {
                feature: {
                    'mae': mean_absolute_error(y_train.iloc[:, i], y_train_pred[:, i]) / (np.max(y_train.iloc[:, i]) - np.min(y_train.iloc[:, i])),
                    'mse': mean_squared_error(y_train.iloc[:, i], y_train_pred[:, i]) / (np.max(y_train.iloc[:, i]) - np.min(y_train.iloc[:, i])),
                    'r2': r2_score(y_train.iloc[:, i], y_train_pred[:, i])
                } for i, feature in enumerate(targets)
            },
            'mae': mean_absolute_error(y_train, y_train_pred),
            'mse': mean_squared_error(y_train, y_train_pred),
            'r2': r2_score(y_train, y_train_pred)
        },
        'test': {
            'per_feature': {
                feature: {
                    'mae': mean_absolute_error(y_test.iloc[:, i], y_test_pred[:, i]),
                    'mse': mean_squared_error(y_test.iloc[:, i], y_test_pred[:, i]),
                    'r2': r2_score(y_test.iloc[:, i], y_test_pred[:, i])
                } for i, feature in enumerate(targets)
            },
            'mae': mean_absolute_error(y_test, y_test_pred),
            'mse': mean_squared_error(y_test, y_test_pred),
            'r2': r2_score(y_test, y_test_pred)
        }
    }

    return model, metrics

# Linear regressor

In [89]:
from sklearn.linear_model import LinearRegression

# Train and evaluate the linear regressor
model, metrics = train_and_evaluate_model(
    ModelClass=LinearRegression,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test
)

def print_model_summary(metrics):
    print("Train Metrics:")
    for feature, metrics_dict in metrics['train']['per_feature'].items():
        print(f"Feature: {feature}")
        print('  Train MAE: %.3f' % metrics_dict['mae'])
        print('  Train MSE: %.3f' % metrics_dict['mse'])
        print('  Train R-squared: %.3f' % metrics_dict['r2'])

    print('MAE: %.3f' % metrics['train']['mae'])
    print('MSE: %.3f' % metrics['train']['mse'])
    print('R-squared: %.3f' % metrics['train']['r2'])

    print("\nTest Metrics:")
    for feature, metrics_dict in metrics['test']['per_feature'].items():
        print(f"Feature: {feature}")
        print('  Train MAE: %.3f' % metrics_dict['mae'])
        print('  Train MSE: %.3f' % metrics_dict['mse'])
        print('  Train R-Squared: %.3f' % metrics_dict['r2'])

    print('MAE: %.3f' % metrics['test']['mae'])
    print('MSE: %.3f' % metrics['test']['mse'])
    print('R-squared: %.3f' % metrics['test']['r2'])
    
print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 7.563
  Train MSE: 1861.366
  Train R-squared: 0.586
Feature: criticReviewsTotal
  Train MAE: 3.733
  Train MSE: 104.540
  Train R-squared: 0.566
MAE: 5.648
MSE: 982.953
R-squared: 0.576

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 7.446
  Train MSE: 778.798
  Train R-Squared: 0.797
Feature: criticReviewsTotal
  Train MAE: 3.958
  Train MSE: 143.860
  Train R-Squared: 0.585
MAE: 5.702
MSE: 461.329
R-squared: 0.691


In [90]:
# Train and evaluate the linear regressor
model, metrics = train_and_evaluate_model(
    ModelClass=LinearRegression,
    X_train=X_train,
    y_train=y_train_scaled,
    X_test=X_test,
    y_test=y_test_scaled
)

print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 0.113
  Train MSE: 0.414
  Train R-squared: 0.586
Feature: criticReviewsTotal
  Train MAE: 0.240
  Train MSE: 0.434
  Train R-squared: 0.566
MAE: 0.177
MSE: 0.424
R-squared: 0.576

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 0.111
  Train MSE: 0.173
  Train R-Squared: 0.797
Feature: criticReviewsTotal
  Train MAE: 0.255
  Train MSE: 0.597
  Train R-Squared: 0.585
MAE: 0.183
MSE: 0.385
R-squared: 0.691


# Ridge

In [91]:
from sklearn.linear_model import Ridge

# Train and evaluate the ridge regressor
model, metrics = train_and_evaluate_model(
    ModelClass=Ridge,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test
)

print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 7.564
  Train MSE: 1861.366
  Train R-squared: 0.586
Feature: criticReviewsTotal
  Train MAE: 3.732
  Train MSE: 104.540
  Train R-squared: 0.566
MAE: 5.648
MSE: 982.953
R-squared: 0.576

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 7.446
  Train MSE: 778.786
  Train R-Squared: 0.797
Feature: criticReviewsTotal
  Train MAE: 3.958
  Train MSE: 143.862
  Train R-Squared: 0.585
MAE: 5.702
MSE: 461.324
R-squared: 0.691


In [92]:
# Train and evaluate the linear regressor
model, metrics = train_and_evaluate_model(
    ModelClass=Ridge,
    X_train=X_train,
    y_train=y_train_scaled,
    X_test=X_test,
    y_test=y_test_scaled
)

print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 0.113
  Train MSE: 0.414
  Train R-squared: 0.586
Feature: criticReviewsTotal
  Train MAE: 0.240
  Train MSE: 0.434
  Train R-squared: 0.566
MAE: 0.177
MSE: 0.424
R-squared: 0.576

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 0.111
  Train MSE: 0.173
  Train R-Squared: 0.797
Feature: criticReviewsTotal
  Train MAE: 0.255
  Train MSE: 0.597
  Train R-Squared: 0.585
MAE: 0.183
MSE: 0.385
R-squared: 0.691


# Lasso

In [93]:
from sklearn.linear_model import Lasso

# Train and evaluate the ridge regressor
model, metrics = train_and_evaluate_model(
    ModelClass=Lasso,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test
)

print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 6.823
  Train MSE: 1867.485
  Train R-squared: 0.584
Feature: criticReviewsTotal
  Train MAE: 3.226
  Train MSE: 109.384
  Train R-squared: 0.546
MAE: 5.025
MSE: 988.434
R-squared: 0.565

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 6.693
  Train MSE: 779.596
  Train R-Squared: 0.796
Feature: criticReviewsTotal
  Train MAE: 3.460
  Train MSE: 153.819
  Train R-Squared: 0.557
MAE: 5.076
MSE: 466.707
R-squared: 0.676


In [94]:
# Train and evaluate the linear regressor
model, metrics = train_and_evaluate_model(
    ModelClass=Lasso,
    X_train=X_train,
    y_train=y_train_scaled,
    X_test=X_test,
    y_test=y_test_scaled
)

print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 0.172
  Train MSE: 1.000
  Train R-squared: 0.000
Feature: criticReviewsTotal
  Train MAE: 0.285
  Train MSE: 1.000
  Train R-squared: 0.000
MAE: 0.228
MSE: 1.000
R-squared: -0.000

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 0.175
  Train MSE: 0.852
  Train R-Squared: -0.000
Feature: criticReviewsTotal
  Train MAE: 0.305
  Train MSE: 1.440
  Train R-Squared: -0.000
MAE: 0.240
MSE: 1.146
R-squared: -0.000


# KNN

In [95]:
from sklearn.neighbors import KNeighborsRegressor


# Train and evaluate the KNN regressor
model, metrics = train_and_evaluate_model(
    ModelClass=KNeighborsRegressor,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test
)

print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 3.877
  Train MSE: 1471.776
  Train R-squared: 0.672
Feature: criticReviewsTotal
  Train MAE: 1.671
  Train MSE: 53.362
  Train R-squared: 0.779
MAE: 2.774
MSE: 762.569
R-squared: 0.725

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 4.594
  Train MSE: 1135.994
  Train R-Squared: 0.703
Feature: criticReviewsTotal
  Train MAE: 2.156
  Train MSE: 93.264
  Train R-Squared: 0.731
MAE: 3.375
MSE: 614.629
R-squared: 0.717


In [96]:
# Train and evaluate the linear regressor
model, metrics = train_and_evaluate_model(
    ModelClass=KNeighborsRegressor,
    X_train=X_train,
    y_train=y_train_scaled,
    X_test=X_test,
    y_test=y_test_scaled
)

print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 0.058
  Train MSE: 0.328
  Train R-squared: 0.672
Feature: criticReviewsTotal
  Train MAE: 0.108
  Train MSE: 0.221
  Train R-squared: 0.779
MAE: 0.083
MSE: 0.275
R-squared: 0.725

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 0.069
  Train MSE: 0.253
  Train R-Squared: 0.703
Feature: criticReviewsTotal
  Train MAE: 0.139
  Train MSE: 0.387
  Train R-Squared: 0.731
MAE: 0.104
MSE: 0.320
R-squared: 0.717


# DT

In [97]:
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint, uniform

# Define parameter distribution for RandomizedSearchCV
param_distributions = {
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': [None, 'sqrt', 'log2'],
    # 'max_leaf_nodes': randint(2, 20),
    'ccp_alpha': uniform(0.0, 0.1),
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
}

model, metrics = train_and_evaluate_model(
    ModelClass=DecisionTreeRegressor,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    params_distribution=param_distributions
)
print_model_summary(metrics)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Train Metrics:
Feature: userReviewsTotal
  Train MAE: 3.357
  Train MSE: 1559.004
  Train R-squared: 0.653
Feature: criticReviewsTotal
  Train MAE: 1.679
  Train MSE: 57.232
  Train R-squared: 0.763
MAE: 2.518
MSE: 808.118
R-squared: 0.708

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 3.603
  Train MSE: 802.925
  Train R-Squared: 0.790
Feature: criticReviewsTotal
  Train MAE: 1.994
  Train MSE: 91.587
  Train R-Squared: 0.736
MAE: 2.799
MSE: 447.256
R-squared: 0.763


In [98]:
model, metrics = train_and_evaluate_model(
    ModelClass=DecisionTreeRegressor,
    X_train=X_train,
    y_train=y_train_scaled,
    X_test=X_test,
    y_test=y_test_scaled,
    params_distribution=param_distributions
)
print_model_summary(metrics)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Train Metrics:
Feature: userReviewsTotal
  Train MAE: 0.091
  Train MSE: 0.393
  Train R-squared: 0.607
Feature: criticReviewsTotal
  Train MAE: 0.188
  Train MSE: 0.368
  Train R-squared: 0.632
MAE: 0.140
MSE: 0.381
R-squared: 0.619

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 0.091
  Train MSE: 0.245
  Train R-Squared: 0.713
Feature: criticReviewsTotal
  Train MAE: 0.200
  Train MSE: 0.498
  Train R-Squared: 0.654
MAE: 0.145
MSE: 0.371
R-squared: 0.684


45 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 1377, in fit
    super()._fit(