In [26]:
from support.df_init import init

train = init()
test = init('pp_test.csv')

In [27]:
# numVotes                       0.795977       
# numRegions                     0.622081
# totalMedia                     0.583356
# totalCredits                   0.391903
# totalNominations               0.354339

# numVotes            0.708518
# numRegions          0.622410
# totalMedia          0.534560
# totalNominations    0.407327
# totalCredits        0.370132


feats = [
    'totalCredits',
    'numRegions', 'totalMedia', 'numVotes', 'awardWins',
    # 'countryOfOrigin_freq_enc', 'countryOfOrigin_NA', 'countryOfOrigin_AF',
    # 'countryOfOrigin_AS', 'countryOfOrigin_EU', 'countryOfOrigin_OC', 'countryOfOrigin_SA',
    # 'countryOfOrigin_UNK', 'fill_runtimeMinutes_Bruno', 'totalNominations',
    # 'totalMedia', 'titleType_movie', 'titleType_short',
    # 'titleType_tvEpisode', 'titleType_tvMiniSeries', 'titleType_tvMovie',
    # 'titleType_tvSeries', 'titleType_tvShort', 'titleType_tvSpecial',
    # 'titleType_video', 
    # 'is_Documentary', 'is_History', 'is_Adventure', 'is_Thriller',
    # 'is_Game-Show', 'is_Comedy', 'is_Sci-Fi', 'is_Romance', 'is_Biography',
    # 'is_Musical', 'is_Western', 'is_Music', 'is_Film-Noir', 'is_Adult',
    # 'is_Reality-TV', 'is_News', 'is_Action', 'is_Crime', 'is_Short',
    # 'is_Fantasy', 'is_Family', 'is_Mystery', 'is_Talk-Show', 'is_Drama',
    # 'is_Sport', 'is_War', 'is_Horror', 'is_Animation',
    # 'runtimeMinutes_notitletype', 'awardNominationsExcludeWins',
    # 'fill_runtimeMinutes', 'totalImages', 'totalVideos',
    # 'is_from_Oceania', 'is_from_North America', 'is_from_South America',
    # 'is_from_Asia', 'is_from_Africa', 'is_from_Europe',
]

targets = ['userReviewsTotal', 'criticReviewsTotal']

In [28]:
# Compute correlations for all numeric features with each target variable
numeric_feats = train.select_dtypes(include=['number', 'bool']).columns.difference(targets)

for target in targets:
    print(f"Correlations with {target}:")
    # Print top 6 most correlated features
    # print(corr.head(6))
    corr = train[numeric_feats].corrwith(train[target]).sort_values(ascending=False)
    print(corr.head(6))
    print()

Correlations with userReviewsTotal:
numVotes                       0.752135
numRegions                     0.482993
awardNominationsExcludeWins    0.421745
totalNominations               0.394651
totalCredits                   0.305904
totalVideos                    0.286466
dtype: float64

Correlations with criticReviewsTotal:
numVotes                       0.673614
numRegions                     0.658810
awardNominationsExcludeWins    0.507454
totalNominations               0.483278
awardWins                      0.356987
totalVideos                    0.334553
dtype: float64



In [29]:
import pandas as pd

train = pd.get_dummies(train, columns=['titleType'], prefix='titleType')
test = pd.get_dummies(test, columns=['titleType'], prefix='titleType')

In [30]:
from support.transformations import apply_transformations
from sklearn.preprocessing import StandardScaler

train, test = apply_transformations(train, test)

from sklearn.preprocessing import StandardScaler

# Scale features
scaler_X = StandardScaler()
train[feats] = scaler_X.fit_transform(train[feats])
X_train = train[feats]
test[feats] = scaler_X.transform(test[feats])
X_test = test[feats]

# Scale target
scaler_y = StandardScaler()
train[targets] = scaler_y.fit_transform(train[targets])
y_train = train[targets]
test[targets] = scaler_y.transform(test[targets])
y_test = test[targets]

In [31]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

def train_and_evaluate_model(ModelClass, X_train, y_train, X_test, y_test, params_distribution=None, **model_params):
    """
    Trains a model and evaluates it on train and test data.

    Parameters:
    - ModelClass: The class of the model to be trained (e.g., sklearn.linear_model.LinearRegression).
    - train: Training DataFrame.
    - test: Testing DataFrame.
    - feats: List of feature column names.
    - targets: List of target column names.
    - model_params: Additional parameters to initialize the model.

    Returns:
    - model: Trained model.
    - metrics: Dictionary containing MAE, MSE, and R^2 for train and test.
    """
    if params_distribution:
        # If a distribution is provided, run random search
        model = RandomizedSearchCV(
            ModelClass(**model_params),
            param_distributions=params_distribution,
            n_iter=50,
            cv=5,
            verbose=1,
            n_jobs=-1
        )
        model.fit(X_train, y_train)
        # Get the best model
        model = model.best_estimator_
    else:
        model = ModelClass(**model_params)
        model.fit(X_train, y_train)

    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate metrics
    metrics = {
        'train': {
            'per_feature': {
                feature: {
                    'mae': mean_absolute_error(y_train.iloc[:, i], y_train_pred[:, i]),
                    'mse': mean_squared_error(y_train.iloc[:, i], y_train_pred[:, i]),
                    'r2': r2_score(y_train.iloc[:, i], y_train_pred[:, i])
                } for i, feature in enumerate(targets)
            },
            'mae': mean_absolute_error(y_train, y_train_pred),
            'mse': mean_squared_error(y_train, y_train_pred),
            'r2': r2_score(y_train, y_train_pred)
        },
        'test': {
            'per_feature': {
                feature: {
                    'mae': mean_absolute_error(y_test.iloc[:, i], y_test_pred[:, i]),
                    'mse': mean_squared_error(y_test.iloc[:, i], y_test_pred[:, i]),
                    'r2': r2_score(y_test.iloc[:, i], y_test_pred[:, i])
                } for i, feature in enumerate(targets)
            },
            'mae': mean_absolute_error(y_test, y_test_pred),
            'mse': mean_squared_error(y_test, y_test_pred),
            'r2': r2_score(y_test, y_test_pred)
        }
    }

    return model, metrics

# Linear regressor

In [32]:
from sklearn.linear_model import LinearRegression

# Train and evaluate the linear regressor
model, metrics = train_and_evaluate_model(
    ModelClass=LinearRegression,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test
)

def print_model_summary(metrics):
    print("Train Metrics:")
    for feature, metrics_dict in metrics['train']['per_feature'].items():
        print(f"Feature: {feature}")
        print('  Train MAE: %.3f' % metrics_dict['mae'])
        print('  Train MSE: %.3f' % metrics_dict['mse'])
        print('  Train R-squared: %.3f' % metrics_dict['r2'])

    print('MAE: %.3f' % metrics['train']['mae'])
    print('MSE: %.3f' % metrics['train']['mse'])
    print('R-squared: %.3f' % metrics['train']['r2'])

    print("\nTest Metrics:")
    for feature, metrics_dict in metrics['test']['per_feature'].items():
        print(f"Feature: {feature}")
        print('  Train MAE: %.3f' % metrics_dict['mae'])
        print('  Train MSE: %.3f' % metrics_dict['mse'])
        print('  Train R-Squared: %.3f' % metrics_dict['r2'])

    print('MAE: %.3f' % metrics['test']['mae'])
    print('MSE: %.3f' % metrics['test']['mse'])
    print('R-squared: %.3f' % metrics['test']['r2'])
    
print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 0.425
  Train MSE: 0.316
  Train R-squared: 0.684
Feature: criticReviewsTotal
  Train MAE: 0.461
  Train MSE: 0.418
  Train R-squared: 0.582
MAE: 0.443
MSE: 0.367
R-squared: 0.633

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 0.416
  Train MSE: 0.306
  Train R-Squared: 0.692
Feature: criticReviewsTotal
  Train MAE: 0.469
  Train MSE: 0.439
  Train R-Squared: 0.581
MAE: 0.443
MSE: 0.372
R-squared: 0.637


# Ridge

In [33]:
from sklearn.linear_model import Ridge

# Train and evaluate the ridge regressor
model, metrics = train_and_evaluate_model(
    ModelClass=Ridge,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test
)

print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 0.425
  Train MSE: 0.316
  Train R-squared: 0.684
Feature: criticReviewsTotal
  Train MAE: 0.461
  Train MSE: 0.418
  Train R-squared: 0.582
MAE: 0.443
MSE: 0.367
R-squared: 0.633

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 0.416
  Train MSE: 0.306
  Train R-Squared: 0.692
Feature: criticReviewsTotal
  Train MAE: 0.469
  Train MSE: 0.439
  Train R-Squared: 0.581
MAE: 0.442
MSE: 0.372
R-squared: 0.637


# Lasso

In [34]:
from sklearn.linear_model import Lasso

# Train and evaluate the ridge regressor
model, metrics = train_and_evaluate_model(
    ModelClass=Lasso,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test
)

print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 0.733
  Train MSE: 1.000
  Train R-squared: 0.000
Feature: criticReviewsTotal
  Train MAE: 0.721
  Train MSE: 1.000
  Train R-squared: 0.000
MAE: 0.727
MSE: 1.000
R-squared: 0.000

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 0.730
  Train MSE: 0.993
  Train R-Squared: -0.000
Feature: criticReviewsTotal
  Train MAE: 0.731
  Train MSE: 1.047
  Train R-Squared: -0.000
MAE: 0.730
MSE: 1.020
R-squared: -0.000


# KNN

In [35]:
from sklearn.neighbors import KNeighborsRegressor


# Train and evaluate the KNN regressor
model, metrics = train_and_evaluate_model(
    ModelClass=KNeighborsRegressor,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test
)

print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 0.318
  Train MSE: 0.211
  Train R-squared: 0.789
Feature: criticReviewsTotal
  Train MAE: 0.317
  Train MSE: 0.266
  Train R-squared: 0.734
MAE: 0.317
MSE: 0.238
R-squared: 0.762

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 0.377
  Train MSE: 0.292
  Train R-Squared: 0.706
Feature: criticReviewsTotal
  Train MAE: 0.391
  Train MSE: 0.408
  Train R-Squared: 0.610
MAE: 0.384
MSE: 0.350
R-squared: 0.658


# DT

In [36]:
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint, uniform

# Define parameter distribution for RandomizedSearchCV
param_distributions = {
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': [None, 'sqrt', 'log2'],
    # 'max_leaf_nodes': randint(2, 20),
    'ccp_alpha': uniform(0.0, 0.1),
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
}

model, metrics = train_and_evaluate_model(
    ModelClass=DecisionTreeRegressor,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    params_distribution=param_distributions
)
print_model_summary(metrics)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Train Metrics:
Feature: userReviewsTotal
  Train MAE: 0.396
  Train MSE: 0.284
  Train R-squared: 0.716
Feature: criticReviewsTotal
  Train MAE: 0.397
  Train MSE: 0.364
  Train R-squared: 0.636
MAE: 0.397
MSE: 0.324
R-squared: 0.676

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 0.391
  Train MSE: 0.280
  Train R-Squared: 0.718
Feature: criticReviewsTotal
  Train MAE: 0.404
  Train MSE: 0.383
  Train R-Squared: 0.635
MAE: 0.398
MSE: 0.331
R-squared: 0.676


30 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 1377, in fit
    super()._fit(