In [41]:
from support.df_init import init

# Load both splits through the project loader. The no-argument call relies on
# init()'s default source (assumed to be the training split -- confirm in
# support.df_init); the test split is selected by file name.
TEST_CSV = 'pp_test.csv'

train = init()
test = init(TEST_CSV)

In [42]:
# numVotes                       0.795977       
# numRegions                     0.622081
# totalMedia                     0.583356
# totalCredits                   0.391903
# totalNominations               0.354339

# numVotes            0.708518
# numRegions          0.622410
# totalMedia          0.534560
# totalNominations    0.407327
# totalCredits        0.370132


# Regression targets: y_train / y_test are built from exactly these columns.
targets = ['userReviewsTotal', 'criticReviewsTotal']

# Predictor columns currently fed to the models. Everything commented out
# below is a disabled candidate; uncomment a line to include it. Note that
# 'totalNominations' and 'totalMedia' appear twice in the disabled list, so
# re-enable each of them only once.
feats = [
    'numVotes', 'numRegions', 'totalMedia',
    # 'totalNominations',
    # 'totalCredits',
    # 'countryOfOrigin_freq_enc', 'countryOfOrigin_NA', 'countryOfOrigin_AF',
    # 'countryOfOrigin_AS', 'countryOfOrigin_EU', 'countryOfOrigin_OC', 'countryOfOrigin_SA',
    # 'countryOfOrigin_UNK', 'fill_runtimeMinutes_Bruno', 'totalNominations',
    # 'totalMedia', 'titleType_movie', 'titleType_short',
    # 'titleType_tvEpisode', 'titleType_tvMiniSeries', 'titleType_tvMovie',
    # 'titleType_tvSeries', 'titleType_tvShort', 'titleType_tvSpecial',
    # 'titleType_video',
    # 'is_Documentary', 'is_History', 'is_Adventure', 'is_Thriller',
    # 'is_Game-Show', 'is_Comedy', 'is_Sci-Fi', 'is_Romance', 'is_Biography',
    # 'is_Musical', 'is_Western', 'is_Music', 'is_Film-Noir', 'is_Adult',
    # 'is_Reality-TV', 'is_News', 'is_Action', 'is_Crime', 'is_Short',
    # 'is_Fantasy', 'is_Family', 'is_Mystery', 'is_Talk-Show', 'is_Drama',
    # 'is_Sport', 'is_War', 'is_Horror', 'is_Animation',
    # 'runtimeMinutes_notitletype', 'awardNominationsExcludeWins',
    # 'fill_runtimeMinutes', 'totalImages', 'totalVideos',
    # 'is_from_Oceania', 'is_from_North America', 'is_from_South America',
    # 'is_from_Asia', 'is_from_Africa', 'is_from_Europe',
]

In [43]:
# Rank every numeric/boolean column of `train` by how strongly it correlates
# with each target. The target columns themselves are excluded from the
# candidates.
numeric_feats = train.select_dtypes(include=['number', 'bool']).columns.difference(targets)

TOP_N_CORRELATIONS = 6  # number of columns shown per target

for target in targets:
    print(f"Correlations with {target}:")
    corr = train[numeric_feats].corrwith(train[target])
    # Order by magnitude: a strong negative correlation is just as informative
    # as a strong positive one and must not sink to the bottom of the ranking.
    # The printed values keep their sign.
    corr = corr.sort_values(key=lambda s: s.abs(), ascending=False)
    print(corr.head(TOP_N_CORRELATIONS))
    print()

Correlations with userReviewsTotal:
numVotes                       0.752135
numRegions                     0.482993
awardNominationsExcludeWins    0.421745
totalNominations               0.394651
totalCredits                   0.305904
totalVideos                    0.286466
dtype: float64

Correlations with criticReviewsTotal:
numVotes                       0.673614
numRegions                     0.658810
awardNominationsExcludeWins    0.507454
totalNominations               0.483278
awardWins                      0.356987
totalVideos                    0.334553
dtype: float64



In [44]:
import pandas as pd

# One-hot encode `titleType` in both splits. The membership checks keep the
# cell safe to re-run: after the first execution the raw column no longer
# exists and get_dummies would raise a KeyError.
if 'titleType' in train.columns:
    train = pd.get_dummies(train, columns=['titleType'], prefix='titleType')
if 'titleType' in test.columns:
    test = pd.get_dummies(test, columns=['titleType'], prefix='titleType')

# The two frames are encoded independently, so a category present only in the
# training data would leave the test frame without the matching column. Add
# every such column as all-zero, with the dtype get_dummies gave it in train.
for col in train.columns:
    if str(col).startswith('titleType_') and col not in test.columns:
        test[col] = pd.Series(0, index=test.index, dtype=train[col].dtype)

In [45]:
from support.transformations import apply_transformations
from sklearn.preprocessing import StandardScaler

# Targets are used in their original units (target scaling is disabled below).
y_train = train[targets]
y_test = test[targets]

# train, test = apply_transformations(train, test)

from sklearn.preprocessing import StandardScaler

# Standardise the selected features. The scaler is fitted on the training
# split only and then applied to the test split, so no test statistics leak
# into training. The scaled values go into new frames instead of overwriting
# train[feats] / test[feats]: the raw columns stay available and re-running
# this cell always starts from the same inputs.
scaler_X = StandardScaler()
X_train = pd.DataFrame(
    scaler_X.fit_transform(train[feats]),
    columns=feats,
    index=train.index,
)
X_test = pd.DataFrame(
    scaler_X.transform(test[feats]),
    columns=feats,
    index=test.index,
)

# # Scale target
# scaler_y = StandardScaler()
# train[targets] = scaler_y.fit_transform(train[targets])
# y_train = train[targets]
# test[targets] = scaler_y.transform(test[targets])
# y_test = test[targets]
# y_test = test[targets]

In [46]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

def _split_metrics(y_true, y_pred):
    """Compute aggregate and per-target MAE / MSE / R^2 for one data split."""
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    # Single-target problems produce 1-D arrays; lift them to one column so the
    # per-target indexing below works for both shapes.
    if true_arr.ndim == 1:
        true_arr = true_arr.reshape(-1, 1)
    if pred_arr.ndim == 1:
        pred_arr = pred_arr.reshape(-1, 1)

    # Take the target names from the data itself rather than from a
    # notebook-level variable, so the labels always match the columns scored.
    if hasattr(y_true, 'columns'):
        names = list(y_true.columns)
    elif getattr(y_true, 'name', None) is not None:
        names = [y_true.name]
    else:
        names = [f'target_{i}' for i in range(true_arr.shape[1])]

    return {
        'per_feature': {
            name: {
                'mae': mean_absolute_error(true_arr[:, i], pred_arr[:, i]),
                'mse': mean_squared_error(true_arr[:, i], pred_arr[:, i]),
                'r2': r2_score(true_arr[:, i], pred_arr[:, i]),
            }
            for i, name in enumerate(names)
        },
        'mae': mean_absolute_error(true_arr, pred_arr),
        'mse': mean_squared_error(true_arr, pred_arr),
        'r2': r2_score(true_arr, pred_arr),
    }


def train_and_evaluate_model(ModelClass, X_train, y_train, X_test, y_test, params_distribution=None, **model_params):
    """
    Fit a regressor and score it on the training and test splits.

    Parameters:
    - ModelClass: Estimator class to instantiate (e.g. sklearn.linear_model.LinearRegression).
    - X_train: Training features.
    - y_train: Training targets; a DataFrame (one column per target), a Series or an array.
    - X_test: Test features.
    - y_test: Test targets, same layout as y_train.
    - params_distribution: Optional parameter space. When given (and non-empty),
      a RandomizedSearchCV with n_iter=50, 5-fold CV and negative-MAE scoring is
      run and its best estimator is used; otherwise the model is fitted directly.
    - model_params: Extra keyword arguments passed to ModelClass(...).

    Returns:
    - model: The fitted estimator (the search's best_estimator_ when a search ran).
    - metrics: {'train': ..., 'test': ...}; each entry holds the aggregate
      'mae', 'mse', 'r2' plus 'per_feature', a dict keyed by target name with
      the same three metrics for that target alone.
    """
    estimator = ModelClass(**model_params)

    if params_distribution:
        # A parameter space was supplied: pick the best configuration by CV.
        search = RandomizedSearchCV(
            estimator,
            param_distributions=params_distribution,
            n_iter=50,
            cv=5,
            verbose=1,
            n_jobs=-1,
            scoring='neg_mean_absolute_error',
        )
        search.fit(X_train, y_train)
        model = search.best_estimator_
    else:
        model = estimator
        model.fit(X_train, y_train)

    metrics = {
        'train': _split_metrics(y_train, model.predict(X_train)),
        'test': _split_metrics(y_test, model.predict(X_test)),
    }

    return model, metrics

In [47]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_model_with_inverse_transform(model, X_test, y_test_scaled, scaler_y,
                                          log_transformed=True, target_names=None):
    """
    Score a trained model on test data after undoing the target transformations.

    Parameters:
    - model: Trained model with a .predict() method.
    - X_test: Test features.
    - y_test_scaled: Test targets in the scaled space the model was trained on.
    - scaler_y: Fitted scaler exposing inverse_transform(), as used on the targets.
    - log_transformed: When True (default, the previous behaviour) np.expm1 is
      applied after unscaling to undo a log1p transform. Pass False if the
      targets were scaled without a log transform.
    - target_names: Names used as keys of the per-target metrics. When None the
      notebook-level `targets` list is used, as before.

    Returns:
    - metrics: Dictionary with 'mae', 'mse', 'r2' computed in original units
      and, when target names are available, 'per_feature' with the same three
      metrics for each target.
    """
    # Predict in the scaled space
    y_pred_scaled = model.predict(X_test)

    # Undo the scaling; np.asarray guarantees positional [:, i] indexing below
    y_true_original = np.asarray(scaler_y.inverse_transform(y_test_scaled))
    y_pred_original = np.asarray(scaler_y.inverse_transform(y_pred_scaled))

    # Undo log1p only when the targets were actually log-transformed
    if log_transformed:
        y_true_original = np.expm1(y_true_original)
        y_pred_original = np.expm1(y_pred_original)

    # Aggregate metrics over all targets
    metrics = {
        "mae": mean_absolute_error(y_true_original, y_pred_original),
        "mse": mean_squared_error(y_true_original, y_pred_original),
        "r2": r2_score(y_true_original, y_pred_original),
    }

    # Per-target metrics; fall back to the notebook-level `targets` list
    names = targets if target_names is None else target_names
    if names is not None:
        metrics["per_feature"] = {
            name: {
                "mae": mean_absolute_error(y_true_original[:, i], y_pred_original[:, i]),
                "mse": mean_squared_error(y_true_original[:, i], y_pred_original[:, i]),
                "r2": r2_score(y_true_original[:, i], y_pred_original[:, i]),
            }
            for i, name in enumerate(names)
        }

    return metrics


# Linear regressor

In [48]:
from sklearn.linear_model import LinearRegression

# Baseline: plain linear regression on the scaled features. No parameter
# space is passed, so the model is fitted directly without a search.
model, metrics = train_and_evaluate_model(
    LinearRegression, X_train, y_train, X_test, y_test,
)

def _print_split_metrics(split_label, split_metrics):
    """Print the per-target rows (prefixed with `split_label`) and the aggregate rows of one split."""
    for feature, metrics_dict in split_metrics['per_feature'].items():
        print(f"Feature: {feature}")
        print('  %s MAE: %.3f' % (split_label, metrics_dict['mae']))
        print('  %s MSE: %.3f' % (split_label, metrics_dict['mse']))
        print('  %s R-squared: %.3f' % (split_label, metrics_dict['r2']))

    print('MAE: %.3f' % split_metrics['mae'])
    print('MSE: %.3f' % split_metrics['mse'])
    print('R-squared: %.3f' % split_metrics['r2'])


def print_model_summary(metrics):
    """
    Print the metrics dictionary returned by train_and_evaluate_model.

    Parameters:
    - metrics: Dict with 'train' and 'test' entries, each holding 'per_feature'
      (target name -> {'mae', 'mse', 'r2'}) and the aggregate 'mae', 'mse', 'r2'.

    The test block labels its rows "Test ..."; they were previously printed
    with a "Train" prefix, which made the two sections easy to confuse.
    """
    print("Train Metrics:")
    _print_split_metrics('Train', metrics['train'])

    print("\nTest Metrics:")
    _print_split_metrics('Test', metrics['test'])
    
# Report the scores of the linear regression fitted at the top of this cell.
print_model_summary(metrics)

Train Metrics:
Feature: userReviewsTotal
  Train MAE: 6.164
  Train MSE: 1822.804
  Train R-squared: 0.595
Feature: criticReviewsTotal
  Train MAE: 3.141
  Train MSE: 94.040
  Train R-squared: 0.611
MAE: 4.653
MSE: 958.422
R-squared: 0.603

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 5.877
  Train MSE: 721.100
  Train R-Squared: 0.812
Feature: criticReviewsTotal
  Train MAE: 3.339
  Train MSE: 131.068
  Train R-Squared: 0.623
MAE: 4.608
MSE: 426.084
R-squared: 0.718


# Ridge

In [60]:
from sklearn.linear_model import Ridge

# Ridge regression: search the penalty strength `alpha` over four values
# spaced a decade apart, then report the best estimator found.
ridge_alpha_grid = {'alpha': [0.1, 1.0, 10.0, 100.0]}

model, metrics = train_and_evaluate_model(
    Ridge, X_train, y_train, X_test, y_test,
    params_distribution=ridge_alpha_grid,
)

print_model_summary(metrics)
model

Fitting 5 folds for each of 4 candidates, totalling 20 fits




Train Metrics:
Feature: userReviewsTotal
  Train MAE: 6.164
  Train MSE: 1822.804
  Train R-squared: 0.595
Feature: criticReviewsTotal
  Train MAE: 3.141
  Train MSE: 94.040
  Train R-squared: 0.611
MAE: 4.653
MSE: 958.422
R-squared: 0.603

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 5.877
  Train MSE: 721.099
  Train R-Squared: 0.812
Feature: criticReviewsTotal
  Train MAE: 3.339
  Train MSE: 131.069
  Train R-Squared: 0.623
MAE: 4.608
MSE: 426.084
R-squared: 0.718


# Lasso

In [61]:
from sklearn.linear_model import Lasso

# Lasso regression: search the penalty strength `alpha` over four values
# spaced a decade apart, then report the best estimator found.
lasso_alpha_grid = {'alpha': [0.1, 1.0, 10.0, 100.0]}

model, metrics = train_and_evaluate_model(
    Lasso, X_train, y_train, X_test, y_test,
    params_distribution=lasso_alpha_grid,
)

print_model_summary(metrics)
model



Fitting 5 folds for each of 4 candidates, totalling 20 fits
Train Metrics:
Feature: userReviewsTotal
  Train MAE: 5.885
  Train MSE: 1824.612
  Train R-squared: 0.595
Feature: criticReviewsTotal
  Train MAE: 2.884
  Train MSE: 95.464
  Train R-squared: 0.605
MAE: 4.385
MSE: 960.038
R-squared: 0.600

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 5.624
  Train MSE: 722.818
  Train R-Squared: 0.812
Feature: criticReviewsTotal
  Train MAE: 3.113
  Train MSE: 137.847
  Train R-Squared: 0.604
MAE: 4.368
MSE: 430.333
R-squared: 0.708


# KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor


# k-nearest-neighbours regression: the candidate neighbourhood sizes are the
# integers 1 through 50.
knn_search_space = {'n_neighbors': np.arange(1, 51)}

model, metrics = train_and_evaluate_model(
    KNeighborsRegressor, X_train, y_train, X_test, y_test,
    params_distribution=knn_search_space,
)

print_model_summary(metrics)
model

TypeError: KNeighborsRegressor.__init__() got an unexpected keyword argument 'alpha'

# Decision tree

In [55]:
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint, uniform

# Seed used for both the hyper-parameter sampling and the tree itself, so the
# scores printed below can be reproduced on a fresh run.
DT_SEARCH_SEED = 42

# Search space for RandomizedSearchCV: scipy distributions are sampled,
# list values are chosen from.
param_distributions = {
    'max_depth': randint(1, 50),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    # 'max_leaf_nodes': randint(2, 20),
    'ccp_alpha': uniform(0.0, 0.1),
    'criterion': ['friedman_mse'],
}

# The search inside train_and_evaluate_model is created without a
# random_state, in which case scikit-learn samples the candidates from NumPy's
# global RNG; seeding it here fixes the sampled configurations.
np.random.seed(DT_SEARCH_SEED)

model, metrics = train_and_evaluate_model(
    ModelClass=DecisionTreeRegressor,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    params_distribution=param_distributions,
    # Forwarded to DecisionTreeRegressor(...): makes every fitted tree
    # deterministic, including the fits run in parallel worker processes.
    random_state=DT_SEARCH_SEED,
)
print_model_summary(metrics)
model

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Train Metrics:
Feature: userReviewsTotal
  Train MAE: 4.028
  Train MSE: 1779.749
  Train R-squared: 0.605
Feature: criticReviewsTotal
  Train MAE: 2.015
  Train MSE: 70.933
  Train R-squared: 0.706
MAE: 3.022
MSE: 925.341
R-squared: 0.656

Test Metrics:
Feature: userReviewsTotal
  Train MAE: 4.076
  Train MSE: 916.076
  Train R-Squared: 0.761
Feature: criticReviewsTotal
  Train MAE: 2.255
  Train MSE: 110.653
  Train R-Squared: 0.682
MAE: 3.166
MSE: 513.364
R-squared: 0.722


In [53]:
# Largest criticReviewsTotal value among the test targets.
max(y_test['criticReviewsTotal'])

567

In [56]:
# Preview of 50 logarithmically spaced values from 1e-4 to 1e4.
# NOTE(review): presumably a candidate `alpha` grid; no cell in this notebook uses it.
np.logspace(-4, 4, 50)

array([1.00000000e-04, 1.45634848e-04, 2.12095089e-04, 3.08884360e-04,
       4.49843267e-04, 6.55128557e-04, 9.54095476e-04, 1.38949549e-03,
       2.02358965e-03, 2.94705170e-03, 4.29193426e-03, 6.25055193e-03,
       9.10298178e-03, 1.32571137e-02, 1.93069773e-02, 2.81176870e-02,
       4.09491506e-02, 5.96362332e-02, 8.68511374e-02, 1.26485522e-01,
       1.84206997e-01, 2.68269580e-01, 3.90693994e-01, 5.68986603e-01,
       8.28642773e-01, 1.20679264e+00, 1.75751062e+00, 2.55954792e+00,
       3.72759372e+00, 5.42867544e+00, 7.90604321e+00, 1.15139540e+01,
       1.67683294e+01, 2.44205309e+01, 3.55648031e+01, 5.17947468e+01,
       7.54312006e+01, 1.09854114e+02, 1.59985872e+02, 2.32995181e+02,
       3.39322177e+02, 4.94171336e+02, 7.19685673e+02, 1.04811313e+03,
       1.52641797e+03, 2.22299648e+03, 3.23745754e+03, 4.71486636e+03,
       6.86648845e+03, 1.00000000e+04])