In [48]:
from support.df_init import init

train = init()
test = init('pp_test.csv')

In [49]:
# Get list of numerical features
numerical_feats = train.select_dtypes(include=['number']).columns
print("Numerical features:", list(numerical_feats))

# Calculate correlations with 'criticReviewsTotal' and sort
correlations = train[numerical_feats].corr()['criticReviewsTotal'].drop('criticReviewsTotal').sort_values(ascending=False)
correlations

Numerical features: ['rating', 'startYear', 'endYear', 'runtimeMinutes', 'awardWins', 'numVotes', 'totalImages', 'totalVideos', 'totalCredits', 'criticReviewsTotal', 'awardNominationsExcludeWins', 'numRegions', 'userReviewsTotal', 'is_Documentary', 'is_History', 'is_Adventure', 'is_Thriller', 'is_Game-Show', 'is_Comedy', 'is_Sci-Fi', 'is_Romance', 'is_Biography', 'is_Musical', 'is_Western', 'is_Music', 'is_Film-Noir', 'is_Adult', 'is_Reality-TV', 'is_News', 'is_Action', 'is_Crime', 'is_Short', 'is_Fantasy', 'is_Family', 'is_Mystery', 'is_Talk-Show', 'is_Drama', 'is_Sport', 'is_War', 'is_Horror', 'is_Animation', 'is_from_Oceania', 'is_from_North America', 'is_from_South America', 'is_from_Asia', 'is_from_Africa', 'is_from_Europe', 'countryOfOrigin_freq_enc', 'countryOfOrigin_NA', 'countryOfOrigin_AF', 'countryOfOrigin_AS', 'countryOfOrigin_EU', 'countryOfOrigin_OC', 'countryOfOrigin_SA', 'countryOfOrigin_UNK', 'fill_runtimeMinutes_Bruno', 'totalNominations', 'totalMedia', 'runtimeMinute

numVotes                       0.673614
numRegions                     0.658810
userReviewsTotal               0.645414
awardNominationsExcludeWins    0.507454
totalNominations               0.483278
awardWins                      0.356987
totalVideos                    0.334553
totalMedia                     0.265579
totalImages                    0.253869
totalCredits                   0.253060
fill_runtimeMinutes_Bruno      0.139566
runtimeMinutes                 0.137464
runtimeMinutes_notitletype     0.130886
is_Horror                      0.111012
countryOfOrigin_freq_enc       0.097695
is_Thriller                    0.096840
countryOfOrigin_NA             0.087582
is_from_North America          0.077097
is_Drama                       0.065677
is_Action                      0.060932
is_Mystery                     0.059723
countryOfOrigin_EU             0.058450
is_Film-Noir                   0.056176
is_Biography                   0.045094
is_Sci-Fi                      0.041341


In [50]:
train = train.loc[~train['outlier_w_type']]
test = test.loc[~test['outlier_w_type']]

In [51]:
import pandas as pd

train = pd.get_dummies(train, columns=['titleType'], prefix='titleType')
test = pd.get_dummies(test, columns=['titleType'], prefix='titleType')

In [52]:
train.columns

Index(['originalTitle', 'rating', 'startYear', 'endYear', 'runtimeMinutes',
       'awardWins', 'numVotes', 'totalImages', 'totalVideos', 'totalCredits',
       'criticReviewsTotal', 'awardNominationsExcludeWins', 'canHaveEpisodes',
       'numRegions', 'userReviewsTotal', 'is_Documentary', 'is_History',
       'is_Adventure', 'is_Thriller', 'is_Game-Show', 'is_Comedy', 'is_Sci-Fi',
       'is_Romance', 'is_Biography', 'is_Musical', 'is_Western', 'is_Music',
       'is_Film-Noir', 'is_Adult', 'is_Reality-TV', 'is_News', 'is_Action',
       'is_Crime', 'is_Short', 'is_Fantasy', 'is_Family', 'is_Mystery',
       'is_Talk-Show', 'is_Drama', 'is_Sport', 'is_War', 'is_Horror',
       'is_Animation', 'is_from_Oceania', 'is_from_North America',
       'is_from_South America', 'is_from_Asia', 'is_from_Africa',
       'is_from_Europe', 'countryOfOrigin', 'countryOfOrigin_freq_enc',
       'countryOfOrigin_NA', 'countryOfOrigin_AF', 'countryOfOrigin_AS',
       'countryOfOrigin_EU', 'countryOfOr

In [53]:
# userReviewsTotal               0.732326
# numVotes                       0.698266
# numRegions                     0.609967
# totalMedia                     0.599626
# totalNominations               0.407327


feats = [
    'userReviewsTotal',
    # 'numVotes',
    # 'numRegions',
    # 'totalMedia',
]

In [54]:
from support.transformations import apply_transformations
train, test = apply_transformations(train, test)

In [55]:
from sklearn.preprocessing import StandardScaler

# Scale features
scaler_X = StandardScaler()
# train.loc[:, feats] = scaler_X.fit_transform(train[feats])
# test.loc[:, feats] = scaler_X.transform(test[feats])
train[feats] = scaler_X.fit_transform(train[feats])
test[feats] = scaler_X.transform(test[feats])

# Scale target variable separately
scaler_y = StandardScaler()
# train.loc[:, ['criticReviewsTotal']] = scaler_y.fit_transform(train[['criticReviewsTotal']])
# test.loc[:, ['criticReviewsTotal']] = scaler_y.transform(test[['criticReviewsTotal']])
train['criticReviewsTotal'] = scaler_y.fit_transform(train[['criticReviewsTotal']])
test['criticReviewsTotal'] = scaler_y.transform(test[['criticReviewsTotal']])

In [56]:
from support.transformations import to_log, apply_transformations

y_train = train['criticReviewsTotal']

y_test = test['criticReviewsTotal']

# X_train, X_test = apply_transformations(
#     train, test
# )

X_train = train[feats]
X_test = test[feats]

In [57]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

def train_regression_model(
    model_class,
    X_train, y_train,
    X_test, y_test,
    param_distributions=None,
    **model_params):
    """
    Trains a regression model and evaluates its performance.

    Parameters:
        model_class: The regression model class to be trained (e.g., sklearn.linear_model.LinearRegression).
        X_train: Training feature DataFrame.
        y_train: Training target Series.
        X_test: Testing feature DataFrame.
        y_test: Testing target Series.
        model_params: Additional parameters to pass to the model.

    Returns:
        A dictionary containing relative MAE, MSE, and R-squared for train and test sets.
    """
    # Initialize and train the model with additional parameters
    if param_distributions:
        model = RandomizedSearchCV(
            estimator=model_class(),
            param_distributions=param_distributions,
            n_iter=100,
            cv=10,
            random_state=42,
            n_jobs=-1,
            scoring='',
        )
        model.fit(X_train, y_train)
        model = model.best_estimator_
    else:
        model = model_class(**model_params)
        model.fit(X_train, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate metrics
    metrics = {
        'train': {
            'mae': mean_absolute_error(y_train, y_train_pred),
            'mse': mean_squared_error(y_train, y_train_pred),
            'r2': r2_score(y_train, y_train_pred)
        },
        'test': {
            'mae': mean_absolute_error(y_test, y_test_pred),
            'mse': mean_squared_error(y_test, y_test_pred),
            'r2': r2_score(y_test, y_test_pred)
        }
    }
    
    return metrics, model

# Linear regressor

In [58]:
from sklearn.linear_model import LinearRegression

metrics, model = train_regression_model(
    LinearRegression,
    X_train, y_train,
    X_test, y_test
)
print("Train Metrics:")
print(f"MAE: {metrics['train']['mae']}")
print(f"MSE: {metrics['train']['mse']}")
print(f"R-squared: {metrics['train']['r2']}")
print("\nTest Metrics:")
print(f"MAE: {metrics['test']['mae']}")
print(f"MSE: {metrics['test']['mse']}")
print(f"R-squared: {metrics['test']['r2']}")

Train Metrics:
MAE: 0.42476832213462895
MSE: 0.4636981843728071
R-squared: 0.5363018156271928

Test Metrics:
MAE: 0.4318262158454747
MSE: 0.4739285795629918
R-squared: 0.5648058281798175


In [59]:
model.coef_

array([0.7323263])

# Ridge

In [60]:
from sklearn.linear_model import Ridge

metrics, ridge_model = train_regression_model(
    Ridge,
    X_train, y_train,
    X_test, y_test,
    param_distributions={'alpha': np.logspace(-4, 4, 100)},
)

print("Train Metrics:")
print(f"MAE: {metrics['train']['mae']}")
print(f"MSE: {metrics['train']['mse']}")
print(f"R-squared: {metrics['train']['r2']}")
print("\nTest Metrics:")
print(f"MAE: {metrics['test']['mae']}")
print(f"MSE: {metrics['test']['mse']}")
print(f"R-squared: {metrics['test']['r2']}")
ridge_model

InvalidParameterError: The 'scoring' parameter of RandomizedSearchCV must be a str among {'precision_weighted', 'precision', 'homogeneity_score', 'neg_mean_poisson_deviance', 'positive_likelihood_ratio', 'balanced_accuracy', 'neg_log_loss', 'neg_root_mean_squared_log_error', 'precision_micro', 'f1_micro', 'roc_auc_ovo_weighted', 'roc_auc_ovr', 'explained_variance', 'recall_micro', 'neg_mean_squared_error', 'adjusted_rand_score', 'neg_mean_absolute_error', 'top_k_accuracy', 'neg_mean_absolute_percentage_error', 'recall_macro', 'recall_weighted', 'roc_auc_ovo', 'neg_mean_squared_log_error', 'd2_absolute_error_score', 'jaccard', 'max_error', 'jaccard_macro', 'f1_samples', 'f1_weighted', 'jaccard_micro', 'r2', 'roc_auc_ovr_weighted', 'jaccard_samples', 'f1', 'average_precision', 'fowlkes_mallows_score', 'neg_negative_likelihood_ratio', 'precision_samples', 'recall', 'rand_score', 'precision_macro', 'completeness_score', 'v_measure_score', 'matthews_corrcoef', 'jaccard_weighted', 'mutual_info_score', 'neg_brier_score', 'normalized_mutual_info_score', 'recall_samples', 'accuracy', 'neg_mean_gamma_deviance', 'neg_median_absolute_error', 'adjusted_mutual_info_score', 'f1_macro', 'roc_auc', 'neg_root_mean_squared_error'}, a callable, an instance of 'list', an instance of 'tuple', an instance of 'dict' or None. Got '' instead.

# Lasso

In [None]:
from sklearn.linear_model import Lasso

metrics, lasso_model = train_regression_model(
    Lasso,
    X_train, y_train,
    X_test, y_test,
    param_distributions={
        'alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]
    }
)

print("Train Metrics:")
print(f"MAE: {metrics['train']['mae']}")
print(f"MSE: {metrics['train']['mse']}")
print(f"R-squared: {metrics['train']['r2']}")
print("\nTest Metrics:")
print(f"MAE: {metrics['test']['mae']}")
print(f"MSE: {metrics['test']['mse']}")
print(f"R-squared: {metrics['test']['r2']}")
lasso_model



Train Metrics:
MAE: 0.4294597644349438
MSE: 0.40492622543676915
R-squared: 0.5950737745632306

Test Metrics:
MAE: 0.43811752598319115
MSE: 0.42695996173356643
R-squared: 0.6079356785818834


# KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

metrics, knn_model = train_regression_model(
    KNeighborsRegressor,
    X_train[feats], y_train,
    X_test[feats], y_test,
    param_distributions={
        'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'weights': ['uniform', 'distance'],
        'p': [1, 2]
    }
)

print("Train Metrics:")
print(f"MAE: {metrics['train']['mae']}")
print(f"MSE: {metrics['train']['mse']}")
print(f"R-squared: {metrics['train']['r2']}")
print("\nTest Metrics:")
print(f"MAE: {metrics['test']['mae']}")
print(f"MSE: {metrics['test']['mse']}")
print(f"R-squared: {metrics['test']['r2']}")
knn_model



Train Metrics:
MAE: 0.36593797808423284
MSE: 0.3263504038117164
R-squared: 0.6736495961882835

Test Metrics:
MAE: 0.3866844516589323
MSE: 0.3653989203891601
R-squared: 0.6644653067992212


# DT

In [None]:
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint, uniform

# Define parameter distribution for RandomizedSearchCV
param_distributions = {
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    # 'max_features': [None, 'sqrt', 'log2'],
    # 'max_leaf_nodes': randint(2, 20),
    'ccp_alpha': uniform(0.0, 0.1),
    'criterion': ['friedman_mse'],
}

# Train the Decision Tree model
dt_metrics, dt_model = train_regression_model(
    DecisionTreeRegressor,
    X_train[feats], y_train,
    X_test[feats], y_test,
    param_distributions=param_distributions
)

# Print metrics
print("Train Metrics:")
print(f"MAE: {dt_metrics['train']['mae']}")
print(f"MSE: {dt_metrics['train']['mse']}")
print(f"R-squared: {dt_metrics['train']['r2']}")
print("\nTest Metrics:")
print(f"MAE: {dt_metrics['test']['mae']}")
print(f"MSE: {dt_metrics['test']['mse']}")
print(f"R-squared: {dt_metrics['test']['r2']}")
dt_model

Train Metrics:
MAE: 0.38367910358393703
MSE: 0.3404863842780144
R-squared: 0.6595136157219854

Test Metrics:
MAE: 0.3948661990183555
MSE: 0.3701772539310233
R-squared: 0.6600775087256185


In [None]:
dt_model.feature_importances_

array([0.75360487, 0.17616249, 0.07023264])