In [1]:
from support.df_init import init

# Load both splits through the project's `init` helper: the first call uses the
# helper's default source, the second reads 'pp_test.csv'.
# NOTE(review): presumably the default source is the training file — confirm in support.df_init.
train, test = init(), init('pp_test.csv')

In [26]:
# Display the column labels of the training frame (used to pick the feature list).
train.columns

Index(['originalTitle', 'rating', 'startYear', 'endYear', 'runtimeMinutes',
       'awardWins', 'numVotes', 'totalImages', 'totalVideos', 'totalCredits',
       'criticReviewsTotal', 'awardNominationsExcludeWins', 'canHaveEpisodes',
       'isAdult', 'numRegions', 'userReviewsTotal', 'is_Documentary',
       'is_History', 'is_Adventure', 'is_Thriller', 'is_Game-Show',
       'is_Comedy', 'is_Sci-Fi', 'is_Romance', 'is_Biography', 'is_Musical',
       'is_Western', 'is_Music', 'is_Film-Noir', 'is_Adult', 'is_Reality-TV',
       'is_News', 'is_Action', 'is_Crime', 'is_Short', 'is_Fantasy',
       'is_Family', 'is_Mystery', 'is_Talk-Show', 'is_Drama', 'is_Sport',
       'is_War', 'is_Horror', 'is_Animation', 'fill_runtimeMinutes',
       'is_from_Oceania', 'is_from_North America', 'is_from_South America',
       'is_from_Asia', 'is_from_Africa', 'is_from_Europe', 'countryOfOrigin',
       'countryOfOrigin_freq_enc', 'countryOfOrigin_NA', 'countryOfOrigin_AF',
       'countryOfOrigin_AS', 

In [25]:
import pandas as pd

# One-hot encode `titleType` in both splits.
# get_dummies replaces the encoded column, so the guards make this cell safe to
# re-run: a second execution is a no-op instead of a KeyError on 'titleType'.
if 'titleType' in train.columns:
    train = pd.get_dummies(train, columns=['titleType'], prefix='titleType')
if 'titleType' in test.columns:
    test = pd.get_dummies(test, columns=['titleType'], prefix='titleType')

# Each split is encoded independently, so a title type that is absent from the
# test split would produce no column there and break any later `test[feats]`.
# Align the test dummies to the train dummies: missing ones are added as all-zero
# columns, and types never seen in training are dropped (a model fitted on the
# train columns could not use them anyway).
title_type_cols = [col for col in train.columns if col.startswith('titleType_')]
other_test_cols = [col for col in test.columns if not col.startswith('titleType_')]
test = test.reindex(columns=other_test_cols + title_type_cols, fill_value=0)

In [54]:
# Feature columns handed to the models, in a fixed order.
#
# Currently excluded (kept here for quick re-enabling):
#   genre flags: 'is_Documentary', 'is_History', 'is_Adventure', 'is_Thriller',
#       'is_Game-Show', 'is_Comedy', 'is_Sci-Fi', 'is_Romance', 'is_Biography',
#       'is_Musical', 'is_Western', 'is_Music', 'is_Film-Noir', 'is_Adult',
#       'is_Reality-TV', 'is_News', 'is_Action', 'is_Crime', 'is_Short',
#       'is_Fantasy', 'is_Family', 'is_Mystery', 'is_Talk-Show', 'is_Drama',
#       'is_Sport', 'is_War', 'is_Horror', 'is_Animation'
#   raw columns: 'runtimeMinutes_notitletype', 'awardNominationsExcludeWins',
#       'awardWins', 'fill_runtimeMinutes', 'totalImages', 'totalVideos'
#   continent flags: 'is_from_Oceania', 'is_from_North America',
#       'is_from_South America', 'is_from_Asia', 'is_from_Africa', 'is_from_Europe'
feats = (
    # plain numeric columns
    ['startYear', 'numVotes', 'totalCredits',
     'numRegions', 'userReviewsTotal', 'rating']
    # country-of-origin encodings
    + [f'countryOfOrigin_{suffix}'
       for suffix in ('freq_enc', 'NA', 'AF', 'AS', 'EU', 'OC', 'SA', 'UNK')]
    # runtime and combined-count columns
    + ['fill_runtimeMinutes_Bruno', 'totalNominations', 'totalMedia']
    # one-hot title type columns
    + [f'titleType_{kind}'
       for kind in ('movie', 'short', 'tvEpisode', 'tvMiniSeries', 'tvMovie',
                    'tvSeries', 'tvShort', 'tvSpecial', 'video')]
)

In [55]:
from support.transformations import to_log, apply_transformations

# Regression target: the 'criticReviewsTotal' column of each split.
y_train, y_test = train['criticReviewsTotal'], test['criticReviewsTotal']

# Model inputs: the `feats` columns of each split, passed together through the
# project's `apply_transformations` helper, which returns the transformed pair.
X_train, X_test = apply_transformations(train[feats], test[feats])

In [56]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaling to both splits so the test split never influences the statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [81]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV

def _regression_metrics(y_true, y_pred):
    """Return MAE, MSE and R-squared of ``y_pred`` against ``y_true`` as a dict."""
    return {
        'mae': mean_absolute_error(y_true, y_pred),
        'mse': mean_squared_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }


def train_regression_model(
    model_class,
    X_train, y_train,
    X_test, y_test,
    param_distributions=None,
    **model_params):
    """
    Fit a regression model and score it on the train and test sets.

    When ``param_distributions`` is given, hyperparameters are tuned with
    ``RandomizedSearchCV`` (up to 100 sampled candidates, 5-fold CV,
    ``random_state=42``, all cores) and the best estimator found is used.
    Otherwise the model is built from ``model_params`` and fitted once.

    Parameters:
        model_class: The regression model class to be trained
            (e.g., sklearn.linear_model.LinearRegression).
        X_train: Training features.
        y_train: Training target.
        X_test: Testing features.
        y_test: Testing target.
        param_distributions: Optional dict mapping parameter names to lists of
            values or distributions to sample from. A falsy value (``None`` or
            an empty dict) disables the search.
        **model_params: Keyword arguments forwarded to ``model_class``. With a
            search they form the fixed base configuration of the estimator;
            parameters named in ``param_distributions`` are set by the search.

    Returns:
        A ``(metrics, model)`` tuple. ``metrics`` has the shape
        ``{'train': {...}, 'test': {...}}`` where each inner dict holds the
        absolute ``'mae'``, ``'mse'`` and ``'r2'`` scores; ``model`` is the
        fitted estimator (the search's ``best_estimator_`` when tuning).
    """
    if param_distributions:
        search = RandomizedSearchCV(
            # Forward model_params here too; building a bare model_class()
            # silently discarded any fixed settings the caller asked for.
            estimator=model_class(**model_params),
            param_distributions=param_distributions,
            n_iter=100,
            cv=5,
            random_state=42,
            n_jobs=-1
        )
        search.fit(X_train, y_train)
        model = search.best_estimator_
    else:
        model = model_class(**model_params)
        model.fit(X_train, y_train)

    # Score the final model on both splits with the same set of metrics.
    metrics = {
        'train': _regression_metrics(y_train, model.predict(X_train)),
        'test': _regression_metrics(y_test, model.predict(X_test)),
    }

    return metrics, model

# Linear regressor

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: LinearRegression on the features before standard scaling.
metrics, model = train_regression_model(
    LinearRegression, X_train, y_train, X_test, y_test
)

# Report MAE / MSE / R-squared for the train split, then for the test split.
print("\n".join([
    "Train Metrics:",
    f"MAE: {metrics['train']['mae']}",
    f"MSE: {metrics['train']['mse']}",
    f"R-squared: {metrics['train']['r2']}",
    "\nTest Metrics:",
    f"MAE: {metrics['test']['mae']}",
    f"MSE: {metrics['test']['mse']}",
    f"R-squared: {metrics['test']['r2']}",
]))

Train Metrics:
Relative MAE: 3.6857834376546603
Relative MSE: 98.4394844548154
R-squared: 0.5915353923704139

Test Metrics:
Relative MAE: 3.8530550845325693
Relative MSE: 135.2235876339985
R-squared: 0.6101357360271473


In [None]:
from sklearn.linear_model import LinearRegression

# Same LinearRegression baseline, this time on the standard-scaled features.
metrics, model = train_regression_model(
    LinearRegression, X_train_scaled, y_train, X_test_scaled, y_test
)

# Report MAE / MSE / R-squared for the train split, then for the test split.
print("\n".join([
    "Train Metrics:",
    f"MAE: {metrics['train']['mae']}",
    f"MSE: {metrics['train']['mse']}",
    f"R-squared: {metrics['train']['r2']}",
    "\nTest Metrics:",
    f"MAE: {metrics['test']['mae']}",
    f"MSE: {metrics['test']['mse']}",
    f"R-squared: {metrics['test']['r2']}",
]))

Train Metrics:
Relative MAE: 3.685374835299085
Relative MSE: 98.43954607957053
R-squared: 0.5915351366647732

Test Metrics:
Relative MAE: 3.852479128005516
Relative MSE: 135.21982268754442
R-squared: 0.6101465907759667


# Ridge

In [None]:
from sklearn.linear_model import Ridge

# Ridge on the features before standard scaling, with the regularisation
# strength `alpha` chosen by the randomized search inside the helper.
metrics, ridge_model = train_regression_model(
    Ridge,
    X_train, y_train,
    X_test, y_test,
    param_distributions=dict(alpha=[0.1, 1.0, 10.0, 100.0, 1000.0]),
)

# Report MAE / MSE / R-squared for the train split, then for the test split.
print("\n".join([
    "Train Metrics:",
    f"MAE: {metrics['train']['mae']}",
    f"MSE: {metrics['train']['mse']}",
    f"R-squared: {metrics['train']['r2']}",
    "\nTest Metrics:",
    f"MAE: {metrics['test']['mae']}",
    f"MSE: {metrics['test']['mse']}",
    f"R-squared: {metrics['test']['r2']}",
]))

# Last expression: display the selected estimator (shows the chosen alpha).
ridge_model



Ridge Regressor Train Metrics:
Relative MAE: 3.6645905051241847
Relative MSE: 98.45784943572525
R-squared: 0.5914591887539178

Ridge Regressor Test Metrics:
Relative MAE: 3.828208663898565
Relative MSE: 135.1786916142347
R-squared: 0.6102651761197139


In [None]:
from sklearn.linear_model import Ridge

# Ridge with its default settings (no search) on the standard-scaled features.
metrics, ridge_model = train_regression_model(
    Ridge, X_train_scaled, y_train, X_test_scaled, y_test
)

# Report MAE / MSE / R-squared for the train split, then for the test split.
print("\n".join([
    "Train Metrics:",
    f"MAE: {metrics['train']['mae']}",
    f"MSE: {metrics['train']['mse']}",
    f"R-squared: {metrics['train']['r2']}",
    "\nTest Metrics:",
    f"MAE: {metrics['test']['mae']}",
    f"MSE: {metrics['test']['mse']}",
    f"R-squared: {metrics['test']['r2']}",
]))

Ridge Regressor Train Metrics:
Relative MAE: 3.6857513275398714
Relative MSE: 98.43948477995677
R-squared: 0.5915353910212728

Ridge Regressor Test Metrics:
Relative MAE: 3.853040057166315
Relative MSE: 135.22569990545117
R-squared: 0.6101296461195387


# Lasso

In [None]:
from sklearn.linear_model import Lasso

# Lasso on the features before standard scaling, with the regularisation
# strength `alpha` chosen by the randomized search inside the helper.
metrics, lasso_model = train_regression_model(
    Lasso,
    X_train, y_train,
    X_test, y_test,
    param_distributions=dict(alpha=[0.1, 1.0, 10.0, 100.0, 1000.0]),
)

# Report MAE / MSE / R-squared for the train split, then for the test split.
print("\n".join([
    "Train Metrics:",
    f"MAE: {metrics['train']['mae']}",
    f"MSE: {metrics['train']['mse']}",
    f"R-squared: {metrics['train']['r2']}",
    "\nTest Metrics:",
    f"MAE: {metrics['test']['mae']}",
    f"MSE: {metrics['test']['mse']}",
    f"R-squared: {metrics['test']['r2']}",
]))

# Last expression: display the selected estimator (shows the chosen alpha).
lasso_model



Lasso Regressor Train Metrics:
Relative MAE: 3.5732122155717865
Relative MSE: 99.07478803517893
R-squared: 0.5888992649148909

Lasso Regressor Test Metrics:
Relative MAE: 3.7260838354365484
Relative MSE: 135.65165273772942
R-squared: 0.6089015779226439


In [None]:
from sklearn.linear_model import Lasso

# Lasso with its default settings (no search) on the standard-scaled features.
metrics, lasso_model = train_regression_model(
    Lasso, X_train_scaled, y_train, X_test_scaled, y_test
)

# Report MAE / MSE / R-squared for the train split, then for the test split.
print("\n".join([
    "Train Metrics:",
    f"MAE: {metrics['train']['mae']}",
    f"MSE: {metrics['train']['mse']}",
    f"R-squared: {metrics['train']['r2']}",
    "\nTest Metrics:",
    f"MAE: {metrics['test']['mae']}",
    f"MSE: {metrics['test']['mse']}",
    f"R-squared: {metrics['test']['r2']}",
]))

Lasso Regressor Train Metrics:
Relative MAE: 3.266747338695367
Relative MSE: 102.2246238796078
R-squared: 0.5758293421149314

Lasso Regressor Test Metrics:
Relative MAE: 3.442057366549745
Relative MSE: 143.98602973601587
R-squared: 0.584872665430662


# K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# K-nearest-neighbours regressor on the standard-scaled features; the search
# covers neighbourhood size, vote weighting and the Minkowski power `p`.
# NOTE(review): a perfect train score here would be expected if the search picks
# weights='distance' (each training point is its own nearest neighbour), so
# compare models on the test metrics — confirm against the displayed estimator.
metrics, knn_model = train_regression_model(
    KNeighborsRegressor,
    X_train_scaled, y_train,
    X_test_scaled, y_test,
    param_distributions=dict(
        n_neighbors=[3, 5, 7, 9, 11, 13, 15],
        weights=['uniform', 'distance'],
        p=[1, 2],
    ),
)

# Report MAE / MSE / R-squared for the train split, then for the test split.
print("\n".join([
    "Train Metrics:",
    f"MAE: {metrics['train']['mae']}",
    f"MSE: {metrics['train']['mse']}",
    f"R-squared: {metrics['train']['r2']}",
    "\nTest Metrics:",
    f"MAE: {metrics['test']['mae']}",
    f"MSE: {metrics['test']['mse']}",
    f"R-squared: {metrics['test']['r2']}",
]))

# Last expression: display the selected estimator and its hyperparameters.
knn_model



KNN Regressor Train Metrics:
Relative MAE: 0.0
Relative MSE: 0.0
R-squared: 1.0

KNN Regressor Test Metrics:
Relative MAE: 1.8157611397408324
Relative MSE: 79.59022916606156
R-squared: 0.7705327402106604


# Decision Tree

In [82]:
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint, uniform

# Search space for RandomizedSearchCV: scipy distributions are sampled,
# lists are drawn from uniformly.
param_distributions = {
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': [None, 'sqrt', 'log2'],
    # 'max_leaf_nodes': randint(2, 20),
    'ccp_alpha': uniform(0.0, 0.1),
    # NOTE(review): 'poisson' needs a non-negative target — confirm that holds
    # for criticReviewsTotal, otherwise those candidates fail during the search.
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    # Pin the tree's own RNG. The tree permutes features randomly at every split
    # (and subsamples them for 'sqrt'/'log2'), so without a seed the selected
    # model and the metrics below change from run to run. It is set through the
    # search space because the helper builds the searched estimator itself.
    'random_state': [42],
}

# Train the Decision Tree model on the standard-scaled features
dt_metrics, dt_model = train_regression_model(
    DecisionTreeRegressor,
    X_train_scaled, y_train,
    X_test_scaled, y_test,
    param_distributions=param_distributions
)

# Print metrics for the train split, then for the test split
print("Train Metrics:")
print(f"MAE: {dt_metrics['train']['mae']}")
print(f"MSE: {dt_metrics['train']['mse']}")
print(f"R-squared: {dt_metrics['train']['r2']}")
print("\nTest Metrics:")
print(f"MAE: {dt_metrics['test']['mae']}")
print(f"MSE: {dt_metrics['test']['mse']}")
print(f"R-squared: {dt_metrics['test']['r2']}")

# Last expression: display the selected tree and its hyperparameters.
dt_model

Train Metrics:
MAE: 1.5537617792623677
MSE: 22.105882345277266
R-squared: 0.9082738942764975

Test Metrics:
MAE: 2.1824497168411923
MSE: 110.14339468005633
R-squared: 0.68244465148612
