In [7]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from torch.utils.tensorboard import SummaryWriter

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

simplefilter("ignore", category=ConvergenceWarning)

if not os.path.exists('results'):
    os.mkdir('results')

if not os.path.exists('results/pca'):
    os.mkdir('results/pca')

if not os.path.exists('results/no_pca'):
    os.mkdir('results/no_pca')

## Classic Machine Learning Techniques

In [8]:
is_pca = True

if is_pca:
    path = 'runs/classicMl/pca/'
    results_path = 'results/pca/'
else:
    path = 'runs/classicMl/no_pca/'
    results_path = 'results/no_pca/'

In [9]:
seed = 42
np.random.seed(seed)

df = pd.read_csv('dataset.csv')
display(df.head())
print(f'Number of samples: {df.shape[0]}')

Unnamed: 0,rating_mean,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,3.893708,0.02875,0.02375,0.0625,0.07575,0.14075,0.14675,0.0635,0.20375,0.202,...,0,0,0,0,0,0,0,0,0,0
1,3.251527,0.04125,0.0405,0.06275,0.08275,0.091,0.06125,0.06925,0.096,0.0765,...,0,0,0,0,0,0,0,0,0,0
2,3.142028,0.04675,0.0555,0.02925,0.087,0.0475,0.04775,0.046,0.14275,0.0285,...,0,0,0,0,0,1,0,0,0,0
3,2.853547,0.03425,0.038,0.0405,0.031,0.065,0.03575,0.029,0.0865,0.032,...,0,0,0,0,0,1,0,0,0,0
4,3.058434,0.043,0.05325,0.038,0.041,0.054,0.06725,0.02775,0.0765,0.0215,...,0,0,0,0,0,0,0,0,0,0


Number of samples: 13798


### Data Preprocessing

In [10]:
X = df.drop('rating_mean', axis=1)
y = df['rating_mean']

# Train Test and Validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=seed)

print('Number of training samples:', X_train.shape)
print('Number of testing samples:', X_test.shape)
print('Number of validation samples:', X_val.shape)

Number of training samples: (9934, 1148)
Number of testing samples: (2760, 1148)
Number of validation samples: (1104, 1148)


In [11]:
if is_pca:
    print("Applying PCA...")
    pca = PCA(n_components=0.95)
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_val = pca.transform(X_val)
    X_test = pca.transform(X_test)
    print(f'Number of features after PCA: {X_train.shape[1]}')
else:
    print("PCA is not applied")

Applying PCA...
Number of features after PCA: 543


# Models
### Linear Regression

In [6]:
writer = SummaryWriter(path + 'linear_regression')

regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

writer.add_scalar('Loss', mse)
writer.flush()

print('Linear Regression Results:')
print('Mean Square Error:', mse)
print('R2 Score:', r2)

open(results_path + 'linear_regression.txt', 'w').write(f'Mean Square Error: {mse}\nR2 Score: {r2}')

Linear Regression Results:
Mean Square Error: 0.0063633984275459
R2 Score: 0.971788385235882


65

### Ridge Regression

In [40]:
from sklearn.linear_model import Ridge

parameters = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
ridge_regressor = Ridge()

grid_search = GridSearchCV(ridge_regressor, parameters, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

print('Grid Search Results:')
print('Best Parameters:', grid_search.best_params_)
print('Best Score (negative MSE) :', grid_search.best_score_)

Grid Search Results:
Best Parameters: {'alpha': 10}
Best Score (negative MSE) : -0.006438020763516988


In [41]:
ridge_regressor = Ridge(alpha=grid_search.best_params_['alpha'])
ridge_regressor.fit(X_train, y_train)

y_pred = ridge_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Ridge Regression Results:')
print('Mean Square Error:', mse)
print('R2 Score:', r2)

Ridge Regression Results:
Mean Square Error: 0.006312944962890811
R2 Score: 0.9720120666106334


In [42]:
writer = SummaryWriter(path + 'ridge_regression')

writer.add_hparams(grid_search.best_params_, {'mse': mse})
writer.flush()

open(results_path + 'ridge_regression.txt', 'w').write(
    f'Mean Square Error: {mse}\nR2 Score: {r2}, \nBest Params: {grid_search.best_params_}')

98

### Lasso Regression

In [22]:
from sklearn.linear_model import Lasso

parameters = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}

lasso_regressor = Lasso()

grid_search = GridSearchCV(lasso_regressor, parameters, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

print('Grid Search Results:')
print('Best Parameters:', grid_search.best_params_)
print('Best Score (negative MSE) :', grid_search.best_score_)

Grid Search Results:
Best Parameters: {'alpha': 0.0001}
Best Score (negative MSE) : -0.006782228799314971


In [23]:
lasso_regressor = Lasso(alpha=grid_search.best_params_['alpha'])
lasso_regressor.fit(X_train, y_train)

y_pred = lasso_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Lasso Regression Results:')
print('Mean Square Error:', mse)
print('R2 Score:', r2)

Lasso Regression Results:
Mean Square Error: 0.006567002523988718
R2 Score: 0.9708857228615796


In [24]:
writer = SummaryWriter(path + 'lasso_regression')

writer.add_hparams(grid_search.best_params_, {'mse': mse})

open(results_path + 'lasso_regression.txt', 'w').write(
    f'Mean Square Error: {mse}\nR2 Score: {r2}\nBest Params: {grid_search.best_params_}')

68

### Elastic Net Regression

In [25]:
from sklearn.linear_model import ElasticNet

param_grid = {'alpha': [1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
              'l1_ratio': [0.3, 0.5, 0.7]}

elastic_net_regressor = ElasticNet()

grid_search = GridSearchCV(elastic_net_regressor, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

print('Grid Search Results:')
print('Best Parameters:', grid_search.best_params_)
print('Best Score (negative MSE) :', grid_search.best_score_)

Grid Search Results:
Best Parameters: {'alpha': 0.0001, 'l1_ratio': 0.3}
Best Score (negative MSE) : -0.006434737180532661


In [26]:
elastic_net_regressor = ElasticNet(alpha=grid_search.best_params_['alpha'],
                                   l1_ratio=grid_search.best_params_['l1_ratio'])
elastic_net_regressor.fit(X_train, y_train)

y_pred = elastic_net_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Elastic Net Regression Results:')
print('Mean Square Error:', mse)
print('R2 Score:', r2)

Elastic Net Regression Results:
Mean Square Error: 0.006322556484543574
R2 Score: 0.9719694547029787


In [27]:
writer = SummaryWriter(path + 'elastic_net_regression')

writer.add_hparams(grid_search.best_params_, {'mse': mse})

open(results_path + 'elastic_net_regression.txt', 'w').write(
    f'Mean Square Error: {mse}\nR2 Score: {r2}\nBest Params: {grid_search.best_params_}')

68

### Random Forest Regressor

In [28]:
from sklearn.ensemble import RandomForestRegressor

pram_grid = {'n_estimators': [50, 100, 150],
             'max_depth': [5, 10],
             'criterion': ['squared_error', 'friedman_mse', 'poisson']}

random_forest_regressor = RandomForestRegressor()

grid_search = GridSearchCV(random_forest_regressor, pram_grid, scoring='neg_mean_squared_error', cv=2, verbose=3)
grid_search.fit(X_train, y_train)

print('Grid Search Results:')
print('Best Parameters:', grid_search.best_params_)
print('Best Score (negative MSE) :', grid_search.best_score_)

Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV 1/2] END criterion=squared_error, max_depth=None, n_estimators=10;, score=-0.053 total time=  24.6s
[CV 2/2] END criterion=squared_error, max_depth=None, n_estimators=10;, score=-0.053 total time=  23.5s
[CV 1/2] END criterion=squared_error, max_depth=None, n_estimators=50;, score=-0.047 total time= 2.1min
[CV 2/2] END criterion=squared_error, max_depth=None, n_estimators=50;, score=-0.047 total time= 2.0min
[CV 1/2] END criterion=squared_error, max_depth=None, n_estimators=100;, score=-0.046 total time= 4.1min
[CV 2/2] END criterion=squared_error, max_depth=None, n_estimators=100;, score=-0.045 total time= 3.9min
[CV 1/2] END criterion=squared_error, max_depth=5, n_estimators=10;, score=-0.065 total time=  10.8s
[CV 2/2] END criterion=squared_error, max_depth=5, n_estimators=10;, score=-0.064 total time=  10.9s
[CV 1/2] END criterion=squared_error, max_depth=5, n_estimators=50;, score=-0.063 total time=  53.7s
[CV 2/2] E

In [29]:
random_forest_regressor = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                                max_depth=grid_search.best_params_['max_depth'],
                                                criterion=grid_search.best_params_['criterion'])
random_forest_regressor.fit(X_train, y_train)

y_pred = random_forest_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Random Forest Regression Results:')
print('Mean Square Error:', mse)
print('R2 Score:', r2)

Random Forest Regression Results:
Mean Square Error: 0.03765388632790462
R2 Score: 0.8330645255754618


In [30]:
writer = SummaryWriter(path + 'random_forest_regression')

writer.add_hparams(grid_search.best_params_, {'mse': mse})

open(results_path + 'random_forest_regression.txt', 'w').write(
    f'Mean Square Error: {mse}\nR2 Score: {r2}\nBest Params: {grid_search.best_params_}')

67

### Support Vector Regressor

In [31]:
from sklearn.svm import SVR

param_grid = {'C': [0.001, 0.01, 0.1, 1],
              'epsilon': [0.001, 0.01, 0.1, 1],
              'kernel': ['linear', 'poly', 'rbf']}

svr_regressor = SVR()

grid_search = GridSearchCV(svr_regressor, parameters, scoring='neg_mean_squared_error', cv=2, verbose=3)
grid_search.fit(X_train, y_train)

print('Grid Search Results:')
print('Best Parameters:', grid_search.best_params_)
print('Best Score (negative MSE) :', grid_search.best_score_)

Fitting 2 folds for each of 48 candidates, totalling 96 fits
[CV 1/2] END C=0.001, epsilon=0.001, kernel=linear;, score=-0.015 total time=   8.7s
[CV 2/2] END C=0.001, epsilon=0.001, kernel=linear;, score=-0.014 total time=   8.7s
[CV 1/2] END C=0.001, epsilon=0.001, kernel=poly;, score=-0.209 total time=   8.9s
[CV 2/2] END C=0.001, epsilon=0.001, kernel=poly;, score=-0.190 total time=   8.9s
[CV 1/2] END C=0.001, epsilon=0.001, kernel=rbf;, score=-0.167 total time=  14.3s
[CV 2/2] END C=0.001, epsilon=0.001, kernel=rbf;, score=-0.154 total time=  14.3s
[CV 1/2] END C=0.001, epsilon=0.01, kernel=linear;, score=-0.015 total time=   8.0s
[CV 2/2] END C=0.001, epsilon=0.01, kernel=linear;, score=-0.014 total time=   8.0s
[CV 1/2] END C=0.001, epsilon=0.01, kernel=poly;, score=-0.209 total time=   8.7s
[CV 2/2] END C=0.001, epsilon=0.01, kernel=poly;, score=-0.190 total time=   8.8s
[CV 1/2] END C=0.001, epsilon=0.01, kernel=rbf;, score=-0.167 total time=  14.0s
[CV 2/2] END C=0.001, epsi

In [32]:
svr_regressor = SVR(C=grid_search.best_params_['C'],
                    epsilon=grid_search.best_params_['epsilon'],
                    kernel=grid_search.best_params_['kernel'])

svr_regressor.fit(X_train, y_train)

y_pred = svr_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Support Vector Regression Results:')
print('Mean Square Error:', mse)
print('R2 Score:', r2)

Support Vector Regression Results:
Mean Square Error: 0.006360206657741281
R2 Score: 0.9718025356904181


In [33]:
writer = SummaryWriter(path + 'support_vector_regression')

writer.add_hparams(grid_search.best_params_, {'mse': mse})

open(results_path + 'support_vector_regression.txt', 'w').write(
    f'Mean Square Error: {mse}\nR2 Score: {r2}\nBest Params: {grid_search.best_params_}')

68

### K-Nearest Neighbors

In [43]:
from sklearn.neighbors import KNeighborsRegressor

param_grid = {'n_neighbors': np.arange(1, 21),
              'weights': ['uniform', 'distance']}

knn_regressor = KNeighborsRegressor()

grid_search = GridSearchCV(knn_regressor, param_grid, scoring='neg_mean_squared_error', cv=2, verbose=3)

grid_search.fit(X_train, y_train)

print('Grid Search Results:')
print('Best Parameters:', grid_search.best_params_)
print('Best Score (negative MSE) :', grid_search.best_score_)

Fitting 2 folds for each of 40 candidates, totalling 80 fits
[CV 1/2] END ...n_neighbors=1, weights=uniform;, score=-0.088 total time=   0.1s
[CV 2/2] END ...n_neighbors=1, weights=uniform;, score=-0.091 total time=   0.1s
[CV 1/2] END ..n_neighbors=1, weights=distance;, score=-0.088 total time=   0.1s
[CV 2/2] END ..n_neighbors=1, weights=distance;, score=-0.091 total time=   0.1s
[CV 1/2] END ...n_neighbors=2, weights=uniform;, score=-0.065 total time=   0.0s
[CV 2/2] END ...n_neighbors=2, weights=uniform;, score=-0.066 total time=   0.1s
[CV 1/2] END ..n_neighbors=2, weights=distance;, score=-0.064 total time=   0.1s
[CV 2/2] END ..n_neighbors=2, weights=distance;, score=-0.066 total time=   0.1s
[CV 1/2] END ...n_neighbors=3, weights=uniform;, score=-0.058 total time=   0.2s
[CV 2/2] END ...n_neighbors=3, weights=uniform;, score=-0.059 total time=   0.1s
[CV 1/2] END ..n_neighbors=3, weights=distance;, score=-0.058 total time=   0.1s
[CV 2/2] END ..n_neighbors=3, weights=distance;,

In [44]:
knn_regressor = KNeighborsRegressor(n_neighbors=grid_search.best_params_['n_neighbors'],
                                    weights=grid_search.best_params_['weights'])

knn_regressor.fit(X_train, y_train)

y_pred = knn_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('K-Nearest Neighbors Regression Results:')
print('Mean Square Error:', mse)
print('R2 Score:', r2)

K-Nearest Neighbors Regression Results:
Mean Square Error: 0.04504755863379536
R2 Score: 0.8002852744943125


In [47]:
writer = SummaryWriter(path + 'knn_regression')

writer.add_hparams(grid_search.best_params_, {'mse': mse})

open(results_path + 'knn_regression.txt', 'w').write(
    f'Mean Square Error: {mse}\nR2 Score: {r2}\nBest Params: {grid_search.best_params_}')


ValueError: value should be one of int, float, str, bool, or torch.Tensor

In [None]:
writer.close()