In [86]:
import re
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression, ElasticNet
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    KFold,
    cross_val_score,
    learning_curve,
    validation_curve,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
# Suppress scikit-learn ConvergenceWarning messages for the rest of the notebook
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

# Draw every figure with matplotlib's 'ggplot' style
plt.style.use('ggplot')

In [87]:
# Read the pre-processed dataset from the current working directory
fp_data = Path.cwd()
file_path = fp_data.joinpath('data_processed.csv')
df = pd.read_csv(file_path)

# Target: average ratings
y = df['Avg_ratings']

# Predictors: processed description, processed title, and every remaining column
X_des, X_tit = df['proc_de'], df['proc_ti']
X_num = df.drop(columns=['proc_de', 'proc_ti', 'Avg_ratings'])

In [88]:
# Split into development (2/3) and test data (1/3).
# A single call applies the same shuffled row indices to every array, so the
# title, description, numerical and target rows are guaranteed to stay aligned
# (three separate calls only line up as long as seed and lengths happen to match).
(X_dev_tit, X_test_tit,
 X_dev_des, X_test_des,
 X_dev_num, X_test_num,
 y_dev, y_test) = train_test_split(X_tit, X_des, X_num, y,
                                   test_size=1/3, random_state=1234)

## Lasso

In [89]:
# Candidate hyperparameters shared by all feature sets
lambdas = np.logspace(start=-4, stop=4, num=10)  # regularisation strengths, 1e-4 ... 1e4
degrees = list(range(1, 4))                      # polynomial degrees 1, 2, 3

In [90]:
# Text model: TF-IDF features fed into a Lasso regression
pipe_words_las = make_pipeline(TfidfVectorizer(), Lasso(random_state=1234))

# Only the regularisation strength is searched for the text models
param_words_las = dict(lasso__alpha=lambdas)

# Two separate 3-fold grid searches (title, description) built from the same
# pipeline and grid, scored on negated mean squared error
gs_lasso_title, gs_lasso_des = (
    GridSearchCV(pipe_words_las, param_words_las,
                 scoring='neg_mean_squared_error', cv=3, return_train_score=True)
    for _ in range(2)
)

In [91]:
# Numerical model: polynomial expansion -> standardisation -> Lasso
pipe_num_las = make_pipeline(
    PolynomialFeatures(include_bias=True),
    StandardScaler(),
    Lasso(random_state=1234),
)

# Search over both the polynomial degree and the regularisation strength
param_num_las = dict(polynomialfeatures__degree=degrees, lasso__alpha=lambdas)

# 3-fold grid search for the numerical variables
gs_lasso_num = GridSearchCV(
    pipe_num_las,
    param_num_las,
    scoring='neg_mean_squared_error',
    cv=3,
    return_train_score=True,
)

In [92]:
# Fit every Lasso grid search on its own development features and keep the
# fitted searches in the order: title, description, numerical.
# (fit() returns the search object itself, so the list holds the same objects.)
las_result = [
    search.fit(features, y_dev)
    for search, features in (
        (gs_lasso_title, X_dev_tit),
        (gs_lasso_des, X_dev_des),
        (gs_lasso_num, X_dev_num),
    )
]

In [149]:
las_models = ['Lasso_title', 'Lasso_des', 'Lasso_num']
las_dataset = [X_dev_tit, X_dev_des, X_dev_num]
accu_las, mse_las, para_las = [], [], []

# For each fitted search: store the selected hyperparameters, the best
# cross-validated MSE, and the share of development ratings that the
# prediction matches once both are rounded to one decimal.
# Note: the predictions are made on the same development data the search was fitted on.
for i, name in enumerate(las_models):
    search = las_result[i]

    para = search.best_params_
    mse_l = -search.best_score_  # best_score_ is negated MSE; flip the sign back
    y_hat = search.predict(las_dataset[i])
    accuracy = np.mean(y_dev.round(1) == y_hat.round(1))

    para_las.append(para)
    mse_las.append(mse_l)
    accu_las.append(accuracy)

    print(f'{name} Best parameter set: {para}',
          f'Best mse: {mse_l}',
          f'Best accuracy: {accuracy}')

## Elasticnet

In [94]:
# Candidate hyperparameters for the elastic-net searches
lambdas = np.logspace(start=-4, stop=4, num=10)       # regularisation strengths, 1e-4 ... 1e4
degrees = list(range(1, 4))                           # polynomial degrees 1, 2, 3
ela__l1_ratio = np.linspace(start=0, stop=1, num=10)  # candidate l1_ratio values from 0 to 1

In [95]:
# Text model: TF-IDF features fed into an elastic-net regression
pipe_words_elas = make_pipeline(TfidfVectorizer(), ElasticNet(random_state=1234))

# Search over the regularisation strength and the l1_ratio
param_words_elas = dict(elasticnet__alpha=lambdas,
                        elasticnet__l1_ratio=ela__l1_ratio)

# Two separate 3-fold grid searches (title, description) built from the same
# pipeline and grid, scored on negated mean squared error
gs_elas_title, gs_elas_des = (
    GridSearchCV(pipe_words_elas, param_words_elas,
                 scoring='neg_mean_squared_error', cv=3, return_train_score=True)
    for _ in range(2)
)

In [96]:
# Numerical model: polynomial expansion -> standardisation -> elastic net
pipe_num_ela = make_pipeline(
    PolynomialFeatures(include_bias=True),
    StandardScaler(),
    ElasticNet(random_state=1234),
)

# Search over polynomial degree, regularisation strength and l1_ratio
param_num_ela = dict(polynomialfeatures__degree=degrees,
                     elasticnet__alpha=lambdas,
                     elasticnet__l1_ratio=ela__l1_ratio)

# 3-fold grid search for the numerical variables
gs_elas_num = GridSearchCV(
    pipe_num_ela,
    param_num_ela,
    scoring='neg_mean_squared_error',
    cv=3,
    return_train_score=True,
)

In [97]:
# Fit every elastic-net grid search on its own development features and keep
# the fitted searches in the order: title, description, numerical.
# (fit() returns the search object itself, so the list holds the same objects.)
elastic_result = [
    search.fit(features, y_dev)
    for search, features in (
        (gs_elas_title, X_dev_tit),
        (gs_elas_des, X_dev_des),
        (gs_elas_num, X_dev_num),
    )
]

In [150]:
elas_models = ['Elastic_title', 'Elastic_des', 'Elastic_num']
elas_dataset = [X_dev_tit, X_dev_des, X_dev_num]
accu_elas, mse_elas, para_elas = [], [], []

# For each fitted search: store the best cross-validated MSE, the share of
# development ratings matched after rounding to one decimal, and the selected
# hyperparameters.
# Note: the predictions are made on the same development data the search was fitted on.
for i, name in enumerate(elas_models):
    search = elastic_result[i]

    mse_e = -search.best_score_  # best_score_ is negated MSE; flip the sign back
    y_hat = search.predict(elas_dataset[i])
    accuracy = np.mean(y_dev.round(1) == y_hat.round(1))
    para = search.best_params_

    mse_elas.append(mse_e)
    accu_elas.append(accuracy)
    para_elas.append(para)

    print(f'{name} Best parameter set: {para}',
          f'Best mse: {mse_e}',
          f'Best accuracy: {accuracy}')

## OLS

In [99]:
# Polynomial degrees to try for the numerical OLS model
degrees = list(range(1, 4))

In [116]:
# 3-fold cross-validation of OLS on the title text.
# Collects, per fold, the validation MSE and the share of validation ratings
# matched by the prediction after rounding both to one decimal.
kfold = KFold(n_splits=3)
folds_tit = list(kfold.split(X_dev_tit, y_dev))
mseTit = []
accTit = []

for train_idx, val_idx in folds_tit:
    # A fresh pipeline per fold so the TF-IDF vocabulary is learned on the training part only
    pipe_words_ols = make_pipeline(TfidfVectorizer(),
                                   LinearRegression())

    # KFold yields positional indices; y_dev keeps its shuffled labels from the
    # train/test split, so it must be sliced with .iloc exactly like X.
    X_train_tit, y_train = X_dev_tit.iloc[train_idx], y_dev.iloc[train_idx]
    X_val_tit, y_val = X_dev_tit.iloc[val_idx], y_dev.iloc[val_idx]
    pipe_words_ols.fit(X_train_tit, y_train)

    # Predict once and reuse the result for both metrics
    y_val_hat = pipe_words_ols.predict(X_val_tit)
    mseTit.append(mse(y_val, y_val_hat))
    accTit.append(np.mean(y_val_hat.round(1) == y_val.round(1)))

In [None]:
# 3-fold cross-validation of OLS on the description text.
# Collects, per fold, the validation MSE and the share of validation ratings
# matched by the prediction after rounding both to one decimal.
kfold = KFold(n_splits=3)
folds_des = list(kfold.split(X_dev_des, y_dev))
mseDes = []
accDes = []

for train_idx, val_idx in folds_des:
    # A fresh pipeline per fold so the TF-IDF vocabulary is learned on the training part only
    pipe_words_ols = make_pipeline(TfidfVectorizer(),
                                   LinearRegression())

    # KFold yields positional indices; y_dev keeps its shuffled labels from the
    # train/test split, so it must be sliced with .iloc exactly like X.
    X_train_des, y_train = X_dev_des.iloc[train_idx], y_dev.iloc[train_idx]
    X_val_des, y_val = X_dev_des.iloc[val_idx], y_dev.iloc[val_idx]
    pipe_words_ols.fit(X_train_des, y_train)

    # Predict once and reuse the result for both metrics
    y_val_hat = pipe_words_ols.predict(X_val_des)
    mseDes.append(mse(y_val, y_val_hat))
    accDes.append(np.mean(y_val_hat.round(1) == y_val.round(1)))

In [117]:
# Numerical model: polynomial expansion -> standardisation -> OLS
pipe_num_ols = make_pipeline(
    PolynomialFeatures(include_bias=True),
    StandardScaler(),
    LinearRegression(),
)

# The polynomial degree is the only hyperparameter searched for OLS
param_num_ols = dict(polynomialfeatures__degree=degrees)

# 3-fold grid search for the numerical variables
gs_ols_num = GridSearchCV(
    pipe_num_ols,
    param_num_ols,
    scoring='neg_mean_squared_error',
    cv=3,
    return_train_score=True,
)

In [151]:
# Run the OLS polynomial-degree grid search on the numerical development features
gs_ols_num.fit(X_dev_num, y_dev)

In [152]:
# Summarise the OLS results on the development data, ordered:
# title, description, numerical.

# Title and description: average the per-fold values collected in
# mseTit/accTit and mseDes/accDes.
MSE_tit = np.mean(mseTit)
accuracy_tit = np.mean(accTit)

MSE_des = np.mean(mseDes)
accuracy_des = np.mean(accDes)

# Numerical variables: MSE is the best cross-validated score of the grid search
# (sign flipped because the scoring is negated MSE).
# NOTE(review): this accuracy comes from predicting the same development data
# the search was fitted on, unlike the per-fold text accuracies above.
y_hat_num = gs_ols_num.predict(X_dev_num)
MSE_num = -gs_ols_num.best_score_
accuracy_num = np.mean(y_dev.round(1) == y_hat_num.round(1))

# Collect MSE and accuracy in the same order as ols_models
mse_ols = [MSE_tit, MSE_des, MSE_num]
accu_ols = [accuracy_tit, accuracy_des, accuracy_num]

para_ols = gs_ols_num.best_params_  # dictionary

ols_models = ['OLS_title', 'OLS_des', 'OLS_num']

# Print MSE and accuracy per model; a space now separates the model name from
# the metrics, matching the Lasso and elastic-net reports.
for i, name in enumerate(ols_models):
    print(f'{name} Best mse: {mse_ols[i]}',
          f'Best accuracy: {accu_ols[i]}')

# Only the numerical model has a tuned hyperparameter
print(f'{ols_models[2]} Best parameter set: {para_ols}')

## Lasso - test

In [120]:
def _best_text_lasso(best_params):
    """Build a TF-IDF + Lasso pipeline using the alpha stored in ``best_params``."""
    return make_pipeline(TfidfVectorizer(),
                         Lasso(alpha=best_params['lasso__alpha'], random_state=1234))


# para_las is indexed 0 = title, 1 = description, 2 = numerical
pipe_tit_las_best = _best_text_lasso(para_las[0])
pipe_des_las_best = _best_text_lasso(para_las[1])

# Numerical pipeline with the selected degree and alpha
pipe_num_las_best = make_pipeline(
    PolynomialFeatures(degree=para_las[2]['polynomialfeatures__degree'], include_bias=True),
    StandardScaler(),
    Lasso(alpha=para_las[2]['lasso__alpha'], random_state=1234),
)


In [153]:
# Fit the tuned Lasso pipelines on the full development data
# Title words
pipe_tit_las_best.fit(X_dev_tit, y_dev)

# Description words
pipe_des_las_best.fit(X_dev_des, y_dev)

# Numerical variables
pipe_num_las_best.fit(X_dev_num, y_dev)

In [154]:
# Evaluate the fitted Lasso pipelines on the held-out test data.
# "Accuracy" is the share of test ratings matched by the prediction after
# rounding both to one decimal.

# Title
y_hat_tit_test_las = pipe_tit_las_best.predict(X_test_tit)
MSE_tit_test_las = mse(y_hat_tit_test_las, y_test)
accuracy_tit_test_las = np.mean(y_test.round(1) == y_hat_tit_test_las.round(1))

# Description
y_hat_des_test_las = pipe_des_las_best.predict(X_test_des)
MSE_des_test_las = mse(y_hat_des_test_las, y_test)
accuracy_des_test_las = np.mean(y_test.round(1) == y_hat_des_test_las.round(1))

# Numerical variables
y_hat_num_test_las = pipe_num_las_best.predict(X_test_num)
MSE_num_test_las = mse(y_hat_num_test_las, y_test)
accuracy_num_test_las = np.mean(y_test.round(1) == y_hat_num_test_las.round(1))

# Collect results in the order: title, description, numerical
mse_las_test = [MSE_tit_test_las, MSE_des_test_las, MSE_num_test_las]
accu_las_test = [accuracy_tit_test_las, accuracy_des_test_las, accuracy_num_test_las]

print(accu_las_test, mse_las_test)

## Elastic net - test

In [123]:
def _best_elasticnet(best_params):
    """Create an ElasticNet using the alpha and l1_ratio stored in ``best_params``."""
    return ElasticNet(alpha=best_params['elasticnet__alpha'],
                      l1_ratio=best_params['elasticnet__l1_ratio'],
                      random_state=1234)


# para_elas is indexed 0 = title, 1 = description, 2 = numerical
pipe_tit_elas_best = make_pipeline(TfidfVectorizer(), _best_elasticnet(para_elas[0]))
pipe_des_elas_best = make_pipeline(TfidfVectorizer(), _best_elasticnet(para_elas[1]))

# Numerical pipeline with the selected degree, alpha and l1_ratio
pipe_num_elas_best = make_pipeline(
    PolynomialFeatures(degree=para_elas[2]['polynomialfeatures__degree'], include_bias=True),
    StandardScaler(),
    _best_elasticnet(para_elas[2]),
)

In [155]:
# Fit the tuned elastic-net pipelines on the full development data
# Title words
pipe_tit_elas_best.fit(X_dev_tit, y_dev)

# Description words
pipe_des_elas_best.fit(X_dev_des, y_dev)

# Numerical variables
pipe_num_elas_best.fit(X_dev_num, y_dev)

In [156]:
# Evaluate the fitted elastic-net pipelines on the held-out test data.
# "Accuracy" is the share of test ratings matched by the prediction after
# rounding both to one decimal.

# Title
y_hat_tit_test_elas = pipe_tit_elas_best.predict(X_test_tit)
MSE_tit_test_elas = mse(y_hat_tit_test_elas, y_test)
accuracy_tit_test_elas = np.mean(y_test.round(1) == y_hat_tit_test_elas.round(1))

# Description
y_hat_des_test_elas = pipe_des_elas_best.predict(X_test_des)
MSE_des_test_elas = mse(y_hat_des_test_elas, y_test)
accuracy_des_test_elas = np.mean(y_test.round(1) == y_hat_des_test_elas.round(1))

# Numerical variables
y_hat_num_test_elas = pipe_num_elas_best.predict(X_test_num)
MSE_num_test_elas = mse(y_hat_num_test_elas, y_test)
accuracy_num_test_elas = np.mean(y_test.round(1) == y_hat_num_test_elas.round(1))

# Collect results in the order: title, description, numerical
mse_elas_test = [MSE_tit_test_elas, MSE_des_test_elas, MSE_num_test_elas]
accu_elas_test = [accuracy_tit_test_elas, accuracy_des_test_elas, accuracy_num_test_elas]

print(accu_elas_test, mse_elas_test)

## OLS - test

In [126]:
# Text pipelines: OLS has no tuned hyperparameter, so these are plain TF-IDF + OLS
pipe_tit_ols_best = make_pipeline(TfidfVectorizer(), LinearRegression())
pipe_des_ols_best = make_pipeline(TfidfVectorizer(), LinearRegression())

# Numerical pipeline with the polynomial degree stored in para_ols
_ols_degree = para_ols['polynomialfeatures__degree']
pipe_num_ols_best = make_pipeline(
    PolynomialFeatures(degree=_ols_degree, include_bias=True),
    StandardScaler(),
    LinearRegression(),
)

In [157]:
# Fit the OLS pipelines on the full development data
# Title words
pipe_tit_ols_best.fit(X_dev_tit, y_dev)

# Description words
pipe_des_ols_best.fit(X_dev_des, y_dev)

# Numerical variables
pipe_num_ols_best.fit(X_dev_num, y_dev)

In [158]:
# Evaluate the fitted OLS pipelines on the held-out test data.
# "Accuracy" is the share of test ratings matched by the prediction after
# rounding both to one decimal.

# Title
y_hat_tit_test_ols = pipe_tit_ols_best.predict(X_test_tit)
MSE_tit_test_ols = mse(y_hat_tit_test_ols, y_test)
accuracy_tit_test_ols = np.mean(y_test.round(1) == y_hat_tit_test_ols.round(1))

# Description
y_hat_des_test_ols = pipe_des_ols_best.predict(X_test_des)
MSE_des_test_ols = mse(y_hat_des_test_ols, y_test)
accuracy_des_test_ols = np.mean(y_test.round(1) == y_hat_des_test_ols.round(1))

# Numerical variables
y_hat_num_test_ols = pipe_num_ols_best.predict(X_test_num)
MSE_num_test_ols = mse(y_hat_num_test_ols, y_test)
accuracy_num_test_ols = np.mean(y_test.round(1) == y_hat_num_test_ols.round(1))

# Collect results in the order: title, description, numerical
mse_ols_test = [MSE_tit_test_ols, MSE_des_test_ols, MSE_num_test_ols]
accu_ols_test = [accuracy_tit_test_ols, accuracy_des_test_ols, accuracy_num_test_ols]

print(accu_ols_test, mse_ols_test)

## Learning curves

In [132]:
# Panel titles and the matching development feature sets for the learning
# curves; both lists share the order: numerical, title, description
Title = ['Numerical variables', 'Title', 'Description']
x_data = [X_dev_num, X_dev_tit, X_dev_des]

In [159]:
# Learning curves for the tuned Lasso pipelines (same order as x_data / Title)
best_pipeline_lasso = [pipe_num_las_best, pipe_tit_las_best, pipe_des_las_best]

f_lasso, ax1 = plt.subplots(nrows=1, ncols=3, figsize=(20, 3))

for i, model1 in enumerate(best_pipeline_lasso):
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model1,
        X=x_data[i],
        y=y_dev,
        train_sizes=np.arange(0.05, 1.05, .05),
        scoring='neg_mean_squared_error',
        cv=3,
    )

    # Scores are negated MSE: flip the sign and shade the band spanned by the folds
    panel = ax1[i]
    panel.fill_between(train_sizes, -test_scores.min(axis=1), -test_scores.max(axis=1),
                       alpha=0.25, label='Validation', color='blue')
    panel.fill_between(train_sizes, -train_scores.min(axis=1), -train_scores.max(axis=1),
                       alpha=0.25, label='Train', color='red')
    panel.set_title(f'Lasso - {Title[i]}')

ax1[0].set_ylabel('Mean squared error')

plt.savefig(fp_data / 'lasso.png')

In [160]:
# Learning curves for the tuned elastic-net pipelines (same order as x_data / Title)
best_pipeline_elastic = [pipe_num_elas_best, pipe_tit_elas_best, pipe_des_elas_best]

f_elas, ax2 = plt.subplots(nrows=1, ncols=3, figsize=(20, 3))

for j, model2 in enumerate(best_pipeline_elastic):
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model2,
        X=x_data[j],
        y=y_dev,
        train_sizes=np.arange(0.05, 1.05, .05),
        scoring='neg_mean_squared_error',
        cv=3,
    )

    # Scores are negated MSE: flip the sign and shade the band spanned by the folds
    panel = ax2[j]
    panel.fill_between(train_sizes, -test_scores.min(axis=1), -test_scores.max(axis=1),
                       alpha=0.25, label='Validation', color='blue')
    panel.fill_between(train_sizes, -train_scores.min(axis=1), -train_scores.max(axis=1),
                       alpha=0.25, label='Train', color='red')
    panel.set_title(f'Elastic - {Title[j]}')

ax2[0].set_ylabel('Mean squared error')
ax2[0].set_ylim(-0.05, 0.6)

plt.savefig(fp_data / 'elastic.png')

In [161]:
# Learning curves for the OLS pipelines (same order as x_data / Title)
best_pipeline_ols = [pipe_num_ols_best, pipe_tit_ols_best, pipe_des_ols_best]

f_ols, ax3 = plt.subplots(nrows=1, ncols=3, figsize=(20, 3))

for k, model3 in enumerate(best_pipeline_ols):
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model3,
        X=x_data[k],
        y=y_dev,
        train_sizes=np.arange(0.05, 1.05, .05),
        scoring='neg_mean_squared_error',
        cv=3,
    )

    # Scores are negated MSE: flip the sign and shade the band spanned by the folds
    panel = ax3[k]
    panel.fill_between(train_sizes, -test_scores.min(axis=1), -test_scores.max(axis=1),
                       alpha=0.25, label='Validation', color='blue')
    panel.fill_between(train_sizes, -train_scores.min(axis=1), -train_scores.max(axis=1),
                       alpha=0.25, label='Train', color='red')
    panel.set_title(f'OLS - {Title[k]}')

# Axis label, fixed y-ranges for the first two panels, and a legend on the first panel
ax3[0].set_ylabel('Mean squared error')
ax3[1].set_ylim(-0.05, 1.25)
ax3[0].legend();
ax3[0].set_ylim(-0.05, 0.6)

plt.savefig(fp_data / 'ols.png')

In [163]:
# Stack the three saved learning-curve images into a single figure
fig_combined = plt.figure(figsize=(20, 10))

# Grid layout: three rows, one column
rows, columns = 3, 1

# Read the images written by the learning-curve cells
lasso = fp_data / 'lasso.png'
image_lasso = Image.open(lasso)

elastic = fp_data / 'elastic.png'
image_elas = Image.open(elastic)

ols = fp_data / 'ols.png'
image_ols = Image.open(ols)

# Top to bottom: OLS, Lasso, elastic net - each shown without axes
for position, panel_image in enumerate((image_ols, image_lasso, image_elas), start=1):
    fig_combined.add_subplot(rows, columns, position)
    plt.imshow(panel_image)
    plt.axis('off')

plt.savefig(fp_data / 'Learning curves.png')

## Validation curves

In [136]:
# Figure names and the matching development feature sets for the validation
# curves; both lists share the order: numerical, title, description
Title = ['Numerical variables (1)', 'Title (2)', 'Description (3)']
x_data = [X_dev_num, X_dev_tit, X_dev_des]

In [164]:
# Validation curves over lambda for the tuned Lasso pipelines
# (same order as x_data / Title)
best_pipeline_lasso = [pipe_num_las_best, pipe_tit_las_best, pipe_des_las_best]
last_panel = len(best_pipeline_lasso) - 1

results_lasso_df = pd.DataFrame()

for i, model in enumerate(best_pipeline_lasso):
    train_scores, test_scores = validation_curve(
        estimator=model,
        X=x_data[i],
        y=y_dev,
        param_name='lasso__alpha',   # pipeline parameter that is varied
        param_range=lambdas,         # values to consider
        scoring='neg_mean_squared_error',
        cv=3,
    )

    # Mean train/validation MSE per lambda (scores are negated MSE)
    mse_score = pd.DataFrame(
        {'lambda': lambdas,
         'Train': -train_scores.mean(axis=1),
         'Validation': -test_scores.mean(axis=1)}
    ).set_index('lambda')

    results_lasso_df = pd.concat([results_lasso_df, mse_score], axis=1)

    # Lambda with the smallest validation MSE: print it and keep it for the marker line
    lambda_smallest = mse_score['Validation'].nsmallest(1)
    print(lambda_smallest)
    lam = lambda_smallest.index[0]

    # One figure per feature set, with a dashed line at the best lambda
    mse_score.plot(logx=True, figsize=(8, 6))
    plt.axvline(x=lam, color='black', linestyle='--')

    if i == 0:
        plt.ylabel("Mean squared error")  # y-label only on the first figure

    if i == last_panel:
        plt.legend(fontsize=15)           # legend only on the last figure
    else:
        plt.gca().get_legend().remove()

    plt.savefig(fp_data / f'{Title[i]}.png')

In [166]:
# Validation curve over the polynomial degree - Lasso (numerical variables)
train_scores, test_scores = validation_curve(
    estimator=pipe_num_las_best,
    X=X_dev_num,
    y=y_dev,
    param_name='polynomialfeatures__degree',  # pipeline parameter that is varied
    param_range=degrees,                      # values to consider
    scoring='neg_mean_squared_error',
    cv=3,
)

# Mean train/validation MSE per degree; print the degree with the lowest validation MSE
mse_score_num_las = pd.DataFrame(
    {'Train': -train_scores.mean(axis=1),
     'Validation': -test_scores.mean(axis=1),
     'poly': degrees}
).set_index('poly')
print(mse_score_num_las['Validation'].nsmallest(1))

mse_score_num_las

In [167]:
# Validation curve over the polynomial degree - elastic net (numerical variables)
train_scores, test_scores = validation_curve(
    estimator=pipe_num_elas_best,
    X=X_dev_num,
    y=y_dev,
    param_name='polynomialfeatures__degree',  # pipeline parameter that is varied
    param_range=degrees,                      # values to consider
    scoring='neg_mean_squared_error',
    cv=3,
)

# Mean train/validation MSE per degree; print the degree with the lowest validation MSE
mse_score_num_elas = pd.DataFrame(
    {'Train': -train_scores.mean(axis=1),
     'Validation': -test_scores.mean(axis=1),
     'poly': degrees}
).set_index('poly')
print(mse_score_num_elas['Validation'].nsmallest(1))

mse_score_num_elas

In [168]:
# Validation curve over the polynomial degree - OLS (numerical variables)
train_scores, test_scores = validation_curve(
    estimator=pipe_num_ols_best,
    X=X_dev_num,
    y=y_dev,
    param_name='polynomialfeatures__degree',  # pipeline parameter that is varied
    param_range=degrees,                      # values to consider
    scoring='neg_mean_squared_error',
    cv=3,
)

# Mean train/validation MSE per degree; print the degree with the lowest validation MSE
mse_score_num_ols = pd.DataFrame(
    {'Train': -train_scores.mean(axis=1),
     'Validation': -test_scores.mean(axis=1),
     'poly': degrees}
).set_index('poly')
print(mse_score_num_ols['Validation'].nsmallest(1))

mse_score_num_ols