In [178]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import pathlib as pl
import os 
import pandas as pd

# Metrics for Model Selection

In this notebook you will fit polynomials to data to decide which order of polynomial is the best fit. Unlike before, the data you will be using is 3 dimensional, meaning it isn't possible to plot. Instead, you will write functions to calculate various metrics that are used to determine model fit. 

Complete this notebook, then answer the questions that go alongside it. 

In [179]:
# Fix NumPy's global random state so that repeated runs draw the same random numbers
seed = 2022
np.random.seed(seed=seed)

## Load the data 

In [180]:
# Load the dataset from the current working directory.
# `pl` and `pd` are already imported in the first cell, so nothing is re-imported here.
csv_path = pl.Path.cwd() / 'M6_Performance_Metrics_Data.csv'

# read_csv accepts a path directly and opens/closes the file itself
data = pd.read_csv(csv_path)

In [181]:
# Preview the first five rows to check which columns were loaded
print(data.head(n=5))

   Unnamed: 0        x1        x2        x3          y
0           0  0.382303 -1.596593  1.233776   4.935364
1           1  1.902436  1.579109 -0.341741  25.138660
2           2 -1.689244  1.298489 -1.472081  -4.786340
3           3 -1.510509  1.937616 -1.600244  -3.185759
4           4  1.621717  0.515558 -1.869644  19.712731


## Section 1 : Split the data into training, validation and test sets

### TO DO: write a function that splits the data into training, validation and test sets.

The function should take as inputs the dataframe and the percentage splits for each of training, validation and test. It should output 3 dataframes, one for each of the sets. 

In [182]:


##### ANSWER ######

def split_data(df, data_split, random_state=None):
    """function to divide a dataframe into training, validation and test dataframes

    Rows are drawn without replacement: the training rows are sampled first, the
    validation rows are sampled from what is left, and the test set is every row
    that was not picked for either.

    :param df: the full dataframe which is to be divided. Rows are removed by
        index label, so the index labels are assumed to be unique.
    :param data_split: a list containing the fraction of the full dataframe for each
        of training, validation and test, in that order. Only the first two
        fractions are read; the test set is the remainder.
    :param random_state: optional integer seed or numpy RandomState forwarded to
        ``DataFrame.sample``. With the default ``None`` pandas uses numpy's global
        random state, so ``np.random.seed`` controls the split.
    :return training, validation, test: dataframes for each of the sets
    :raises ValueError: if the training or validation fraction is negative, or if
        together they exceed 1"""

    train_frac, validation_frac = data_split[0], data_split[1]
    if train_frac < 0 or validation_frac < 0 or train_frac + validation_frac > 1 + 1e-9:
        raise ValueError(
            "training and validation fractions must be non-negative and sum to at most 1, "
            f"got {train_frac} and {validation_frac}"
        )

    # Turn an integer seed into one generator shared by both draws, so the second
    # sample continues the same random stream instead of restarting it.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    training = df.sample(frac=min(train_frac, 1.0), random_state=random_state)
    remaining = df.drop(training.index)

    if len(remaining) == 0:
        # Nothing left to sample from; avoids dividing by zero below.
        validation = remaining
    else:
        # validation_frac is relative to the full frame, so rescale it to the rows
        # that are left. Clip at 1.0 because floating-point rounding can push the
        # ratio marginally above 1, which pandas rejects without replacement.
        rescaled_frac = validation_frac * len(df) / len(remaining)
        validation = remaining.sample(frac=min(rescaled_frac, 1.0), random_state=random_state)

    test = remaining.drop(validation.index)

    return training, validation, test

### TO DO: Use your function to split the data so the training set has 40% of the data and the validation and test sets have 30% of the data each

In [183]:
#### write your code here ####
# Fractions for training, validation and test, in that order (40% / 30% / 30%)
split_percentage = [0.4, 0.3, 0.3]
training_df, validation_df, test_df = split_data(df=data, data_split=split_percentage)
training_df.head(n=4)

Unnamed: 0.1,Unnamed: 0,x1,x2,x3,y
79,79,-1.386593,1.158374,1.994851,12.563918
76,76,1.790229,0.389979,1.15769,9.563458
83,83,-1.893098,0.440954,-0.797749,-2.209804
5,5,-1.928868,-1.475115,-0.677217,24.007974


## Section 2: Write Metrics Functions 

### TO DO: Write the functions that calculate the metrics you will use to evaluate the model fits

Write Functions that return:
- The mean absolute error
- The average error
- The mean absolute percentage error 
- The root mean squared error 
- The total sum of squared errors 

In [184]:
import numpy as np

def get_metrics(predicted_y, y):
    """Compute the error metrics used to compare model fits.

    The error is taken as ``predicted_y - y``, so a positive average error means
    the model over-predicts on average.

    :param predicted_y: array-like of model predictions (any shape; it is flattened)
    :param y: array-like of observed target values (any shape; it is flattened)
    :return: tuple ``(mean absolute error, average error, mean absolute percentage
        error in percent, root mean squared error, total sum of squared errors)``
    """
    # Accept lists, Series or arrays and flatten them so (N, 1) and (N,) inputs
    # are compared element by element rather than broadcast against each other.
    predicted_y = np.asarray(predicted_y, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()

    residuals = predicted_y - y
    squared_residuals = residuals ** 2

    # Mean Absolute Error (MAE)
    Mean_Absolute_Error = np.mean(np.abs(residuals))

    # Average Error (signed, so over- and under-predictions cancel)
    Average_Error = np.mean(residuals)

    # Mean Absolute Percentage Error (MAPE)
    # Floor the magnitude of the denominator instead of adding epsilon to y:
    # a signed shift is exactly zero at y == -epsilon and shrinks negative targets.
    epsilon = 1e-10  # Small value to prevent division by zero
    safe_denominator = np.maximum(np.abs(y), epsilon)
    Mean_Absolute_Percentage_Error = np.mean(np.abs(residuals) / safe_denominator) * 100

    # Root Mean Squared Error (RMSE)
    Root_Mean_Squared_Error = np.sqrt(np.mean(squared_residuals))

    # Total Sum of Squared Errors (SSE)
    Total_Sum_of_Squared_Errors = np.sum(squared_residuals)

    # Return all metrics as a tuple
    return (Mean_Absolute_Error, Average_Error, Mean_Absolute_Percentage_Error,
            Root_Mean_Squared_Error, Total_Sum_of_Squared_Errors)


## Section 3: Fit models to training data and calculate performance metric on validation sets

For polynomials of order 1 to 6, you will use fit_model to fit each model. This function uses scikit-learn polynomial regression. 


### TODO: write function to convert dataframe into numpy arrays

The scikit-learn functions take numpy arrays as their inputs. Therefore before you can fit any data you need to write a function to turn a dataframe with columns [x1, x2, x3, y] into two numpy arrays: X and y. X should have dimensions (N, D), where N is the number of data points and D is the dimensionality of the data (in this case 3). y should have dimensions (N, ). 


In [185]:
# fit_model builds a scikit-learn Pipeline (polynomial feature expansion followed by
# linear regression), fits it to X and y for the given polynomial order, and returns
# the fitted pipeline. The returned object holds the learned coefficients and can
# produce predicted y values for new inputs via model.predict(X).


def fit_model(X, y, order):
    """Build a polynomial-regression pipeline of the given order and fit it to X and y.

    :param X: input features passed to the pipeline's ``fit``
    :param y: target values passed to the pipeline's ``fit``
    :param order: degree handed to ``PolynomialFeatures``
    :return: the fitted scikit-learn ``Pipeline``
    """
    steps = [
        ('poly', PolynomialFeatures(degree=order)),
        # fit_intercept=False as in the original pipeline — presumably because the
        # polynomial expansion already supplies a constant column; confirm if changed
        ('linear', LinearRegression(fit_intercept=False)),
    ]
    return Pipeline(steps).fit(X, y)

#orders = range(1,7)
#predicted_y = {}

#for i in orders:
#    model = fit_model(data["X"], data["y"], i )
 #   predicted_y[f"Order {i}"] = model.predict(data["X"])



In [186]:
### write your function here ## 

def df_to_numpy(df):
    """Split a dataframe into a feature array and a target array.

    :param df: dataframe containing the columns ``x1``, ``x2``, ``x3`` and ``y``
    :return X, Y: ``X`` holds the ``x1``, ``x2``, ``x3`` columns with shape (N, 3);
        ``Y`` holds the ``y`` column with shape (N,)
    """
    feature_columns = ['x1', 'x2', 'x3']
    target_column = 'y'
    return df[feature_columns].values, df[target_column].values


### TO DO: For polynomials of order 1 to 6 inclusive: 
1. Fit a polynomial to the training data using the fit_model function 
2. Use model.predict(X) to get the model predictions on the validation set
3. Store the model in a dictionary of models where the keys indicate the order and the items are the models
4. Store the predictions in a separate dictionary where the keys indicate the order and the items are numpy arrays of the predictions 

In [187]:
## write your code here ##
# The training split is used for fitting; the validation split is only predicted on
X, Y = df_to_numpy(training_df)
X_validation, Y_validation = df_to_numpy(validation_df)

orders = range(1, 7)  # polynomial orders 1 to 6 inclusive
models, predictions_validation = {}, {}

for i in orders:
    # models is keyed by the integer order, predictions by the label "Order <i>"
    model = models[i] = fit_model(X, Y, i)
    predictions_validation[f"Order {i}"] = model.predict(X_validation)
    
#models, predictions_validation

## Section 4: Calculate metrics for each of the models

Now we want to calculate the metrics for each of the models. 


### TODO: Use the dictionary of predictions you have to calculate and record (could be in a dataframe, or you could plot it on a graph) each of the metrics. 
1. Calculate each of the metrics for the model using the functions you wrote before
2. Store the metrics in a dataframe, with one row for each model or plot on a graph
3. Answer the questions that go alongside this notebook 

HINT: you can write a list of functions of the form:

methods = [RMSE, average_error, mean_abs_percent_error, total_sum_squared_error]

which you can then iterate over using a for loop. 



In [188]:
## write your code here ##
# Column labels, in the same order as the tuple returned by get_metrics
method_names = ['Mean_Absolute_Error', 'Average Error', 'MAPE', 'Root_Mean_Squared_Error',"Total_Sum_of_Squared_Errors"]

# Collect one record per model and build the table in a single call. Growing an
# empty DataFrame row by row with .loc is slow and leaves the column dtypes to
# pandas' row-assignment coercion; building it once lets each column be inferred.
validation_records = []
for order, prediction_y in predictions_validation.items():
    all_errors = get_metrics(prediction_y, Y_validation)
    validation_records.append([order, *all_errors])

metrics_validation = pd.DataFrame(validation_records, columns=["Degree"] + method_names)

metrics_validation

Unnamed: 0,Degree,Mean_Absolute_Error,Average Error,MAPE,Root_Mean_Squared_Error,Total_Sum_of_Squared_Errors
0,Order 1,5.101362,-1.203952,77.700468,7.683525,1771.096614
1,Order 2,2.62488,0.389437,56.927005,3.281847,323.115638
2,Order 3,4.805084,0.765551,86.271045,6.404143,1230.391453
3,Order 4,1.05013,-0.327957,15.689244,1.77972,95.022071
4,Order 5,1.799387,-0.123646,25.505965,2.955975,262.133689
5,Order 6,3.38555,1.603101,33.060392,8.298479,2065.942623


## Section 5: Use the test set to evaluate the performance of your chosen model

### TODO: For your selected model, calculate the RMSE, Average Error and Mean Absolute Percentage Error of the test data

In [189]:
## write your code here ## 
# The order-4 model is chosen as the best fit: it has the lowest MAE, MAPE, RMSE and SSE on the validation set.

In [190]:
# Convert test DataFrame to NumPy arrays for X and y
X_Test, Y_Test = df_to_numpy(test_df)

degree = 4
model = models[degree]  # models is keyed by the integer polynomial order

y_prediction_test = model.predict(X_Test)
error_results_test = get_metrics(y_prediction_test, Y_Test)

# Build the one-row table in a single call. Writing the row into an empty frame
# with .loc coerced the integer degree to a float (it was displayed as 4.0).
metrics_df_test = pd.DataFrame(
    [[degree, *error_results_test]],
    columns=["Degree"] + method_names,
)

# Display the DataFrame with metrics for test data

metrics_df_test

Unnamed: 0,Degree,Mean_Absolute_Error,Average Error,MAPE,Root_Mean_Squared_Error,Total_Sum_of_Squared_Errors
0,4.0,1.375387,0.039151,39.253005,2.447385,179.690746


# Summary statistics of each split, printed one after another for comparison
print(
    training_df.describe(),
    validation_df.describe(),
    test_df.describe(),
    sep="\n",
)
![image.png](attachment:image.png)