# Phase 5: Training the model
## Overview
In this section I will training the regression model. I will look at the RMSE for each degree of polynomial from 1-5 then use the degree with the lowest RMSE.

## Importing needed files and libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
import joblib

training_df = pd.read_csv('../processed_data/dataset_splits/training_set.csv')
validation_df = pd.read_csv('../processed_data/dataset_splits/validation_set.csv')
display(training_df.head())

Unnamed: 0,system_id,latitude,longitude,elevation_m,abs(azimuth - 180),tilt,performance_ratio,solar irradiance,temperature,cloud cover,wind speed,humidity,precipitation
0,10000,44.914573,-93.162525,288.79187,0.0,33.0,0.160721,3.743916,5.400575,63.48326,3.976438,3.027014,83.503699
1,10001,39.483937,-76.301594,51.222542,0.0,20.0,0.274085,4.093048,12.999041,63.736466,3.496603,2.970356,77.229479
2,10005,33.198982,-97.150581,207.639435,0.0,19.0,0.219475,4.70751,18.458411,50.966411,4.469096,2.365068,71.625863
3,10010,45.136089,-88.010933,202.512192,0.0,35.0,0.315326,3.561523,5.190986,66.488795,4.549726,3.327918,85.779178
4,10013,37.231378,-121.874925,66.506325,0.0,26.0,0.312527,5.223814,14.121178,35.566384,2.675452,2.020767,74.728548


## Running the model
Now we're going to run the model 5 times for each polynomial and calculate the RMSE.

In [2]:
# Sets the columns for the features and label for training
training_input = training_df.drop(columns=['performance_ratio'])
training_output = training_df['performance_ratio']

# Sets the columns for the features and label for validation
validation_input = validation_df.drop(columns=['performance_ratio'])
validation_output = validation_df['performance_ratio']

# The pipeline that will run the model
poly_model = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('linear', LinearRegression())
])


for i in range(1,6):
    # Sets the degree and fits the model with the training data
    poly_model.set_params(poly__degree=i)
    poly_model.fit(training_input, training_output)
    
    # Predicts the model output for the validation data
    predicted_validation_output = poly_model.predict(validation_input)
    
    rmse = np.sqrt(mean_squared_error(predicted_validation_output, validation_output))
    print(rmse)

0.058887900752506
0.06669721553355948
1.3838700174013474
2.320668098619966
1.1538859656163607


## Saving the model
We can see that a degree 1 model has the lowest RMSE therefore we will be using linear regression.

In [8]:
# Combines the validation and training data
combined_input = pd.concat([training_input, validation_input], axis=0, ignore_index=True)
combined_output = pd.concat([training_output, validation_output], axis=0, ignore_index=True)

# Sets the final model parameters and fits it to the combined data
poly_model.set_params(poly__degree=1)
poly_model.fit(combined_input,combined_output)

# Saves the model to be loaded later
model_file = '../models/old_model.joblib'
joblib.dump(poly_model, model_file)
print("Model saved")

Model saved
