# Phase 7: Model refinement
## Overview
In this final section I will refining my model.

In [15]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
import joblib
from sklearn.metrics import r2_score

system_df = pd.read_csv('../processed_data/useable_systems_metadata.csv')
display(system_df.head())

Unnamed: 0,system_id,latitude,longitude,elevation_m,azimuth,tilt,dc_capacity_kW,mean_power_generation_kW,performance_ratio
0,10000,44.914573,-93.162525,288.79187,180.0,33.0,5.85,0.940219,0.160721
1,10001,39.483937,-76.301594,51.222542,180.0,20.0,3.36,0.920926,0.274085
2,10003,40.346434,-76.423645,145.150772,180.0,35.0,11.04,3.251637,0.294532
3,10005,33.198982,-97.150581,207.639435,180.0,19.0,15.665,3.438072,0.219475
4,10010,45.136089,-88.010933,202.512192,180.0,35.0,12.192,3.844456,0.315326


# Gathering data
My main change for my model will be the data I put into it. Im going to structure the data on a day by day basis so each day will have all of the information about it for my model to run. Im only going to do 10 days of data per system as doing any more would take a long time to run and should be more than enough for the amount of data I have to not be a limiting factor

In [85]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Creates the df that will hold the dataa for each day
columns = ['system_id', 'latitude', 'longitude', 'elevation_m', 'abs(azimuth - 180)', 
           'tilt', 'performance_ratio', 'solar irradiance', 'temperature', 'cloud cover', 'wind speed', 'precipitation', 'humidity']
day_df = pd.DataFrame(columns=columns)

for index, row in system_df.iterrows():
    for i in range(10):
        # Gathers the system metadata the model needs
        sys_id = str(int(row['system_id']))
        lat = row['latitude']
        lon = row['longitude']
        elev = row['elevation_m']
        azi = abs(row['azimuth'] - 180)
        tilt = row['tilt']
        capacity = row['dc_capacity_kW']

        # Reads the solar generation data
        system_generation_df = pd.read_csv('../raw_data/system_generation_data/' + sys_id + '/1.csv')
        # Gets the date and year we're currenly looking at
        date = system_generation_df.iloc[i,0]
        year = date[0:4]
        # Calculates the performance ratio for that day
        performance_ratio = system_generation_df.iloc[i,2] / capacity

        # Gets the row with the weather data we need for that day
        system_generation_df = pd.read_csv('../raw_data/system_weather_data/' + sys_id + '/' + year + '.csv')
        row2 = system_generation_df[(system_generation_df["MO"] == int(date[5:7])) & (system_generation_df["DY"] == int(date[8:10]))]

        # Gathers the weather data for that day
        irrad = float(row2['ALLSKY_SFC_SW_DWN'])
        temp = float(row2['T2M'])
        cloud = float(row2['CLOUD_AMT'])
        wind = float(row2['WS10M'])
        prec = float(row2['PRECTOTCORR'])
        hum = float(row2['RH2M'])

        # Stores the data for that day
        day_df.loc[len(day_df)] = [sys_id, lat, lon, elev, azi, tilt, performance_ratio, irrad, temp, cloud, wind, prec, hum]

    if index % 100 == 0:
        print(index)

print(len(day_df))
display(day_df.head())

0
100
200
300
400
500
600
700
800
8920


Unnamed: 0,system_id,latitude,longitude,elevation_m,abs(azimuth - 180),tilt,performance_ratio,solar irradiance,temperature,cloud cover,wind speed,precipitation,humidity
0,10000,44.914573,-93.162525,288.79187,0.0,33.0,0.01528,0.9434,-6.19,67.36,3.41,0.0,86.18
1,10000,44.914573,-93.162525,288.79187,0.0,33.0,0.034736,1.117,-2.58,69.81,4.19,0.01,86.04
2,10000,44.914573,-93.162525,288.79187,0.0,33.0,0.0274,0.7454,-4.42,96.96,4.25,0.0,77.87
3,10000,44.914573,-93.162525,288.79187,0.0,33.0,0.030798,0.6259,-7.22,90.04,2.17,0.0,75.79
4,10000,44.914573,-93.162525,288.79187,0.0,33.0,0.070784,1.0205,-3.38,98.35,3.35,0.71,88.38


## Splitting the data
Now we split the data into test validation and training

In [86]:
# Splits the data into test validation and training
df_test = day_df.sample(frac=0.2, random_state=42).drop(columns=['system_id'])
df_remaining = day_df.drop(df_test.index).drop(columns=['system_id'])
df_validation = df_remaining.sample(frac=0.25, random_state=42)
df_train = df_remaining.drop(df_validation.index)

## Running the models on the validation data
Next im going to run the models on the validation data to find the best degree to use

In [87]:
# Sets the training and validation inputs and outputs
training_input = df_train.drop(columns=['performance_ratio'])
training_output = df_train['performance_ratio']

validation_input = df_validation.drop(columns=['performance_ratio'])
validation_output = df_validation['performance_ratio']

# The pipeline that will run the model
poly_model = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('linear', LinearRegression())
])

# Runs the model on the validation data from degrees 1-4 and outputs the rmse of each
for i in range(1, 5):
    poly_model.set_params(poly__degree=i)
    poly_model.fit(training_input, training_output)
    
    # Predicts the model output for the validation data
    predicted_validation_output = poly_model.predict(validation_input)

    rmse = np.sqrt(mean_squared_error(predicted_validation_output, validation_output))
    print(rmse)

0.09653880607466156
0.09363153340203928
0.09266162521241024
0.11361464084570838


## Testing the model
Now I'm going to run the model on the test data to find the new models r2.

In [88]:
# Combines the training and validation data
combined_input = pd.concat([training_input, validation_input], ignore_index=True)
combined_output = pd.concat([training_output, validation_output], ignore_index=True)

test_input = df_test.drop(columns=['performance_ratio'])
test_output = df_test['performance_ratio']

# Sets the model with degree 3 and fits it
poly_model.set_params(poly__degree=3)
poly_model.fit(combined_input, combined_output)

# Runs the model and finds r2
predicted_test_output = poly_model.predict(test_input)
r2 = r2_score(test_output, predicted_test_output)
print("The r^2 of the test: " + str(r2))

The r^2 of the test: 0.6151395528860413


## Conclusion
We've much improved the model going to 0.62. I've worked with quite alot of data for the final model so its likely that the amount of data is no longer the limiting factor in this models performance. Instead it may be that the model isn't complex enough or is missing some key variables in predicting solar production.