# Import Statements



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics

# Data Preprocessing

In [2]:
# data loading: mount Google Drive (Colab) and read the weather CSV stored on it
from google.colab import drive

drive.mount('/content/drive')
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/nevada_weather.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# the "Year" column is not used as a feature, so it is removed
df = df.drop(columns=["Year"])

In [4]:
# mapping boolean values to integers (False/True -> 0/1) so the day/night flag is numeric
# NOTE(review): replace() is applied to the whole frame rather than to one column, so
# True/False values in any column are rewritten — presumably only `is_day` holds booleans; confirm
df.replace(to_replace=True, value=1, inplace=True)
df.replace(to_replace=False, value=0, inplace=True)

In [5]:
# dropping columns that are not used as features
# (refer to the preprocessing notebook for details)
df = df.drop(
    columns=[
        "DHI",
        "Clearsky DHI",
        "Clearsky DNI",
        "Clearsky GHI",
        "DNI",
        "Fill Flag",
        "Dew Point",
    ]
)

# Model Splitting

In [6]:
# Mapping: features (current time) -> GHI value 288 records later
# (48 hours into the future, per the original note — assumes a fixed sampling interval; TODO confirm)
total_records = len(df)

# feature rows: everything except the last 288 records, re-indexed from 0
features = df.iloc[:total_records - 288].drop(columns='GHI').reset_index(drop=True)

# target values: GHI starting 288 records in, re-indexed from 0 so row i lines up with feature row i
output = df['GHI'].iloc[288:].reset_index(drop=True)

# column-wise concat on the shared 0..n-1 index pairs each feature row with its future GHI
df = pd.concat([features, output], axis=1, join='inner')

In [7]:
# separate daytime rows (is_day == 1) from night-time rows (is_day == 0);
# the flag is constant inside each subset, so the column is dropped from both
df_day = df.loc[df['is_day'] == 1].drop(columns='is_day')
df_night = df.loc[df['is_day'] == 0].drop(columns='is_day')

In [8]:
def feature_output_split(x):
  '''
  split the data into features (every column except 'GHI') and output ('GHI')

  x: initial dataset (DataFrame containing a 'GHI' column)
  returns: two datasets, (features, output) — features is a new DataFrame
           without the 'GHI' column, output is the 'GHI' column of x
  raises: KeyError if x has no 'GHI' column
  '''

  # drop() returns a new frame, so the caller's dataset is left unmodified
  features = x.drop(columns='GHI')
  output = x['GHI']

  return features, output

In [9]:
# feature-output split applied in one pass to the whole data (df),
# the day data (df_day) and the night data (df_night), in that order
(
    (feature_df, GHI),
    (day_feature_df, day_GHI),
    (night_feature_df, night_GHI),
) = map(feature_output_split, (df, df_day, df_night))

In [10]:
# 80/20 train-test split with a fixed seed (random_state=42) for
# the whole data (df), the day data (df_day) and the night data (df_night)
x_train, x_test, y_train, y_test = train_test_split(
    feature_df, GHI, test_size=0.2, random_state=42
)
day_x_train, day_x_test, day_y_train, day_y_test = train_test_split(
    day_feature_df, day_GHI, test_size=0.2, random_state=42
)
night_x_train, night_x_test, night_y_train, night_y_test = train_test_split(
    night_feature_df, night_GHI, test_size=0.2, random_state=42
)

# Polynomial Regression

In [16]:
def combine_result(y1, y1_pred, y2, y2_pred):
  '''
  RMSE computed over two sets of results pooled together

  y1, y1_pred: true values and predictions of the first set
  y2, y2_pred: true values and predictions of the second set
  returns: root mean squared error over the concatenation of both sets
  '''
  actual = np.concatenate([y1, y2])
  predicted = np.concatenate([y1_pred, y2_pred])
  pooled_mse = metrics.mean_squared_error(actual, predicted)
  return pooled_mse ** 0.5

In [11]:
def apply_polynomial_regression(x_train, y_train, x_test, y_test, n, filename):
  '''
  applies polynomial regression of degree n on the train data (x_train, y_train),
  and uses the fitted model to print the train and test RMSE

  it also saves the fitted LinearRegression in a pickle file
  ("../Weights/" + filename); the polynomial expansion itself is not saved

  x_train, y_train: training features and target values
  x_test, y_test: held-out features and target values
  n: degree of the polynomial expansion
  filename: pickle file path, relative to "../Weights/"
  returns: None (the RMSE values are printed)
  '''

  # polynomial expansion: the transformer is fitted on the train data only and
  # then reused as-is on the test data, so the test set never influences it
  poly_reg = PolynomialFeatures(degree= n)
  poly_x_train = poly_reg.fit_transform(x_train)
  poly_x_test = poly_reg.transform(x_test)

  # training
  linear = LinearRegression()
  linear.fit(poly_x_train, y_train)
  # save the model to disk; the context manager closes the file handle
  with open("../Weights/" + filename, 'wb') as model_file:
    pickle.dump(linear, model_file)

  print("n= ", n)
  y_train_pred = linear.predict(poly_x_train)
  train_rmse = metrics.mean_squared_error(y_train, y_train_pred) ** 0.5
  print("Train RMSE: ", train_rmse)
  y_test_pred = linear.predict(poly_x_test)
  test_rmse = metrics.mean_squared_error(y_test, y_test_pred) ** 0.5
  print("Test RMSE: ", test_rmse)

In [12]:
# whole data: fit and evaluate polynomial degrees 2, 3 and 4
for n in (2, 3, 4):
  apply_polynomial_regression(
      x_train, y_train, x_test, y_test, n, f'Polynomial/whole_{n}.pkl'
  )

n=  2
Train RMSE:  78.94815875127165
Test RMSE:  79.76316347309206
n=  3
Train RMSE:  69.4886579999666
Test RMSE:  70.52791023793802
n=  4
Train RMSE:  51.97700864293026
Test RMSE:  55.991477662151105


In [13]:
# day-time data: fit and evaluate polynomial degrees 2, 3 and 4
for n in (2, 3, 4):
  apply_polynomial_regression(
      day_x_train, day_y_train, day_x_test, day_y_test, n, f'Polynomial/day_{n}.pkl'
  )

n=  2
Train RMSE:  105.7743335551083
Test RMSE:  107.38763474081033
n=  3
Train RMSE:  88.52632167289764
Test RMSE:  91.64781683677313
n=  4
Train RMSE:  63.07592683607227
Test RMSE:  77.7846995811444


In [14]:
# night-time data: fit and evaluate polynomial degrees 2, 3 and 4
for n in (2, 3, 4):
  apply_polynomial_regression(
      night_x_train, night_y_train, night_x_test, night_y_test, n, f'Polynomial/night_{n}.pkl'
  )

n=  2
Train RMSE:  5.629558596699495
Test RMSE:  5.547973314118362
n=  3
Train RMSE:  4.524679098715154
Test RMSE:  4.682002823672813
n=  4
Train RMSE:  3.01223034291004
Test RMSE:  3.7496581410984495


In [17]:
# combining day and night time predictions:
# one degree-4 model per subset, evaluated jointly with a single pooled RMSE

# day data: polynomial expansion
# (the expander is fitted on the train split only and reused on the test split)
day_poly_reg = PolynomialFeatures(degree= 4)
poly_day_x_train = day_poly_reg.fit_transform(day_x_train)
poly_day_x_test = day_poly_reg.transform(day_x_test)

# night data: polynomial expansion
# (same rule: fit on train, only transform on test)
night_poly_reg = PolynomialFeatures(degree= 4)
poly_night_x_train = night_poly_reg.fit_transform(night_x_train)
poly_night_x_test = night_poly_reg.transform(night_x_test)

# training the models
day_linear = LinearRegression()
day_linear.fit(poly_day_x_train, day_y_train)
night_linear = LinearRegression()
night_linear.fit(poly_night_x_train, night_y_train)

# train RMSE over the pooled day + night training predictions
day_y_train_pred = day_linear.predict(poly_day_x_train)
night_y_train_pred = night_linear.predict(poly_night_x_train)
train_rmse = combine_result(day_y_train, day_y_train_pred, night_y_train, night_y_train_pred)
print("Train RMSE: ", train_rmse)

# test RMSE over the pooled day + night test predictions
day_y_test_pred = day_linear.predict(poly_day_x_test)
night_y_test_pred = night_linear.predict(poly_night_x_test)
test_rmse = combine_result(day_y_test, day_y_test_pred, night_y_test, night_y_test_pred)
print("Test RMSE: ", test_rmse)

Train RMSE:  45.107521105826294
Test RMSE:  55.626674792837434
