<a href="https://colab.research.google.com/github/AubreyFeldker/CS4375TeamProject/blob/main/Copy_of_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# Preprocessing
import time
import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

In [2]:
# Read in csv file
df = pd.read_csv("https://raw.githubusercontent.com/AubreyFeldker/CS4375TeamProject/main/Dallas-Fort%20Worth%20Metropolitan%20Area%20Weather%20Events%202016-2021.csv")

# Removing Time and Date Columns since Timestamp covers both
df = df.drop(['Time(UTC)', 'Date'], axis = 1)


In [3]:
# Using One-Hot Encoding for the precipitation type and severity
# converts categorical data into numerical data
oneHot = OneHotEncoder()

# Using two different arrays to avoid size mismatch
# Perform encoding
arry = oneHot.fit_transform(df[['Type']]).toarray()
feature_labels = oneHot.categories_

arry2= oneHot.fit_transform(df[['Severity']]).toarray()
feature_labels2 = oneHot.categories_

# Flatten out the arrays
feature_labels = np.array(feature_labels).ravel()
feature_labels2 = np.array(feature_labels2).ravel()

# Create new dataframes with the encoded arrays
df1 = pd.DataFrame(arry, columns = feature_labels)
df2 = pd.DataFrame(arry2, columns = feature_labels2)


# Drop type and severity columns to add in new encoded columns
df = df.drop(['Type','Severity'], axis = 1)

# Concat all columns from df, df1, and df2
df = pd.concat([df1, df2, df], axis=1)


# Renameing Precipitation column to avoid confusion
df.rename(columns = {"Precipitation":"Other"}, inplace = True)


In [4]:
# Standardize Precipitation
scaler = StandardScaler()
df[['Precipitation(in)']] = scaler.fit_transform(df[['Precipitation(in)']])


In [5]:
# Convert Python DateTime to Unix Timestamp
df['Timestamp'] = df['Timestamp'].apply(lambda x: datetime.datetime.strptime(str(x),'%Y%m%d%H%M').timestamp())


In [6]:
df.head()

Unnamed: 0,Cold,Fog,Hail,Other,Rain,Snow,Storm,Heavy,Light,Moderate,Other.1,Severe,UNK,Precipitation(in),Latitude,Longitude,Timestamp
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.260101,32.968606,-96.830041,1552286000.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.260101,32.968606,-96.830041,1452135000.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.260101,32.968606,-96.830041,1452974000.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.260101,32.968606,-96.830041,1453388000.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.260101,32.968606,-96.830041,1453396000.0


In [7]:
df = df.sort_values(by = 'Timestamp')
df = df.reset_index()

X = df.drop('Precipitation(in)', axis=1)
Y = df['Precipitation(in)']
split_ratio = 0.8
split_index = int(len(df) * split_ratio)
X_train = X[:split_index]
Y_train = Y[:split_index]
X_test = X[split_index:]
Y_test = Y[split_index:]

In [8]:
X_train

Unnamed: 0,index,Cold,Fog,Hail,Other,Rain,Snow,Storm,Heavy,Light,Moderate,Other.1,Severe,UNK,Latitude,Longitude,Timestamp
0,29837,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,32.441944,-97.781389,1.451636e+09
1,40176,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,32.774237,-96.609069,1.451847e+09
2,40177,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,32.774237,-96.609069,1.451937e+09
3,1079,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,32.968606,-96.830041,1.452009e+09
4,32346,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,32.745496,-97.003529,1.452016e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45785,19101,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,33.216389,-97.129167,1.599703e+09
45786,57018,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,32.737500,-96.282500,1.599704e+09
45787,49796,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,32.931051,-96.458650,1.599704e+09
45788,39207,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,33.197963,-96.615024,1.599705e+09


In [9]:
print(Y_train.shape)

(45790,)


In [10]:
class RNN:
  def __init__(self, input_size, hidden_size, output_size, update_weight=.05):
      self.input_size = input_size
      self.hidden_size = hidden_size
      self.output_size = output_size
      self.update_weight = update_weight

      self.Weight_metrix_input_multi_hidden = np.random.randn(input_size, hidden_size) * update_weight
      self.Weight_metrix_hidden_multi_hidden = np.random.randn(hidden_size, hidden_size) * update_weight
      self.Weight_metrix_output_multi_hidden = np.random.randn(output_size, hidden_size) * update_weight
      
      self.bias_to_hidden = np.zeros((1, hidden_size))
      self.bias_to_output = np.zeros((1, output_size))

  def tanh(self, x):
      return np.tanh(x)
  def tanh_derivative(self, x):
      return 1 - self.tanh(x) ** 2


  def forward_propagation(self, input_sequence):
    hidden_states = {}
    outputs = {}
    for i in range(len(input_sequence)):
      current_time_step_for_input_sequence = input_sequence.iloc[i].to_numpy().reshape(1, -1)

      if i == 0:
        hidden = self.tanh(current_time_step_for_input_sequence.dot(self.Weight_metrix_input_multi_hidden) + self.bias_to_hidden)
      else:
        hidden = self.tanh(current_time_step_for_input_sequence.dot(self.Weight_metrix_input_multi_hidden) + hidden_states[i-1].dot(self.Weight_metrix_hidden_multi_hidden) + self.bias_to_hidden)
      hidden_states[i] = hidden
      output = self.tanh(hidden.dot(self.Weight_metrix_output_multi_hidden.T) + self.bias_to_output)
      outputs[i] = output

      
    return {"outputs": outputs, "hidden_states": hidden_states}


  def current_time_step_hidden_state_error_method(self, Weight_matrix_output_hidden, Weight_matrix_hidden_hidden, current_time_step_output_error, next_time_step_hidden_state_error, cache_hidden_states_i):
    error_from_output_layer = np.dot(Weight_matrix_output_hidden.T, current_time_step_output_error)
    
    if next_time_step_hidden_state_error.shape == (1, Weight_matrix_hidden_hidden.shape[0]):
        next_time_step_hidden_state_error = next_time_step_hidden_state_error.T

    error_from_next_time_step_hidden_state = np.dot(Weight_matrix_hidden_hidden.T, next_time_step_hidden_state_error.reshape(-1, 1))

    combined_error = error_from_output_layer.T + error_from_next_time_step_hidden_state.T
    print(error_from_output_layer.shape)
    print(error_from_next_time_step_hidden_state.shape)
    hidden_state_derivative = self.tanh_derivative(cache_hidden_states_i)

    current_hidden_state_error = combined_error * hidden_state_derivative

    return current_hidden_state_error


  def backward_propagation(self, input_sequence, target_sequence, cache):
    gradient = [np.zeros_like(self.Weight_metrix_input_multi_hidden),  
                np.zeros_like(self.Weight_metrix_hidden_multi_hidden),  
                np.zeros_like(self.Weight_metrix_output_multi_hidden),  
                np.zeros_like(self.bias_to_hidden),  
                np.zeros_like(self.bias_to_output)]   
    next_time_step_hidden_state_error = np.zeros_like(self.bias_to_hidden)
    for i in range(len(input_sequence)-1, -1, -1):
      current_time_step_output_error = (cache["outputs"][i] - target_sequence.iloc[i]) * self.tanh_derivative(cache["outputs"][i])
      current_time_step_gradient_output_to_hidden = current_time_step_output_error.dot(cache["hidden_states"][i])
      gradient[2] += current_time_step_gradient_output_to_hidden
      gradient[4] += np.sum(current_time_step_output_error, axis=1).reshape(-1, 1)
      current_time_step_hidden_state_error = self.current_time_step_hidden_state_error_method(self.Weight_metrix_output_multi_hidden, self.Weight_metrix_hidden_multi_hidden, current_time_step_output_error, next_time_step_hidden_state_error, cache["hidden_states"][i] )
      if i == 0:
        previous_hidden_state = np.zeros_like(self.bias_to_hidden)
      else:
        previous_hidden_state = cache["hidden_states"][i-1]
      current_time_step_gradient_hidden_to_hidden = current_time_step_hidden_state_error.dot(previous_hidden_state.T)
      repeated_input_sequence = input_sequence.iloc[i].to_numpy()
      print(current_time_step_hidden_state_error.T.shape)
      print(input_sequence.iloc[i].to_numpy().reshape(1, -1).T.shape)
      current_time_step_gradient_input_to_hidden = np.dot(current_time_step_hidden_state_error.T, (input_sequence.iloc[i].to_numpy().reshape(1, -1))).T
      gradient[1] += current_time_step_gradient_hidden_to_hidden
      gradient[0] += current_time_step_gradient_input_to_hidden
      gradient[3] += current_time_step_hidden_state_error
      next_time_step_hidden_state_error = current_time_step_hidden_state_error
    return gradient
  def update_weights_and_biases(self, gradients):
    self.Weight_metrix_input_multi_hidden -= self.update_weight * gradients[0]
    self.Weight_metrix_hidden_multi_hidden -= self.update_weight * gradients[1]
    self.Weight_metrix_output_multi_hidden -= self.update_weight * gradients[2]
    self.bias_to_hidden -= self.update_weight * gradients[3]
    self.bias_to_output -= self.update_weight * gradients[4]

  def train(self, training_data, target_data, epochs, sequence_length):
    assert len(training_data) == len(target_data), "Training data and target data should have the same length."
   
    for epoch in range(epochs):
        epoch_loss = 0
        for i in range(0, len(training_data) - sequence_length + 1):
            input_sequence = training_data[i : i + sequence_length]
            target_sequence = target_data[i : i + sequence_length]
            cache = self.forward_propagation(input_sequence)
            sequence_loss = 0
            for j in range(sequence_length):
                sequence_loss += np.square(cache["outputs"][j] - target_sequence.iloc[j])
            sequence_loss /= sequence_length
            epoch_loss += sequence_loss
            gradients = self.backward_propagation(input_sequence, target_sequence, cache)
            self.update_weights_and_biases(gradients)
        epoch_loss /= (len(training_data) - sequence_length + 1)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}")
  def predict(self, input_sequence):
    hidden_states = {}
    outputs = {}
    predictions = []
    for i in range(len(input_sequence)):
        current_time_step_for_input_sequence = input_sequence[i].reshape(-1, 1)
        if i == 0:
            hidden = self.tanh(current_time_step_for_input_sequence.dot(self.Weight_metrix_input_multi_hidden) + self.bias_to_hidden)
        else:
            hidden = self.tanh(current_time_step_for_input_sequence.dot(self.Weight_metrix_input_multi_hidden) + hidden_states[i-1].dot(self.Weight_matrix_hidden_multi_hidden) + self.bias_to_hidden)
        hidden_states[i] = hidden
        output = self.tanh(hidden.dot(self.Weight_metrix_hidden_multi_output) + self.bias_to_output)
        outputs[i] = output
        predictions.append(output)
    predictions = np.array(predictions).reshape(len(input_sequence), -1)
    return predictions



Check the dimensions of the arrays

Make sure that the dimensions of the arrays are consistent with the expected shapes.

In [None]:
rnn = RNN(17, 20, 1)
rnn.train(X_train, Y_train, 10, 50)


(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)


  gradient[0] += current_time_step_gradient_input_to_hidden
  gradient[1] += current_time_step_gradient_hidden_to_hidden
  gradient[3] += current_time_step_hidden_state_error


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)
(20, 1)
(17, 1)
(20, 1)
(20, 1)

In [None]:
predictions = rnn.predict(X_test)
test_loss = rnn.loss_function(predictions, Y_test)
print(f"Test Loss: {test_loss}")