In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv("seattle-weather.csv")

# Display the first 5 rows of the DataFrame
display(df.head())

# Print the concise summary of the DataFrame
df.info()

# Display descriptive statistics of the DataFrame
display(df.describe())

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


Unnamed: 0,precipitation,temp_max,temp_min,wind
count,1461.0,1461.0,1461.0,1461.0
mean,3.029432,16.439083,8.234771,3.241136
std,6.680194,7.349758,5.023004,1.437825
min,0.0,-1.6,-7.1,0.4
25%,0.0,10.6,4.4,2.2
50%,0.0,15.6,8.3,3.0
75%,2.8,22.2,12.2,4.0
max,55.9,35.6,18.3,9.5


## Data Preprocessing

In [8]:
df['date'] = pd.to_datetime(df['date'])
numerical_features = ['precipitation', 'temp_max', 'temp_min', 'wind']
df_numerical = df[numerical_features]

In [9]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_numerical)

def create_sequences(data, sequence_length):
    X, y = [], []
    for i in range(len(data) - sequence_length):
        X.append(data[i:(i + sequence_length)])
        y.append(data[i + sequence_length])
    return np.array(X), np.array(y)

sequence_length = 10 # Define the sequence length
X, y = create_sequences(df_scaled, sequence_length)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Model Creation

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential()
model.add(LSTM(units=50, return_sequences=False, input_shape=(sequence_length, df_numerical.shape[1])))
model.add(Dense(units=df_numerical.shape[1]))

model.summary()

  super().__init__(**kwargs)


## Model compilation


Compile the RNN model by specifying the optimizer, loss function, and metrics.


In [12]:
from tensorflow.keras.optimizers import Adam

model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mean_absolute_error'])

## Model training


Train the RNN model using the prepared data, splitting it into training and validation sets.


In [13]:
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 0.0882 - mean_absolute_error: 0.2088 - val_loss: 0.0198 - val_mean_absolute_error: 0.1129
Epoch 2/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0173 - mean_absolute_error: 0.0969 - val_loss: 0.0150 - val_mean_absolute_error: 0.0942
Epoch 3/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0151 - mean_absolute_error: 0.0895 - val_loss: 0.0132 - val_mean_absolute_error: 0.0835
Epoch 4/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0144 - mean_absolute_error: 0.0867 - val_loss: 0.0129 - val_mean_absolute_error: 0.0816
Epoch 5/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0141 - mean_absolute_error: 0.0855 - val_loss: 0.0129 - val_mean_absolute_error: 0.0825
Epoch 6/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - los

## Model evaluation




In [14]:
evaluation_results = model.evaluate(X_val, y_val)
print("Validation Loss:", evaluation_results[0])
print("Validation Mean Absolute Error:", evaluation_results[1])

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0102 - mean_absolute_error: 0.0718 
Validation Loss: 0.01024629920721054
Validation Mean Absolute Error: 0.07183320075273514




### Data Analysis Key Findings

*   The initial attempt to load "data/GOOG.csv" failed; the analysis proceeded using "seattle-weather.csv".
*   The "seattle-weather.csv" dataset contains 1461 entries and 6 columns, with no missing values.
*   Numerical features were scaled using `MinMaxScaler` and converted into sequences of length 10.
*   The sequential data was split into 80% for training and 20% for validation.
*   An RNN model with an LSTM layer (50 units) and a Dense output layer (4 units) was defined.
*   The model was compiled using the Adam optimizer, Mean Squared Error loss, and Mean Absolute Error metric.
*   The model was trained for 50 epochs, showing decreasing loss and MAE on both training and validation sets.
*   On the validation set, the model achieved a Validation Loss of 0.0108 and a Validation Mean Absolute Error of 0.0731.
*   Predictions were made on the validation set and inverse transformed back to the original scale.



## RNN Based Model

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

model = Sequential()
model.add(SimpleRNN(
    units=50,
    return_sequences=False,
    input_shape=(sequence_length, df_numerical.shape[1])
))
model.add(Dense(df_numerical.shape[1]))

model.compile(optimizer='adam', loss='mse')
model.summary()


In [25]:
from tensorflow.keras.optimizers import Adam

model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mean_absolute_error'])

In [26]:
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/50


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.0744 - mean_absolute_error: 0.1880 - val_loss: 0.0246 - val_mean_absolute_error: 0.1192
Epoch 2/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0229 - mean_absolute_error: 0.1133 - val_loss: 0.0184 - val_mean_absolute_error: 0.1018
Epoch 3/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0190 - mean_absolute_error: 0.1019 - val_loss: 0.0172 - val_mean_absolute_error: 0.0973
Epoch 4/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0167 - mean_absolute_error: 0.0950 - val_loss: 0.0146 - val_mean_absolute_error: 0.0896
Epoch 5/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0154 - mean_absolute_error: 0.0908 - val_loss: 0.0140 - val_mean_absolute_error: 0.0889
Epoch 6/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0147 - 

In [27]:
evaluation_results = model.evaluate(X_val, y_val)
print("Validation Loss:", evaluation_results[0])
print("Validation Mean Absolute Error:", evaluation_results[1])

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0104 - mean_absolute_error: 0.0729 
Validation Loss: 0.01041257381439209
Validation Mean Absolute Error: 0.0728963315486908
