In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("weatherAUS.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261334 entries, 0 to 261333
Data columns (total 24 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           261334 non-null  object 
 1   Location       261334 non-null  object 
 2   MinTemp        256726 non-null  float64
 3   MaxTemp        256794 non-null  float64
 4   Rainfall       253338 non-null  float64
 5   Evaporation    112128 non-null  float64
 6   Sunshine       101531 non-null  float64
 7   WindGustDir    241969 non-null  object 
 8   WindGustSpeed  242163 non-null  float64
 9   WindDir9am     240965 non-null  object 
 10  WindDir3pm     249740 non-null  object 
 11  WindSpeed9am   255841 non-null  float64
 12  WindSpeed3pm   251389 non-null  float64
 13  Humidity9am    255245 non-null  float64
 14  Humidity3pm    250305 non-null  float64
 15  Pressure9am    232515 non-null  float64
 16  Pressure3pm    232539 non-null  float64
 17  Cloud9am       142779 non-nul

In [4]:
df.isnull().sum()

Date                  0
Location              0
MinTemp            4608
MaxTemp            4540
Rainfall           7996
Evaporation      149206
Sunshine         159803
WindGustDir       19365
WindGustSpeed     19171
WindDir9am        20369
WindDir3pm        11594
WindSpeed9am       5493
WindSpeed3pm       9945
Humidity9am        6089
Humidity3pm       11029
Pressure9am       28819
Pressure3pm       28795
Cloud9am         118555
Cloud3pm         124625
Temp9am            4597
Temp3pm            9649
RainToday          7996
RISK_MM            7997
RainTomorrow       7997
dtype: int64

In [5]:
# Drop columns with too many missing values
# Since these columns contains missing values more that 8% of the total data it is better to drop them.
cols_to_drop = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
df.drop(columns=cols_to_drop, inplace=True)



In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261334 entries, 0 to 261333
Data columns (total 20 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           261334 non-null  object 
 1   Location       261334 non-null  object 
 2   MinTemp        256726 non-null  float64
 3   MaxTemp        256794 non-null  float64
 4   Rainfall       253338 non-null  float64
 5   WindGustDir    241969 non-null  object 
 6   WindGustSpeed  242163 non-null  float64
 7   WindDir9am     240965 non-null  object 
 8   WindDir3pm     249740 non-null  object 
 9   WindSpeed9am   255841 non-null  float64
 10  WindSpeed3pm   251389 non-null  float64
 11  Humidity9am    255245 non-null  float64
 12  Humidity3pm    250305 non-null  float64
 13  Pressure9am    232515 non-null  float64
 14  Pressure3pm    232539 non-null  float64
 15  Temp9am        256737 non-null  float64
 16  Temp3pm        251685 non-null  float64
 17  RainToday      253338 non-nul

In [7]:
# Dropping raintommorow rows with null value as it will be our target.
df=df.dropna(subset=["RainTomorrow"])

In [8]:
df.shape

(253337, 20)

In [9]:
# Imputing Missing Values in other columns

# These are the columns to be filled with mean
num_cols = ['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed',
            'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
            'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm', 'RISK_MM'] 

for col in num_cols:
    df[col].fillna(df[col].mean(), inplace=True)

# Columns to be filled with mode
cat_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

for col in cat_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [10]:
df.isnull().sum()

Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Temp9am          0
Temp3pm          0
RainToday        0
RISK_MM          0
RainTomorrow     0
dtype: int64

In [11]:
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,No,0.2,No


In [12]:
# Label Encoding 
from sklearn.preprocessing import LabelEncoder

label_cols = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']

le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])



In [13]:
# Binary encoding (Manual methode)
df['RainToday'] = df['RainToday'].map({'No': 0, 'Yes': 1})
df['RainTomorrow'] = df['RainTomorrow'].map({'No': 0, 'Yes': 1})

In [14]:
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,2,13.4,22.9,0.6,13,44.0,13,14,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,0,0.0,0
1,2008-12-02,2,7.4,25.1,0.0,14,44.0,6,15,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,0,0.0,0
2,2008-12-03,2,12.9,25.7,0.0,15,46.0,13,15,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,0,0.0,0
3,2008-12-04,2,9.2,28.0,0.0,4,24.0,9,0,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,0,1.0,0
4,2008-12-05,2,17.5,32.3,1.0,13,41.0,1,7,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,0,0.2,0


In [15]:
#Date 
df['Date'] = pd.to_datetime(df['Date'])

df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

df.drop(columns=['Date'], inplace=True)



In [16]:
df.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,...,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow,Year,Month,Day
0,2,13.4,22.9,0.6,13,44.0,13,14,20.0,24.0,...,1007.7,1007.1,16.9,21.8,0,0.0,0,2008,12,1
1,2,7.4,25.1,0.0,14,44.0,6,15,4.0,22.0,...,1010.6,1007.8,17.2,24.3,0,0.0,0,2008,12,2
2,2,12.9,25.7,0.0,15,46.0,13,15,19.0,26.0,...,1007.6,1008.7,21.0,23.2,0,0.0,0,2008,12,3
3,2,9.2,28.0,0.0,4,24.0,9,0,11.0,9.0,...,1017.6,1012.8,18.1,26.5,0,1.0,0,2008,12,4
4,2,17.5,32.3,1.0,13,41.0,1,7,7.0,20.0,...,1010.8,1006.0,17.8,29.7,0,0.2,0,2008,12,5


In [17]:

corr_matrix = df.corr()

threshold = 0.60

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

high_corr_features = [column for column in upper.columns if any(upper[column] > threshold)]
print("Highly correlated features:", high_corr_features)


Highly correlated features: ['MaxTemp', 'WindSpeed3pm', 'Humidity3pm', 'Pressure3pm', 'Temp9am', 'Temp3pm']


In [18]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

X = df.drop('RainTomorrow', axis=1)
y = df['RainTomorrow']

scaler = MinMaxScaler(feature_range=(0, 1))
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

pca = PCA(n_components=0.95) 
X_pca = pca.fit_transform(X)

X = X_pca  
y = y.values

def create_sequences(X, y, sequence_length=5):
    sequences = []
    targets = []

    for i in range(len(X) - sequence_length):
        sequences.append(X[i:i + sequence_length])
        targets.append(y[i + sequence_length]) 

    return np.array(sequences), np.array(targets)

X_seq, y_seq = create_sequences(X, y, sequence_length=5)

print("Reshaped X sequence shape:", X_seq.shape)
print("Reshaped y sequence shape:", y_seq.shape)


Reshaped X sequence shape: (253332, 5, 11)
Reshaped y sequence shape: (253332,)


In [19]:
split_ratio = 0.8
split_index = int(len(X_seq) * split_ratio)

X_train, X_test = X_seq[:split_index], X_seq[split_index:]
y_train, y_test = y_seq[:split_index], y_seq[split_index:]

print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Train set shape: (202665, 5, 11)
Test set shape: (50667, 5, 11)


In [20]:
# Making Module
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.initializers import GlorotUniform

model = Sequential()
model.add(SimpleRNN(units=50, activation='relu', 
                    input_shape=(X_train.shape[1], X_train.shape[2]), 
                    kernel_initializer=GlorotUniform()))  
model.add(Dense(1, activation='sigmoid', kernel_initializer=GlorotUniform())) 

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(**kwargs)


In [21]:
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
[1m3167/3167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 705us/step - accuracy: 0.7651 - loss: 0.5108 - val_accuracy: 0.8096 - val_loss: 0.4426
Epoch 2/10
[1m3167/3167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 666us/step - accuracy: 0.7786 - loss: 0.4872 - val_accuracy: 0.8118 - val_loss: 0.4292
Epoch 3/10
[1m3167/3167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 670us/step - accuracy: 0.7777 - loss: 0.4869 - val_accuracy: 0.8117 - val_loss: 0.4254
Epoch 4/10
[1m3167/3167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 664us/step - accuracy: 0.7787 - loss: 0.4829 - val_accuracy: 0.8129 - val_loss: 0.4276
Epoch 5/10
[1m3167/3167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 671us/step - accuracy: 0.7799 - loss: 0.4802 - val_accuracy: 0.8140 - val_loss: 0.4292
Epoch 6/10
[1m3167/3167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 659us/step - accuracy: 0.7814 - loss: 0.4780 - val_accuracy: 0.8111 - val_loss: 0.4227
Epoc

<keras.src.callbacks.history.History at 0x164f52870>

In [22]:
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)

[1m1584/1584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 307us/step


In [23]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 81.33%
