In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [2]:
dataset = pd.read_csv(r"datasets/weatherAUS.csv")

In [3]:
df = pd.DataFrame(dataset)

In [4]:
df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


In [5]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [6]:
print(X)

[['2008-12-01' 'Albury' 13.4 ... 16.9 21.8 'No']
 ['2008-12-02' 'Albury' 7.4 ... 17.2 24.3 'No']
 ['2008-12-03' 'Albury' 12.9 ... 21.0 23.2 'No']
 ...
 ['2017-06-23' 'Uluru' 5.4 ... 12.5 26.1 'No']
 ['2017-06-24' 'Uluru' 7.8 ... 15.1 26.0 'No']
 ['2017-06-25' 'Uluru' 14.9 ... 15.0 20.9 'No']]


In [7]:
print(y)

['No' 'No' 'No' ... 'No' 'No' nan]


In [8]:
df.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [9]:
df1 = df.copy()

In [10]:
print("Before:",df1.shape)
df1 = df1.drop(['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'], axis=1)
print("After:",df1.shape)

Before: (145460, 23)
After: (145460, 19)


In [11]:
from sklearn.impute import SimpleImputer

num_cols = df1.select_dtypes(include=['float64']).columns
num_imputer = SimpleImputer(strategy='median')
df1[num_cols] = num_imputer.fit_transform(df1[num_cols])


In [12]:
print(X)

[['2008-12-01' 'Albury' 13.4 ... 16.9 21.8 'No']
 ['2008-12-02' 'Albury' 7.4 ... 17.2 24.3 'No']
 ['2008-12-03' 'Albury' 12.9 ... 21.0 23.2 'No']
 ...
 ['2017-06-23' 'Uluru' 5.4 ... 12.5 26.1 'No']
 ['2017-06-24' 'Uluru' 7.8 ... 15.1 26.0 'No']
 ['2017-06-25' 'Uluru' 14.9 ... 15.0 20.9 'No']]


In [13]:
cat_cols = df1.select_dtypes(include=['object']).columns
cat_imputer = SimpleImputer(strategy='most_frequent')
df1[cat_cols] = cat_imputer.fit_transform(df1[cat_cols])


In [14]:
df1.isnull().sum()

Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [15]:
cat_cols = df1.select_dtypes(include=['object']).columns
print(cat_cols)

Index(['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm',
       'RainToday', 'RainTomorrow'],
      dtype='object')


In [16]:
df1['Date'] = pd.to_datetime(df1['Date'], errors='coerce')
df1['Year'] = df1['Date'].dt.year
df1['Month'] = df1['Date'].dt.month
df1['Day'] = df1['Date'].dt.day
df1 = df1.drop('Date', axis=1)

In [17]:
cols_to_encode = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
df1 = pd.get_dummies(df1, columns=cols_to_encode, drop_first=True)

In [18]:
df1['RainToday'] = df1['RainToday'].map({'Yes': 1, 'No': 0})
df1['RainTomorrow'] = df1['RainTomorrow'].map({'Yes': 1, 'No': 0})

In [19]:
print(df1.select_dtypes(include=['object']).columns)

Index([], dtype='object')


In [20]:
X = df1.drop('RainTomorrow', axis=1)
y = df1['RainTomorrow']

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,      
    random_state=42,    
    stratify=y          
)


In [22]:
print("Training set:", X_train.shape, y_train.shape)
print("Testing set:", X_test.shape, y_test.shape)

Training set: (116368, 109) (116368,)
Testing set: (29092, 109) (29092,)
