In [191]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [192]:
df = pd.read_csv('WeatherAUS.csv')

In [193]:
df.shape

(36881, 24)

In [194]:
df.duplicated().sum()

np.int64(0)

In [195]:
df.isnull().sum()

Date                 0
Location             0
MinTemp            338
MaxTemp            242
Rainfall           626
Evaporation      12846
Sunshine         13564
WindGustDir       3368
WindGustSpeed     3361
WindDir9am        2809
WindDir3pm         962
WindSpeed9am       662
WindSpeed3pm       646
Humidity9am        570
Humidity3pm        511
Pressure9am       3572
Pressure3pm       3552
Cloud9am         12500
Cloud3pm         12982
Temp9am            487
Temp3pm            444
RainToday          626
RISK_MM            620
RainTomorrow       620
dtype: int64

In [196]:
numeric_features = ['MinTemp', ]

In [197]:
df.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'],
      dtype='object')

In [198]:
numeric_features = ['Temp9am', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
               'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']

In [199]:
for col in numeric_features:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3-Q1
    min = Q1 - 1.5*IQR
    max = Q3 + 1.5*IQR
    df = df[(df[col]>=min) & (df[col]<=max)]

In [200]:
df.shape

(13309, 24)

In [201]:
df.isnull().sum()

Date               0
Location           0
MinTemp            5
MaxTemp            0
Rainfall           0
Evaporation        0
Sunshine           0
WindGustDir        1
WindGustSpeed      0
WindDir9am       406
WindDir3pm        42
WindSpeed9am       0
WindSpeed3pm       0
Humidity9am        0
Humidity3pm        0
Pressure9am        0
Pressure3pm        0
Cloud9am           0
Cloud3pm           0
Temp9am            0
Temp3pm            0
RainToday          0
RISK_MM           15
RainTomorrow      15
dtype: int64

In [202]:
df['WindDir9am'].fillna(df['WindDir9am'].mode()[0], inplace=True)
df['WindDir3pm'].fillna(df['WindDir3pm'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['WindDir9am'].fillna(df['WindDir9am'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['WindDir3pm'].fillna(df['WindDir3pm'].mode()[0], inplace=True)


In [203]:
df.dropna(inplace=True)
df.shape

(13288, 24)

In [204]:
corr = df.corr(numeric_only=True)
filtered_corr = corr[(corr.abs() > 0.5) & (corr.abs() != 1.0)]
filtered_corr

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RISK_MM
MinTemp,,0.773167,,0.602721,,,,,,,-0.553078,-0.547286,,,0.921588,0.749755,
MaxTemp,0.773167,,,0.716576,,,,,,,,-0.543179,,,0.884931,0.986198,
Rainfall,,,,,,,,,,,,,,,,,
Evaporation,0.602721,0.716576,,,,,,,-0.567728,,,,,,0.665115,0.69737,
Sunshine,,,,,,,,,,-0.580356,,,-0.661156,-0.685046,,,
WindGustSpeed,,,,,,,0.54446,0.65165,,,,,,,,,
WindSpeed9am,,,,,,0.54446,,,,,,,,,,,
WindSpeed3pm,,,,,,0.65165,,,,,,,,,,,
Humidity9am,,,,-0.567728,,,,,,0.601936,,,,,,,
Humidity3pm,,,,,-0.580356,,,,0.601936,,,,,,,,


In [205]:
from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
df['Location'] = lb.fit_transform(df['Location'])
df['WindGustDir'] = lb.fit_transform(df['WindGustDir'])
df = pd.get_dummies(df, columns=['RainToday'])

In [206]:
df.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RISK_MM', 'RainTomorrow', 'RainToday_No', 'RainToday_Yes'],
      dtype='object')

In [207]:
features = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'WindGustDir', 'Location', 'RainToday_No', 'RainToday_Yes']
X = df[features]
y = df['RainTomorrow']

In [208]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [209]:
X_train.isna().sum()

MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
WindGustDir      0
Location         0
RainToday_No     0
RainToday_Yes    0
dtype: int64

In [210]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

In [211]:
y_pred = model.predict(X_test)
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_pred, y_test)
acc

0.8404815650865313