<h2>Importing all necessary libraries</h2>

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 
import plotly.express as px
from sklearn.model_selection import train_test_split 

<h2>Reading the data</h2>

In [None]:
# Load the weather observations from a CSV path relative to the current working directory.
rain_df = pd.read_csv('data/weatherAUS.csv')

<h2>Exploring Data</h2>

In [None]:
# Drop rows where either rain flag is missing; this mutates rain_df in place.
rain_df.dropna(subset=['RainToday','RainTomorrow'], inplace=True)

In [None]:
# Encode the 'No'/'Yes' rain flags as integers 0/1.
# NOTE: not idempotent - re-running this cell maps the already-encoded 0/1
# values to NaN, and the integer cast then fails.
_yes_no_codes = {'No': 0, 'Yes': 1}
for _flag_col in ('RainToday', 'RainTomorrow'):
    rain_df[_flag_col] = rain_df[_flag_col].map(_yes_no_codes).astype(int)

In [None]:
# Split column names by dtype. Select every numeric dtype rather than only
# int32/float64: the rain flags were cast with astype(int), which is int64 on
# most platforms and would otherwise be silently left out.
numerical_cols = rain_df.select_dtypes(include='number').columns.tolist()
# Columns stored as Python objects; this list is overridden by an explicit
# cat_cols assignment later in the notebook.
cat_cols = rain_df.select_dtypes('object').columns.tolist()

<h2>Rainfall</h2>

In [None]:
# Histogram of Rainfall, coloured by the RainTomorrow value.
px.histogram(rain_df, x='Rainfall', color='RainTomorrow')

In [None]:
# Skewness of Rainfall, measured before the log1p transform in the next cell.
rain_df['Rainfall'].skew()

In [None]:
# Replace Rainfall with log(1 + Rainfall).
# NOTE: overwrites the column, so re-running this cell applies the transform again.
rain_df['Rainfall'] = np.log1p(rain_df['Rainfall'])

In [None]:
# Same histogram on a random 2000-row sample; no random_state is passed, so
# the sampled rows differ between runs.
px.histogram(rain_df.sample(2000),x='Rainfall', color='RainTomorrow')

<h2>Gaussian Imputer</h2>

In [None]:
def GaussianImpute(col, random_state=None, clip=True):
    """Fill missing values in ``col`` with draws from a normal distribution.

    The distribution uses the mean and standard deviation of the observed
    (non-missing) values of ``col``.

    Parameters
    ----------
    col : pandas.Series
        Numeric series that may contain missing values.
    random_state : int, numpy.random.Generator or None, default None
        Seed or generator for reproducible draws. ``None`` draws from NumPy's
        global random state, so an earlier ``np.random.seed`` call still applies.
    clip : bool, default True
        Clip the draws to the observed ``[min, max]`` of ``col``. A normal
        distribution is unbounded, so unclipped draws can leave the observed
        range (for example go negative), which breaks a later ``np.log1p``.

    Returns
    -------
    pandas.Series
        A copy of ``col`` with the missing entries filled. ``col`` itself is
        not modified. If nothing is missing, or nothing is observed, the copy
        is returned unchanged.
    """
    is_na = col.isna()
    n_missing = int(is_na.sum())

    imputed_col = col.copy()
    # Nothing to fill, or no observed values to estimate the distribution from.
    if n_missing == 0 or n_missing == len(col):
        return imputed_col

    mean = col.mean()
    std = col.std()
    # std is NaN when only one value is observed; fall back to a constant fill.
    if pd.isna(std):
        std = 0.0

    if random_state is None:
        draws = np.random.normal(mean, std, size=n_missing)
    else:
        draws = np.random.default_rng(random_state).normal(mean, std, size=n_missing)

    if clip:
        draws = np.clip(draws, col.min(), col.max())

    imputed_col.loc[is_na] = draws
    return imputed_col

<h2>Primary Idea Function</h2>

In [None]:
def display(col):
    """Print summary statistics for one ``rain_df`` column and show its histogram.

    Prints the column's skewness, its correlation with ``RainTomorrow`` and its
    number of missing values, then renders a Plotly histogram of the column
    coloured by ``RainTomorrow``.

    Note: defining this name shadows IPython's built-in ``display`` helper in
    the notebook namespace.

    Parameters
    ----------
    col : str
        Name of a column in the module-level ``rain_df``.
    """
    series = rain_df[col]
    print('Skew:', series.skew())
    print('Cor:', series.corr(rain_df.RainTomorrow))
    print('No of missing values:', series.isna().sum())
    px.histogram(rain_df, x=col, color='RainTomorrow').show()

<h2>WindGustSpeed</h2>

In [None]:
# Summary before imputation: skew, correlation with RainTomorrow, missing count, histogram.
display('WindGustSpeed')

In [None]:
# Fill missing WindGustSpeed values with random normal draws (see GaussianImpute).
rain_df['WindGustSpeed'] = GaussianImpute(rain_df['WindGustSpeed'])

In [None]:
# Same summary again, now on the imputed column.
display('WindGustSpeed')

In [None]:
# Descriptive statistics of the imputed column (min/max show whether imputation produced out-of-range values).
rain_df['WindGustSpeed'].describe()

In [None]:
# Replace WindGustSpeed with log(1 + WindGustSpeed).
# Gaussian imputation can draw negative values, and log1p returns NaN/-inf for
# inputs <= -1, which would later break model fitting. Floor at 0 first.
# NOTE: overwrites the column, so re-running this cell applies the transform again.
rain_df['WindGustSpeed'] = np.log1p(rain_df['WindGustSpeed'].clip(lower=0))

<h2>Humidity</h2>

In [None]:
# Summary before imputation: skew, correlation with RainTomorrow, missing count, histogram.
display('Humidity3pm')

In [None]:
# Seaborn histogram of Humidity3pm with a kernel density estimate overlaid.
sns.histplot(rain_df['Humidity3pm'], kde=True)

In [None]:
# Descriptive statistics of Humidity3pm before imputation.
rain_df['Humidity3pm'].describe()

In [None]:
# Fill missing Humidity3pm values with random normal draws (see GaussianImpute).
rain_df['Humidity3pm'] = GaussianImpute(rain_df['Humidity3pm'])

In [None]:
# Same summary again, now on the imputed column.
display('Humidity3pm')

<h2>Pressure</h2>

In [None]:
# Summary before imputation: skew, correlation with RainTomorrow, missing count, histogram.
display('Pressure9am')

In [None]:
# Fill missing Pressure9am values with random normal draws (see GaussianImpute).
rain_df['Pressure9am'] = GaussianImpute(rain_df['Pressure9am'])

In [None]:
# Same summary again, now on the imputed column.
display('Pressure9am')

<h2>Temp</h2>

In [None]:
# Summary before imputation: skew, correlation with RainTomorrow, missing count, histogram.
display('Temp3pm')

In [None]:
# Fill missing Temp3pm values with random normal draws (see GaussianImpute).
rain_df['Temp3pm'] = GaussianImpute(rain_df['Temp3pm'])

In [None]:
# Same summary again, now on the imputed column.
display('Temp3pm')

<h2>Creating new column: Rain Risk</h2>

In [None]:
# Engineered feature: (Humidity3pm / 100) * (1 - Temp3pm / 100) * WindGustSpeed.
# NOTE(review): WindGustSpeed has already been log1p-transformed at this point - confirm that is intended.
rain_df['rain_risk'] = (rain_df['Humidity3pm']/100) * (1-(rain_df['Temp3pm'])/100) * rain_df['WindGustSpeed']

In [None]:
# Skew, correlation with RainTomorrow, missing count and histogram for the new feature.
display('rain_risk')

<h2>Creating new column: Temp Diff</h2>

In [None]:
# Fill missing values in both temperature extremes with random normal draws.
# MaxTemp is processed first, then MinTemp.
for _temp_col in ('MaxTemp', 'MinTemp'):
    rain_df[_temp_col] = GaussianImpute(rain_df[_temp_col])

In [None]:
# Engineered feature: MaxTemp minus MinTemp.
# NOTE(review): both columns are imputed independently, so an imputed row may yield a negative difference.
rain_df['temp_dif'] = rain_df['MaxTemp']-rain_df['MinTemp']

In [None]:
# Skew, correlation with RainTomorrow, missing count and histogram for the new feature.
display('temp_dif')

<h2>Defining Columns</h2>

In [None]:
# Numeric model inputs.
# NOTE(review): the engineered rain_risk and temp_dif columns are not listed,
# so they never reach the model - confirm this is intended.
numeric_cols = ['Rainfall','WindGustSpeed','Humidity3pm','Pressure9am','Temp3pm', 'RainToday']
numeric_cols

In [None]:
# Categorical model inputs; replaces the dtype-derived cat_cols list built earlier.
cat_cols = ['Location', 'WindDir3pm']
cat_cols

In [None]:
# Name of the label column (a single string, despite the plural variable name).
target_cols = 'RainTomorrow'

In [None]:
# All raw input columns: numeric first, then categorical.
input_cols = numeric_cols + cat_cols
input_cols

In [None]:
# List every column currently in rain_df, including the engineered ones.
rain_df.columns

<h2>Imputing Categorical Columns</h2>

In [None]:
# Fill missing WindDir3pm with its most frequent value (the first one if several tie).
rain_df['WindDir3pm'] = rain_df['WindDir3pm'].fillna(rain_df['WindDir3pm'].mode()[0])

In [None]:
# Count the missing values remaining in each categorical input column.
rain_df[cat_cols].isna().sum()

In [None]:
# Histogram of WindDir3pm categories, coloured by the RainTomorrow value.
px.histogram(rain_df, x='WindDir3pm', color='RainTomorrow')

In [None]:
# Descriptive summary of the categorical input columns.
rain_df[cat_cols].describe()

<h2>Splitting Train, Validation, Test Data</h2>

In [None]:
# Number of rows per calendar year, parsed from the Date column.
sns.countplot(x=pd.to_datetime(rain_df.Date).dt.year)

In [None]:
# Chronological split: years before 2015 train the model, 2015 is used for
# validation, and later years are held out for testing.
# `year` was never defined in the notebook, so derive it from Date here.
year = pd.to_datetime(rain_df.Date).dt.year
train_df = rain_df[year < 2015]
val_df = rain_df[year == 2015]
test_df = rain_df[year > 2015]

In [None]:
# Separate the input columns from the label for each split. Copies are taken
# so later scaling/encoding does not write into slices of rain_df.
# The label name is stored in `target_cols`; the original `target_col` was undefined.
train_inputs = train_df[input_cols].copy()
train_target = train_df[target_cols].copy()
val_inputs = val_df[input_cols].copy()
val_target = val_df[target_cols].copy()
test_inputs = test_df[input_cols].copy()
test_target = test_df[target_cols].copy()

<h1>Scaling Down</h1>
<p>In this section, I standardize all the numeric columns with StandardScaler (zero mean, unit variance) so that they share a comparable scale.</p>
<p>This prevents the model from being dominated by columns with larger values.</p>

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler with default settings; it is fitted in the next cell.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training rows only, so validation and
# test data do not influence them.
scaler.fit(train_df[numeric_cols])

In [None]:
# Apply the train-fitted scaler to the numeric columns of every split,
# overwriting the unscaled values in place.
for _split_inputs in (train_inputs, val_inputs, test_inputs):
    _split_inputs[numeric_cols] = scaler.transform(_split_inputs[numeric_cols])

In [None]:
# Skewness of each scaled numeric training column.
train_inputs[numeric_cols].skew()

In [None]:
# Show the current working directory (relative paths such as the CSV path resolve against it).
import os
os.getcwd()

<h1>Encoder for Categorical Columns</h1>
<p>Having scaled the numeric columns with StandardScaler, I now use OneHotEncoder to handle the categorical columns.</p>

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encoder returning a dense array; categories not seen during fit are
# ignored at transform time instead of raising an error.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [None]:
# Count missing values per categorical training column before fitting the encoder.
train_inputs[cat_cols].isna().sum()

In [None]:
# Learn the category vocabulary from the training split only.
encoder.fit(train_inputs[cat_cols])

In [None]:
# Names of the generated one-hot columns; used below as the new column labels.
encoded_cols = encoder.get_feature_names_out(cat_cols)
encoded_cols

In [None]:
# Append the one-hot columns to every split using the train-fitted encoder.
# The original categorical columns are kept alongside them.
for _split_inputs in (train_inputs, val_inputs, test_inputs):
    _split_inputs[encoded_cols] = encoder.transform(_split_inputs[cat_cols])

<h2>Logistic Regression Model</h2>

In [None]:
from sklearn.linear_model import LogisticRegression 

In [None]:
# Logistic regression using the liblinear solver, capped at 100 iterations.
model = LogisticRegression(solver='liblinear',max_iter=100)

In [None]:
# Train on the scaled numeric columns plus the one-hot columns; the raw
# categorical columns are left out.
model.fit(train_inputs[numeric_cols + list(encoded_cols)], train_target)

In [None]:
# Print the learned coefficients as a nested Python list.
print(model.coef_.tolist())

In [None]:
# Pair each model input column with its learned coefficient, then show the
# table ordered from the largest weight to the smallest.
_feature_names = numeric_cols + encoded_cols.tolist()
_weights = model.coef_.tolist()[0]
weight_df = pd.DataFrame({'feature': _feature_names, 'weight': _weights})
weight_df.sort_values(by='weight', ascending=False)

In [None]:
# Intercept (bias) term of the fitted model.
model.intercept_

In [None]:
# Bar chart of the ten features with the largest coefficients (signed, not absolute).
sns.barplot(data=weight_df.sort_values(by='weight', ascending=False).head(10), x='weight', y='feature')

In [None]:
# Histogram of Cloud3pm by RainTomorrow. Cloud3pm is not one of the model's input columns.
px.histogram(rain_df, x='Cloud3pm', color='RainTomorrow')

In [None]:
# Final feature matrices per split: scaled numeric columns followed by the
# one-hot columns, in the same order used to fit the model.
x_train = train_inputs[numeric_cols + encoded_cols.tolist()]
x_val = val_inputs[numeric_cols + encoded_cols.tolist()]
x_test = test_inputs[numeric_cols + encoded_cols.tolist()]

In [None]:
# Predicted class labels for the training rows.
train_predict = model.predict(x_train)

In [None]:
# Inspect the predicted labels.
train_predict

In [None]:
# Inspect the true labels for comparison with the predictions.
train_target

In [None]:
# Number of training rows whose predicted label differs from the true label.
np.sum(np.array(train_predict) != np.array(train_target))

In [None]:
# Total number of training rows, for context on the misclassification count.
train_target.shape

<h2>Accuracy and Confusion Matrix</h2>

In [None]:
from sklearn.metrics import accuracy_score 

In [None]:
# Fraction of training rows predicted correctly.
accuracy_score(train_target, train_predict)

In [None]:
# Per-class predicted probabilities for the training rows (one column per class).
train_probabs = model.predict_proba(x_train)
train_probabs

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix normalised over the true labels, so each row sums to 1.
confusion_matrix(train_target, train_predict, normalize='true')

In [None]:
def conf_mat(inputs, target, name=''):
    """Report accuracy and plot a normalised confusion matrix for one split.

    Predictions come from the module-level ``model``. The accuracy is printed
    as a percentage and the confusion matrix, normalised over the true labels,
    is drawn as an annotated heatmap on a new figure.

    Parameters
    ----------
    inputs : array-like
        Feature matrix passed to ``model.predict``.
    target : array-like
        True labels matching the rows of ``inputs``.
    name : str, default ''
        Label for the split, used as the prefix of the plot title.

    Returns
    -------
    The predicted labels returned by ``model.predict``.
    """
    predict = model.predict(inputs)
    print("Accuracy: {:.2f}%".format(accuracy_score(target, predict) * 100))

    plt.figure()
    sns.heatmap(confusion_matrix(target, predict, normalize='true'), annot=True)
    plt.title('{} Confusion Matrix'.format(name))
    plt.xlabel('Prediction')
    plt.ylabel('Target')

    return predict

In [None]:
# Accuracy and confusion matrix on the training split (the data the model was fitted on).
train_predict = conf_mat(x_train, train_target, 'Training')

In [None]:
# Accuracy and confusion matrix on the validation split (rows from 2015).
val_predict = conf_mat(x_val, val_target, 'Validation')

In [None]:
# Accuracy and confusion matrix on the held-out test split (rows after 2015).
test_predict = conf_mat(x_test, test_target, 'Test')