In [None]:
import pandas as pd
import os
from kaggle.api.kaggle_api_extended import KaggleApi
import matplotlib.pyplot as plt
%matplotlib notebook
import numpy as np
from scipy.stats import skew, kurtosis
from sklearn.mixture import GaussianMixture


# Point the Kaggle client at its credentials directory before authenticating.
# An already-exported KAGGLE_CONFIG_DIR is respected; otherwise fall back to
# ~/.kaggle so the notebook is not tied to one user's absolute path.
# NOTE(review): the kaggle package may read this variable when it is imported
# (the import lines at the top of this cell) - confirm the setting takes effect here.
os.environ.setdefault(
    'KAGGLE_CONFIG_DIR',
    os.path.join(os.path.expanduser('~'), '.kaggle'),
)
api = KaggleApi()
api.authenticate()
print("Kaggle authentication successful!")

In [None]:
# Read the weatherAUS CSV into `data`, report the row count and preview ten rows.
# NOTE(review): absolute local path - consider a configurable data directory.
csv_path = r"C:\code_projects\applied_DS\rain_in_australia_classifier\weatherAUS.csv"
data = pd.read_csv(csv_path)
print(f'There are {data.shape[0]} samples within the dataset')
data.head(10)

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
data.describe(include='all')

In [None]:
# Normalise column dtypes: numeric -> float, text -> category, 'Date' -> datetime64[ns].
# The cell is written to be safe to re-run on the already-converted frame.
numeric_cols = data.select_dtypes(include=["number"]).columns.tolist()

for numeric_col in numeric_cols:
    data[numeric_col] = data[numeric_col].astype(float)

# 'Date' is excluded by filtering rather than list.remove(): after the first run
# 'Date' is no longer an object column, and remove() would raise ValueError.
# 'category' is selected as well so a re-run yields the same column list
# (on a fresh frame there are no category columns, so the first run is unchanged).
categorical_cols = [
    col
    for col in data.select_dtypes(include=["object", "category"]).columns
    if col != 'Date'
]

for categorical_col in categorical_cols:
    data[categorical_col] = data[categorical_col].astype('category')

data['Date'] = data['Date'].astype('datetime64[ns]')

data.dtypes

In [None]:
# Percentage of missing values per column (two decimals), highest first.

total_rows = data.shape[0]
na_counts = data.isnull().sum()
na_pct_text = na_counts.apply(lambda count: f"{(count / total_rows * 100):.2f}")
mising_values_col_sum = (
    na_pct_text
    .astype('float')
    .rename('% of NA')
    .sort_values(ascending=False)
)
mising_values_col_sum

In [None]:
# Histogram of each selected feature, one stacked panel per feature, with the
# feature's missing-value percentage shown in the panel title.
features = ['Sunshine', 'Evaporation', 'Cloud3pm','Cloud9am','Pressure9am','Pressure3pm']
n_features = len(features)

fig, axes = plt.subplots(nrows=n_features, ncols=1, figsize=(8, 4 * n_features))

for ax, feature in zip(axes, features):
    ax.hist(data[feature], bins=20, color='skyblue', edgecolor='black')
    ax.set(
        title=f'{feature} | % missing values {mising_values_col_sum[feature]}',
        xlabel=feature,
        ylabel='Frequency',
    )

plt.tight_layout()
plt.show()

In [None]:
# Skewness and excess kurtosis of each feature's observed (non-missing) values,
# used to spot features whose distribution is not normal.

observed_by_feature = {name: data[name].dropna() for name in features}

results = {
    'feature': list(observed_by_feature),
    'Skewness': [skew(values) for values in observed_by_feature.values()],
    'Excess_Kurtosis': [kurtosis(values) for values in observed_by_feature.values()],
}

pd.DataFrame(results)



In [None]:
# Impute missing Evaporation by drawing one random observed value PER missing row,
# so the filled column keeps the shape (skew, extreme values) of the observed data.
# Passing a single np.random.choice(observed) scalar to fillna would instead write
# the same value into every gap.

observed = data['Evaporation'].dropna()
evaporation_missing = data['Evaporation'].isna()
n_evaporation_missing = int(evaporation_missing.sum())

# Nothing to fill, or nothing to draw from -> leave the column untouched.
if n_evaporation_missing > 0 and not observed.empty:
    evaporation_rng = np.random.default_rng(42)  # fixed seed: Run All is reproducible
    data.loc[evaporation_missing, 'Evaporation'] = evaporation_rng.choice(
        observed.to_numpy(), size=n_evaporation_missing
    )

In [None]:
# Impute the features treated as bimodal by resampling observed values per GMM cluster:
# fit a 2-component Gaussian mixture to the observed values, pick a component for each
# missing row according to the mixture weights, then draw a random observed value that
# the mixture assigned to that component.

bi_mode_features = ['Cloud3pm', 'Cloud9am', 'Sunshine']
n_components = 2
imputation_rng = np.random.default_rng(42)  # seeded so a full re-run reproduces the fill

for col in bi_mode_features:
    # Rows that need a value
    missing_idx = data[col].isna()
    n_missing = int(missing_idx.sum())
    if n_missing == 0:
        continue  # nothing to impute (e.g. the cell has already been run)

    observed = data[col].dropna().values.reshape(-1, 1)

    # Fit the mixture and label every observed value with its component
    gmm = GaussianMixture(n_components=n_components, random_state=42)
    gmm.fit(observed)
    cluster_labels = gmm.predict(observed)

    # Observed values grouped by component
    cluster_values = [observed[cluster_labels == k].ravel() for k in range(n_components)]

    # A component may end up with no observed members; give it zero probability
    # so it is never chosen, and renormalise the remaining weights.
    has_members = np.array([values.size > 0 for values in cluster_values])
    cluster_probs = np.where(has_members, gmm.weights_, 0.0)
    cluster_probs = cluster_probs / cluster_probs.sum()

    # Component chosen for each missing row
    sampled_cluster = imputation_rng.choice(n_components, size=n_missing, p=cluster_probs)

    # One vectorised draw per component instead of one Python-level draw per row
    imputed_values = np.empty(n_missing, dtype=float)
    for k in range(n_components):
        drawn_from_k = sampled_cluster == k
        n_drawn = int(drawn_from_k.sum())
        if n_drawn:
            imputed_values[drawn_from_k] = imputation_rng.choice(cluster_values[k], size=n_drawn)

    data.loc[missing_idx, col] = imputed_values


# Percentage of values still missing in each imputed column. The series name and
# the plot titles both say "%", so compute a percentage rather than a raw count.
missing_after_GaussianMixture = (
    data[bi_mode_features]
    .isna()
    .mean()
    .mul(100)
    .round(2)
    .rename('% missing')
)

n_features = len(bi_mode_features)

# One stacked histogram per imputed feature
fig, axes = plt.subplots(n_features, 1, figsize=(8, 4*n_features))

for ax, feature in zip(axes, bi_mode_features):
    ax.hist(data[feature], bins=20, color='skyblue', edgecolor='black')
    ax.set_title(f'{feature} | % missing values: {missing_after_GaussianMixture[feature]}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Create a categorical 'Season' feature derived from 'Date'

def get_season(date):
    """Return the season name for a date-like object.

    The object must expose ``month`` and ``day``. Boundaries, inclusive:
    Summer  21 Dec - 21 Mar
    Autumn  22 Mar - 21 Jun
    Winter  22 Jun - 21 Sep
    Spring  22 Sep - 20 Dec

    Returns None when month/day do not compare (e.g. NaN components).
    """
    month_day = (date.month, date.day)

    # Summer wraps around the year end, so it needs both ends checked.
    if month_day >= (12, 21) or month_day <= (3, 21):
        return 'Summer'
    # From here each upper bound is enough: earlier seasons have already returned.
    if month_day <= (6, 21):
        return 'Autumn'
    if month_day <= (9, 21):
        return 'Winter'
    if month_day <= (12, 20):
        return 'Spring'
    return None

# Label every row with its season and store the result as a categorical column.
season_labels = data['Date'].map(get_season)
data['Season'] = season_labels.astype('category')

In [None]:
# Encode the Yes/No rain flags as 1/0.
binary_dict = {'No': 0, 'Yes':1}

for rain_col in ('RainToday', 'RainTomorrow'):
    # Skip columns that already hold only the encoded values: mapping 0/1 through
    # binary_dict again would turn every entry into NaN when the cell is re-run.
    already_encoded = data[rain_col].dropna().isin(list(binary_dict.values())).all()
    if not already_encoded:
        data[rain_col] = data[rain_col].map(binary_dict)