In [1]:
# Install required packages if not already installed
# !pip install kaggle pandas numpy matplotlib seaborn scikit-learn

# Import necessary libraries
import pandas as pd
import zipfile
from sklearn.preprocessing import MinMaxScaler

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple, Optional
import os
from sklearn.preprocessing import LabelEncoder



In [2]:
# Download the weather dataset from Kaggle
import kaggle

def download_weather_dataset() -> None:
    """
    Download the weather dataset from Kaggle using the API.
    
    Requires kaggle API credentials to be properly set up.
    """
    try:
        # Download the dataset
        kaggle.api.dataset_download_files(
            'jsphyg/weather-dataset-rattle-package',
            path='./data',
            unzip=False
        )
        print("Dataset downloaded successfully!")
    except Exception as e:
        print(f"Error downloading dataset: {e}")
        print("Make sure your Kaggle API credentials are set up correctly.")

# Create data directory if it doesn't exist
os.makedirs('./data', exist_ok=True)

# Download the dataset
download_weather_dataset()


Dataset URL: https://www.kaggle.com/datasets/jsphyg/weather-dataset-rattle-package
Dataset downloaded successfully!


In [3]:
# Unzip the downloaded dataset

def unzip_dataset() -> None:
    """
    Unzip the downloaded weather dataset.
    """
    zip_path = './data/weather-dataset-rattle-package.zip'
    extract_path = './data/'
    
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        print("Dataset unzipped successfully!")
        
        # List extracted files
        print("\nExtracted files:")
        for file in os.listdir(extract_path):
            if file.endswith('.csv'):
                print(f"  - {file}")
                
    except FileNotFoundError:
        print(f"Error: {zip_path} not found. Make sure the dataset was downloaded successfully.")
    except Exception as e:
        print(f"Error unzipping dataset: {e}")

# Unzip the dataset
unzip_dataset()


Dataset unzipped successfully!

Extracted files:
  - weatherAUS.csv


In [4]:
# Load the weather dataset
df = pd.read_csv('./data/weatherAUS.csv')

# Display basic information about the dataset
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst 5 rows:")
df.head()

Dataset loaded successfully!
Shape: (145460, 23)
Columns: ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow']

First 5 rows:


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [5]:
# Debug: Check what values are actually in RainToday and RainTomorrow
print("Debugging the binary encoding issue:")
print(f"RainToday unique values: {df['RainToday'].unique()}")
print(f"RainTomorrow unique values: {df['RainTomorrow'].unique()}")
print(f"RainToday data type: {df['RainToday'].dtype}")
print(f"RainTomorrow data type: {df['RainTomorrow'].dtype}")

# Check for any null values
print(f"RainToday null count: {df['RainToday'].isnull().sum()}")
print(f"RainTomorrow null count: {df['RainTomorrow'].isnull().sum()}")

# Sample of actual values
print(f"First 10 RainToday values: {df['RainToday'].head(10).tolist()}")
print(f"First 10 RainTomorrow values: {df['RainTomorrow'].head(10).tolist()}")


Debugging the binary encoding issue:
RainToday unique values: ['No' 'Yes' nan]
RainTomorrow unique values: ['No' 'Yes' nan]
RainToday data type: object
RainTomorrow data type: object
RainToday null count: 3261
RainTomorrow null count: 3267
First 10 RainToday values: ['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes']
First 10 RainTomorrow values: ['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No']


## Preprocessing Step

1. We will extract month, day_of_year, season, and weekday from the [Date] column
2. Appy One-Hot Encoding to columns [Location, WindGustDir, WindDir9am, WindDir3pm] and Encoding to [RainToday, RainTomorrow]
3. Apply MinMax scaling to [MinTemp, MaxTemp, Temp9am, Temp3pm, Rainfall, Evaporation, Sunshine, WindGustSpeed, WindSpeed9am, WindSPeed3pm, Pressure9am, Pressure3pm]
4. Humidity9am, Humidity3pm: Already 0-100%, divide by 100 |AND|
Cloud9am, Cloud3pm: Already 0-8 scale, divide by 8


## 1. Preprocess Data Column

In [6]:
# Extract date features from the Date column
def extract_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Extract useful features from the Date column.
    
    Args:
        df: DataFrame with a 'Date' column
        
    Returns:
        DataFrame with new date features and original Date column removed
    """
    # Make a copy to avoid modifying original
    df_processed = df.copy()
    
    # Convert Date to datetime if it's not already
    df_processed['Date'] = pd.to_datetime(df_processed['Date'])
    
    # Extract month (1-12)
    df_processed['month'] = df_processed['Date'].dt.month
    
    # Extract weekday (0=Monday, 6=Sunday)
    df_processed['weekday'] = df_processed['Date'].dt.dayofweek
    
    # Extract season (0=Spring, 1=Summer, 2=Autumn, 3=Winter)
    def get_season(month: int) -> int:
        if month in [12, 1, 2]:
            return 1  # Summer
        elif month in [3, 4, 5]:
            return 2  # Autumn
        elif month in [6, 7, 8]:
            return 3  # Winter
        else:  # month in [9, 10, 11]
            return 0  # Spring
    
    df_processed['season'] = df_processed['month'].apply(get_season)
    
    # Drop the original Date column
    df_processed = df_processed.drop('Date', axis=1)
    
    print("Date features extracted:")
    print("- month: 1-12")
    print("- weekday: 0=Monday, 6=Sunday")
    print("- season: 0=Spring, 1=Summer, 2=Autumn, 3=Winter (Southern Hemisphere)")
    
    return df_processed

# Apply date feature extraction
df = extract_date_features(df)

# Display the new columns
print(f"\nNew shape: {df.shape}")
print(f"New columns: {list(df.columns[:10])}")  # Show first 10 columns
print("\nSample of extracted date features:")
print(df[['month', 'weekday', 'season']].head())
df.head()


Date features extracted:
- month: 1-12
- weekday: 0=Monday, 6=Sunday
- season: 0=Spring, 1=Summer, 2=Autumn, 3=Winter (Southern Hemisphere)

New shape: (145460, 25)
New columns: ['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm']

Sample of extracted date features:
   month  weekday  season
0     12        0       1
1     12        1       1
2     12        2       1
3     12        3       1
4     12        4       1


Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,month,weekday,season
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,...,1007.1,8.0,,16.9,21.8,No,No,12,0,1
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,1007.8,,,17.2,24.3,No,No,12,1,1
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,1008.7,,2.0,21.0,23.2,No,No,12,2,1
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,...,1012.8,,,18.1,26.5,No,No,12,3,1
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,1006.0,7.0,8.0,17.8,29.7,No,No,12,4,1


In [7]:
# 2. Cyclical encoding for temporal features
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

# One-hot encoding for season
season_dummies = pd.get_dummies(df['season'], prefix='season')
df = pd.concat([df, season_dummies], axis=1)

# Drop original temporal columns
df = df.drop(['month', 'weekday', 'season'], axis=1)
df.head()


Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Temp9am,Temp3pm,RainToday,RainTomorrow,month_sin,month_cos,season_0,season_1,season_2,season_3
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,...,16.9,21.8,No,No,-2.449294e-16,1.0,False,True,False,False
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,17.2,24.3,No,No,-2.449294e-16,1.0,False,True,False,False
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,21.0,23.2,No,No,-2.449294e-16,1.0,False,True,False,False
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,...,18.1,26.5,No,No,-2.449294e-16,1.0,False,True,False,False
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,17.8,29.7,No,No,-2.449294e-16,1.0,False,True,False,False


## 2. One-Hot Encoding Step


In [8]:
# One-hot encoding for categorical columns and binary encoding for Yes/No columns
# One-hot encoding for categorical columns
categorical_cols = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
for col in categorical_cols:
    dummies = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df, dummies], axis=1)

# Binary encoding for Yes/No columns
df['RainToday'] = df['RainToday'].map({'No': 0, 'Yes': 1})
df['RainTomorrow'] = df['RainTomorrow'].map({'No': 0, 'Yes': 1})

# Drop original categorical columns
df = df.drop(categorical_cols, axis=1)

print(f"Shape after one-hot encoding: {df.shape}")
print(f"Sample of encoded columns:")
print(df[['RainToday', 'RainTomorrow']].head())

Shape after one-hot encoding: (145460, 121)
Sample of encoded columns:
   RainToday  RainTomorrow
0        0.0           0.0
1        0.0           0.0
2        0.0           0.0
3        0.0           0.0
4        0.0           0.0


## 3. MinMax Scaling Step


In [9]:
# MinMax scaling for numerical columns
# Define columns for MinMax scaling
minmax_cols = ['MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm', 'Rainfall', 
               'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 
               'WindSpeed3pm', 'Pressure9am', 'Pressure3pm']

# Apply MinMax scaling
scaler = MinMaxScaler()
df[minmax_cols] = scaler.fit_transform(df[minmax_cols])

print(f"MinMax scaling applied to {len(minmax_cols)} columns")
print(f"Sample of scaled columns:")
print(df[minmax_cols[:4]].head())


MinMax scaling applied to 12 columns
Sample of scaled columns:
    MinTemp   MaxTemp   Temp9am   Temp3pm
0  0.516509  0.523629  0.508439  0.522073
1  0.375000  0.565217  0.514768  0.570058
2  0.504717  0.576560  0.594937  0.548944
3  0.417453  0.620038  0.533755  0.612284
4  0.613208  0.701323  0.527426  0.673704


## 4. Simple Division Scaling Step


In [10]:
# Simple division scaling for already scaled columns
# Humidity columns: 0-100% → divide by 100
humidity_cols = ['Humidity9am', 'Humidity3pm']
df[humidity_cols] = df[humidity_cols] / 100

# Cloud columns: 0-8 scale → divide by 8
cloud_cols = ['Cloud9am', 'Cloud3pm']
df[cloud_cols] = df[cloud_cols] / 8

print(f"Division scaling applied")
print(f"Humidity range: {df[humidity_cols].min().min():.2f} - {df[humidity_cols].max().max():.2f}")
print(f"Cloud range: {df[cloud_cols].min().min():.2f} - {df[cloud_cols].max().max():.2f}")
print(f"\nFinal shape: {df.shape}")


Division scaling applied
Humidity range: 0.00 - 1.00
Cloud range: 0.00 - 1.12

Final shape: (145460, 121)


In [11]:
df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,0.516509,0.523629,0.001617,,,0.294574,0.153846,0.275862,0.71,0.22,...,False,False,False,False,False,False,False,False,True,False
1,0.375,0.565217,0.0,,,0.294574,0.030769,0.252874,0.44,0.25,...,False,False,False,False,False,False,False,False,False,True
2,0.504717,0.57656,0.0,,,0.310078,0.146154,0.298851,0.38,0.3,...,False,False,False,False,False,False,False,False,False,True
3,0.417453,0.620038,0.0,,,0.139535,0.084615,0.103448,0.45,0.16,...,False,False,False,False,False,False,False,False,False,False
4,0.613208,0.701323,0.002695,,,0.271318,0.053846,0.229885,0.82,0.33,...,False,True,False,False,False,False,False,False,False,False
