In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
sys.path.append('..')
from Notebooks_and_Scripts.utils import convert_units, TARGET_COLS, KELVIN_THRESHOLD

In [None]:
# Output folder that every figure in this notebook is written to
PLOTS_DIR = "plots"
os.makedirs(PLOTS_DIR, exist_ok=True)

# Global look and feel: base style first, then the colour palette,
# then the rcParams overrides (order matters: later calls win)
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'savefig.dpi': 300,       # higher resolution for saved figures
    'savefig.bbox': 'tight',  # trim surrounding whitespace on save
})

# Seed NumPy's global RNG for reproducibility
np.random.seed(42)

In [None]:
# Load and Examine Data

print("Loading data...")

# Read the three CSV files from the data directory
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
sample_submission = pd.read_csv("../data/sample_submission.csv")

# Report (rows, columns) for each frame
for label, frame in (
    ("Train", train),
    ("Test", test),
    ("Sample submission", sample_submission),
):
    print(f"{label} shape: {frame.shape}")

# Peek at the first rows of the training set
print("\nSample data from training set:")
print(train.head())

# Per-column null counts for train and test
for label, frame in (("train", train), ("test", test)):
    print(f"\nMissing values in {label}:")
    print(frame.isnull().sum())

# Convert temperature units if needed (delegated to the project helper).
# NOTE(review): only `train` is passed through convert_units here; confirm
# that `test` does not need the same conversion before it is used.
train = convert_units(train)

# Distinct kingdoms present in the training data
kingdoms = train['kingdom'].unique()
print(f"\nNumber of unique kingdoms: {len(kingdoms)}")
print(f"Kingdoms: {kingdoms}")

In [None]:
# Explore Target Variables
print(train[TARGET_COLS].describe())

# Size both subplot grids from the number of targets instead of hard-coding
# 3x2 / 5x1, so the cell keeps working if TARGET_COLS grows or shrinks.
n_targets = len(TARGET_COLS)
hist_cols = 2
hist_rows = (n_targets + hist_cols - 1) // hist_cols  # ceil(n_targets / hist_cols)

# Visualize distributions of target variables (histogram + KDE per target)
plt.figure(figsize=(18, 12))

for i, col in enumerate(TARGET_COLS):
    plt.subplot(hist_rows, hist_cols, i+1)
    sns.histplot(train[col], kde=True, color='steelblue')
    plt.title(f'Distribution of {col}', fontsize=14)
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "target_distributions.png"))
plt.close()  # release the figure; only the saved PNG is kept

print(f"Saved: {os.path.join(PLOTS_DIR, 'target_distributions.png')}")

# Create date column for time series analysis by joining Year/Month/Day
# into a 'Y-MM-DD' string (month and day are zero-padded to two digits)
train['date'] = pd.to_datetime(
    train['Year'].astype(str) + '-' + 
    train['Month'].astype(str).str.zfill(2) + '-' + 
    train['Day'].astype(str).str.zfill(2)
)

# Group by date for time series visualization: one row per date holding the
# mean of every target across all rows sharing that date
train_daily = train.groupby('date')[TARGET_COLS].mean().reset_index()

# Plot time series for each target variable, one stacked panel per target
plt.figure(figsize=(20, 18))
for i, col in enumerate(TARGET_COLS):
    plt.subplot(n_targets, 1, i+1)
    plt.plot(train_daily['date'], train_daily[col], linewidth=1.5, color='darkblue')
    plt.title(f'Time Series of {col}', fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "time_series_targets.png"))
plt.close()
print(f"Saved: {os.path.join(PLOTS_DIR, 'time_series_targets.png')}")

In [None]:
# Spatial Analysis

plt.figure(figsize=(14, 12))

# One colour per kingdom; `kingdoms` comes from the data-loading cell
palette = sns.color_palette("husl", len(kingdoms))
sns.scatterplot(
    # one marker per distinct (kingdom, latitude, longitude) combination
    data=train.drop_duplicates(subset=['kingdom', 'latitude', 'longitude']), 
    x='longitude', 
    y='latitude', 
    hue='kingdom', 
    s=100,
    palette=palette,
    alpha=0.8
)

plt.title('Spatial Distribution of Kingdoms', fontsize=16)
plt.grid(True, alpha=0.3)
# Place the legend outside the axes so it does not cover the points
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "kingdom_spatial_distribution.png"))
plt.close()

print(f"Saved: {os.path.join(PLOTS_DIR, 'kingdom_spatial_distribution.png')}")

# Calculate average target variables by kingdom
kingdom_stats = train.groupby('kingdom')[TARGET_COLS].mean().reset_index()

print("\nTarget variables average by kingdom:")
print(kingdom_stats)

# Visualize average target variables by kingdom, one panel per target.
# squeeze=False always yields a 2-D array of Axes, so indexing also works
# when TARGET_COLS has a single entry (plt.subplots would otherwise return
# a bare Axes object and `axes[i]` would fail).
fig, axes = plt.subplots(
    len(TARGET_COLS), 1, figsize=(12, 5*len(TARGET_COLS)), squeeze=False
)
axes = axes.ravel()

for ax, col in zip(axes, TARGET_COLS):
    # Sort so bars appear in ascending order of the current target
    kingdom_stats_sorted = kingdom_stats.sort_values(by=col)
    # NOTE(review): seaborn >= 0.13 emits a FutureWarning for `palette`
    # without `hue`; confirm the installed version before migrating.
    barplot = sns.barplot(x=col, y='kingdom', data=kingdom_stats_sorted, ax=ax, palette='viridis')
    ax.set_title(f'Average {col} by Kingdom', fontsize=14)
    ax.grid(True, alpha=0.3)

    # Annotate each bar with its value just past the bar's end
    for p in barplot.patches:
        width = p.get_width()
        ax.text(width + 0.1, p.get_y() + p.get_height()/2, 
                f'{width:.1f}', ha='left', va='center')

plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "kingdom_target_averages.png"))
plt.close()
print(f"Saved: {os.path.join(PLOTS_DIR, 'kingdom_target_averages.png')}")


In [None]:
# Temporal Analysis

# Derive month and season from the 'date' column built in the target
# exploration cell. The season formula maps Dec-Feb -> 1, Mar-May -> 2,
# Jun-Aug -> 3, Sep-Nov -> 4.
train['month'] = train['date'].dt.month
train['season'] = (train['month'] % 12 + 3) // 3
season_map = {1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Fall'}
train['season_name'] = train['season'].map(season_map)

# Size the subplot grids from the number of targets instead of a fixed 3x2,
# so no target is silently skipped and the monthly figure cannot overflow.
n_targets = len(TARGET_COLS)
n_cols = 2
n_rows = max(1, (n_targets + n_cols - 1) // n_cols)  # ceil, at least one row

# Plot seasonal patterns: one boxplot panel per target
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 16))
axes = axes.flatten()
season_colors = {'Winter': 'skyblue', 'Spring': 'yellowgreen', 
                 'Summer': 'orange', 'Fall': 'brown'}

for ax, col in zip(axes, TARGET_COLS):
    # NOTE(review): seaborn >= 0.13 emits a FutureWarning for `palette`
    # without `hue`; confirm the installed version before migrating.
    sns.boxplot(
        data=train, 
        x='season_name', 
        y=col, 
        order=['Winter', 'Spring', 'Summer', 'Fall'],
        ax=ax,
        palette=season_colors
    )
    ax.set_title(f'Seasonal patterns of {col}', fontsize=14)
    ax.set_xlabel('Season', fontsize=12)
    ax.set_ylabel(col, fontsize=12)
    ax.grid(True, alpha=0.3)

# Hide leftover panels (e.g. the sixth slot when there are five targets)
for ax in axes[n_targets:]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "seasonal_patterns.png"))
plt.close()

print(f"Saved: {os.path.join(PLOTS_DIR, 'seasonal_patterns.png')}")

# Monthly analysis: mean of each target per calendar month
monthly_means = train.groupby('month')[TARGET_COLS].mean()

plt.figure(figsize=(18, 16))
for i, col in enumerate(TARGET_COLS):
    plt.subplot(n_rows, n_cols, i+1)
    monthly_means[col].plot(marker='o', linestyle='-', linewidth=2, markersize=8, color='forestgreen')
    plt.title(f'Monthly Average of {col}', fontsize=14)
    plt.xlabel('Month', fontsize=12)
    plt.ylabel(col, fontsize=12)
    plt.xticks(range(1, 13))  # always show all twelve months on the axis
    plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_DIR, "monthly_averages.png"))
plt.close()

print(f"Saved: {os.path.join(PLOTS_DIR, 'monthly_averages.png')}")

In [None]:
# Correlation Analysis

# Targets plus a handful of related weather features
extra_features = [
    'Temperature_Range',
    'Feels_Like_Temperature_Range',
    'Rain_Duration',
    'Evapotranspiration',
    'Avg_Feels_Like_Temperature',
]
features_for_corr = TARGET_COLS + extra_features
corr = train[features_for_corr].corr()

# Two renderings of the same matrix: the full square, and the lower triangle
# only (a boolean upper-triangle mask hides the redundant half and diagonal).
heatmap_specs = [
    ('Correlation Matrix (Full)', "correlation_matrix_full.png", None),
    (
        'Correlation Matrix (Lower Triangle)',
        "correlation_matrix_lower.png",
        np.triu(np.ones_like(corr, dtype=bool)),
    ),
]

for title, filename, mask in heatmap_specs:
    plt.figure(figsize=(16, 14))
    cmap = sns.diverging_palette(230, 20, as_cmap=True)
    sns.heatmap(
        corr,
        mask=mask,  # None draws every cell; the triu mask hides the upper triangle
        cmap=cmap,
        vmax=1,
        vmin=-1,
        center=0,
        square=True,
        linewidths=.5,
        annot=True,
        fmt='.2f',
        annot_kws={"size": 10},
    )
    plt.title(title, fontsize=18)
    plt.tight_layout()
    plt.savefig(os.path.join(PLOTS_DIR, filename))
    plt.close()

print(f"Saved correlation matrices to {PLOTS_DIR}")