In [29]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Load the processed weather dataset used by the analysis cells below.
data = pd.read_csv('../data/processed/Weather_processed.csv')

# Make sure the figure output directory exists before any plot is saved.
# exist_ok=True creates the directory (and missing parents) when absent and
# is a no-op when it is already there, which avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs("../results/figures", exist_ok=True)

In [30]:
# Basic statistics
# Summary table (count, mean, std, min, quartiles, max) for the columns of
# `data`; as the cell's last expression the result is rendered as output.
# NOTE(review): the recorded output shows a max of 109 for humidity and
# precipitation_(%) -- presumably percentages, so values above 100 look like
# outliers; confirm whether upstream processing is expected to keep them.
data.describe()

Unnamed: 0,temperature,humidity,wind_speed,precipitation_(%),atmospheric_pressure,uv_index,visibility_(km)
count,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0
mean,19.127576,68.710833,9.832197,53.644394,1005.827896,4.005758,5.462917
std,17.386327,20.194248,6.908704,31.946541,37.199589,3.8566,3.371499
min,-25.0,20.0,0.0,0.0,800.12,0.0,0.0
25%,4.0,57.0,5.0,19.0,994.8,1.0,3.0
50%,21.0,70.0,9.0,58.0,1007.65,3.0,5.0
75%,31.0,84.0,13.5,82.0,1016.7725,7.0,7.5
max,109.0,109.0,48.5,109.0,1199.21,14.0,20.0


In [31]:
# Select numeric columns for the correlation matrix and distribution plots
# include="number" matches every numeric dtype (e.g. int32/float32 or the
# nullable Int64/Float64), not only the float64/int64 defaults, so a numeric
# column is not silently dropped if the column dtypes change upstream.
numeric_cols = data.select_dtypes(include="number").columns

In [32]:
# Correlation Matrix
# Pairwise correlations of the numeric columns, drawn as an annotated
# heatmap and written to disk as a PNG.
print("Generating Correlation Matrix...")
corr_matrix = data[numeric_cols].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,  # write each coefficient inside its cell
    fmt=".2f",  # two decimal places
    cmap="coolwarm",
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Matrix")
heatmap_fig.savefig("../results/figures/correlation_matrix.png")
# Release the figure once it has been saved.
plt.close(heatmap_fig)

print("Correlation Matrix saved in results/figures folder")

Generating Correlation Matrix...
Correlation Matrix saved in results/figures folder


In [33]:
# Distribution Plots for Numerical Variables
# One histogram (with a KDE overlay) per numeric column, all placed in a
# single multi-panel figure that is saved as a PNG.
print("Generating Distribution Plots...")

# Size the grid from the number of columns instead of hard-coding 3x3, which
# raises ValueError as soon as there are more than nine numeric columns.
n_grid_cols = 3
# Ceiling division without importing math; at least one row so the figure
# height never collapses to zero when there are no numeric columns.
n_grid_rows = max(1, (len(numeric_cols) + n_grid_cols - 1) // n_grid_cols)

# Three rows keep the original 12x10 inch canvas; other row counts scale the
# height proportionally so every panel keeps the same size.
fig = plt.figure(figsize=(12, 10 * n_grid_rows / 3))
for i, col in enumerate(numeric_cols, 1):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, i)
    sns.histplot(data[col], kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
fig.tight_layout()
fig.savefig("../results/figures/distribution_plots.png")
# Release the figure once it has been saved.
plt.close(fig)

print("Distribution Plots saved in results/figures folder")

Generating Distribution Plots...
Distribution Plots saved in results/figures folder


In [34]:

# Categorical Plots
# One count plot per categorical column, bars split by weather_type, stacked
# vertically in a single figure that is saved as a PNG.
print("Generating Categorical Plots...")
categorical_cols = ["season", "location", "weather_type"]
fig, axes = plt.subplots(len(categorical_cols), 1, figsize=(14, 12))
for ax, col in zip(axes, categorical_cols):
    sns.countplot(data=data, x=col, hue="weather_type", palette="Set2", ax=ax)
    ax.set_title(f"Countplot of {col} by weather_type")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")

    # Rebuild the legend with a title, but only when the axes reports
    # labelled handles; otherwise leave the legend untouched.
    handles, labels = ax.get_legend_handles_labels()
    if labels:
        ax.legend(
            handles=handles, labels=labels, title="Weather Type", loc="upper right"
        )
fig.tight_layout()
fig.savefig("../results/figures/categorical_plots.png")
# Release the figure once it has been saved.
plt.close(fig)

print("Categorical Plots saved in results/figures folder")

Generating Categorical Plots...
Categorical Plots saved in results/figures folder


In [35]:
# Box Plots
# Temperature per season, split by weather_type, saved as a PNG.
print("Generating Box Plots...")
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(
    data=data,
    x="season",
    y="temperature",
    hue="weather_type",
    palette="Set3",
    ax=ax,
)
ax.set_title("Boxplot of Temperature across Seasons by weather_type")
ax.set_xlabel("Season")
ax.set_ylabel("Temperature")
# Calling legend() again to reposition it builds a new legend and discards
# the title seaborn attached, so the title is set explicitly here (same
# wording as the legend of the categorical count plots).
ax.legend(title="Weather Type", loc="upper right")
fig.savefig("../results/figures/boxplot_season_temperature.png")
# Release the figure once it has been saved.
plt.close(fig)

print("Box Plots saved in results/figures folder")

Generating Box Plots...
Box Plots saved in results/figures folder


In [36]:
# Final cell: print a completion message once the notebook has run to the end.
print("EDA completed.")

EDA completed.
