## EDA for seattle_weather (pulled from Kaggle)


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the Seattle weather CSV (path is relative to the working directory)
# and print its first five rows as a quick sanity check of the columns.
data_df = pd.read_csv("data/seattle-weather.csv")
print(data_df.head(n=5))

In [None]:
# Distinct labels present in the "weather" column
print(data_df.loc[:, "weather"].unique())

In [None]:
# Number of rows (one row per day) recorded for each weather label,
# ordered from most to least frequent.
weather_counts = data_df.loc[:, "weather"].value_counts(sort=True, ascending=False)
print(weather_counts)

In [None]:
# Numeric measurements analysed in this cell
numeric_cols = ["precipitation", "temp_max", "temp_min", "wind"]
numeric_df = data_df[numeric_cols]

# Summary statistics (count, mean, std, min/max, quartiles) per numeric column
print(numeric_df.describe())

# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
corr = numeric_df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Parse the "date" column into datetimes so the .dt accessor can be used
data_df["date"] = pd.to_datetime(data_df["date"])

# Derive calendar month and year columns from the parsed dates
dates = data_df["date"]
data_df["month"] = dates.dt.month
data_df["year"] = dates.dt.year

# Mean of each measurement per calendar month (all years pooled together)
monthly = data_df.groupby("month")[["temp_max", "temp_min", "precipitation"]].mean()

# One line chart per series, described as (column, title, y-axis label)
monthly_charts = [
    ("temp_max", "Average Max Temp by Month", "Temp (°C)"),
    ("precipitation", "Average Precipitation by Month", "Precip (mm)"),
]
for column, title, y_label in monthly_charts:
    fig, ax = plt.subplots()
    monthly[column].plot(ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Month")
    ax.set_ylabel(y_label)
    plt.show()


Next, we build a first-order Markov chain model: given only today's weather state, it predicts which of the 5 weather states tomorrow will take.

In [None]:
# 1. Enumerate states and build label <-> integer index lookups
states = data_df["weather"].unique()
state_to_idx = {state: pos for pos, state in enumerate(states)}
idx_to_state = dict(enumerate(states))

# 2. Build transition count matrix: counts[i, j] = number of times state i
#    on one row is followed by state j on the next row
n = len(states)
codes = data_df["weather"].map(state_to_idx).to_numpy(dtype=int)
counts = np.zeros((n, n), dtype=int)
# np.add.at accumulates repeated (i, j) pairs; plain fancy-index "+=" would not
np.add.at(counts, (codes[:-1], codes[1:]), 1)

# 3. Convert counts to probabilities (row-normalize).
#    Start every row as uniform, then overwrite the rows that have at least
#    one observed transition (a zero row should not happen with this data).
row_sums = counts.sum(axis=1, keepdims=True)
nonzero = row_sums[:, 0] != 0
probs = np.full(counts.shape, 1.0 / n)
probs[nonzero] = counts[nonzero] / row_sums[nonzero]

# Wrap in a DataFrame (rows = today's state, columns = tomorrow's state)
transition_df = pd.DataFrame(probs, index=states, columns=states)
print("Transition probability matrix:\n", transition_df)

# 5. Prediction functions
def predict_next_distribution(today_state, transitions=None):
    """Return a Series of P(tomorrow = s | today = today_state).

    Parameters
    ----------
    today_state : hashable
        Row label of the transition table (today's weather state).
    transitions : pandas.DataFrame, optional
        Transition table whose rows are today's state and whose columns are
        tomorrow's state. When omitted, the notebook-level ``transition_df``
        is looked up at call time.

    Returns
    -------
    pandas.Series
        The table row for ``today_state``, indexed by next-day state.

    Raises
    ------
    KeyError
        If ``today_state`` is not a row label of the table.
    """
    # Passing the table explicitly keeps the helper usable even if the global
    # name ``transition_df`` is later rebound to a different table.
    table = transition_df if transitions is None else transitions
    return table.loc[today_state]

def predict_most_likely(today_state, transitions=None):
    """Return the single most likely next-day weather.

    Parameters
    ----------
    today_state : hashable
        Row label of the transition table (today's weather state).
    transitions : pandas.DataFrame, optional
        Transition table whose rows are today's state and whose columns are
        tomorrow's state. When omitted, the notebook-level ``transition_df``
        is looked up at call time.

    Returns
    -------
    The column label with the highest probability in the row for
    ``today_state`` (the first such label if several tie).

    Raises
    ------
    KeyError
        If ``today_state`` is not a row label of the table.
    """
    # Passing the table explicitly keeps the helper usable even if the global
    # name ``transition_df`` is later rebound to a different table.
    table = transition_df if transitions is None else transitions
    return table.loc[today_state].idxmax()

# 6. Example usage: print the most likely successor of every observed state
for s in states:
    likely_next = predict_most_likely(s)
    print(f"If today is {s:7s}, tomorrow is most likely: {likely_next}")

## Model Validation

In [None]:
# Score the weather-only chain on every consecutive (today, tomorrow) pair.
# The transition table was estimated from these same rows, so this is an
# in-sample figure rather than a held-out one.
todays = data_df["weather"][:-1]
tomorrows = data_df["weather"][1:]

total = len(data_df) - 1   # number of transitions
correct = sum(
    predict_most_likely(current) == following
    for current, following in zip(todays, tomorrows)
)

accuracy = correct / total
print(f"Validation accuracy: {accuracy:.2%}  ({correct}/{total} correct)")


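Next step: build composite (weather, month) states and check whether conditioning on the month improves the model's accuracy. Note that both accuracy figures are measured on the same data the transition tables are fitted to, so the finer-grained model cannot score worse in-sample.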

In [None]:
# 1. Define pure weather states and composite (weather, month) states
def composite_key(weather, month):
    """Return the composite state label "<weather>_<MM>" (month zero-padded to two digits)."""
    return f"{weather}_{int(month):02d}"

weather_states = sorted(data_df["weather"].unique())

# Composite label of every row, in row order (reused below for the tally)
row_keys = [composite_key(w, m) for w, m in zip(data_df["weather"], data_df["month"])]
composite_states = sorted(set(row_keys))

w2i = {w: i for i, w in enumerate(weather_states)}
c2i = {c: i for i, c in enumerate(composite_states)}

# 2. Build count matrix of shape (n_composite x n_weather)
n_c, n_w = len(composite_states), len(weather_states)
counts = np.zeros((n_c, n_w), dtype=int)

# 3. Tally transitions: (weather_t, month_t) -> weather_{t+1}
#    The last row has no successor, so its key is dropped from the "today" side.
for comp, w_t1 in zip(row_keys[:-1], data_df["weather"][1:]):
    counts[c2i[comp], w2i[w_t1]] += 1

# 4. Normalize counts to probabilities (row-wise)
probs = np.zeros_like(counts, dtype=float)
row_sums = counts.sum(axis=1, keepdims=True)
nonzero = (row_sums[:, 0] != 0)
probs[nonzero] = counts[nonzero] / row_sums[nonzero]
probs[~nonzero] = 1.0 / n_w  # composite state with no observed successor -> uniform

# 5. Put into a DataFrame.
#    Stored under its own name: the weather-only helpers defined earlier
#    (predict_next_distribution / predict_most_likely) read the global
#    `transition_df`, so rebinding that name here would break them.
composite_transition_df = pd.DataFrame(
    probs,
    index=composite_states,
    columns=weather_states
)

print("P(next_weather | today_weather, today_month):")
print(composite_transition_df)

# 6. Prediction helpers - use both weather and month
def predict_next_dist(today_weather, today_month):
    """Return a Series of P(tomorrow = s | today_weather, today_month).

    Raises KeyError if the (weather, month) combination is not a row of the
    composite transition table.
    """
    return composite_transition_df.loc[composite_key(today_weather, today_month)]

def predict_next_most_likely(today_weather, today_month):
    """Return the most likely next-day weather given today's weather and month."""
    return predict_next_dist(today_weather, today_month).idxmax()

# 7. Example: what's most likely tomorrow if today is rain in January?
print("\nExample:")
print("  Today = rain, month = 01 → tomorrow most likely:",
      predict_next_most_likely("rain", 1))


In [None]:
# Validation for composite (weather, month) Markov model.
# Rows are paired by position (row k with row k+1), so the result does not
# depend on data_df having a default 0..n-1 index. The model was fitted on
# these same rows, so this is an in-sample score.
total = len(data_df) - 1   # number of transitions

weather_seq = data_df["weather"]
month_seq = data_df["month"]

correct = sum(
    # predict using composite-state function, compare with the actual next day
    predict_next_most_likely(today_weather, today_month) == actual_next
    for today_weather, today_month, actual_next in zip(
        weather_seq.iloc[:-1], month_seq.iloc[:-1], weather_seq.iloc[1:]
    )
)

accuracy = correct / total
print(f"Validation accuracy: {accuracy:.2%}  ({correct}/{total} correct)")
