In [2]:

import pandas as pd
import numpy as np
import random

# --- Reproducibility ---------------------------------------------------------
# Seed both RNGs used below (stdlib `random` drives the Markov chain, NumPy
# drives the per-day draws) so Restart & Run All reproduces the same output.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# --- Load the observed weather series ----------------------------------------
df = pd.read_csv("dehradun_weather_processed.csv")
condition = df["condition_text"].tolist()
if not condition:
    # Fail early with a clear message instead of an IndexError from
    # random.choice() on an empty state list further down.
    raise ValueError("dehradun_weather_processed.csv has no rows to fit a Markov chain on")

# Sort the distinct labels: iteration order of a set of strings changes between
# interpreter runs (hash randomisation), which would make both the printed
# matrix order and every seeded draw non-reproducible. key=str keeps the sort
# from raising if a non-string label (e.g. a missing value) is present.
states = sorted(set(condition), key=str)

# --- Fit first-order transition probabilities --------------------------------
# transition_counts[a][b] = number of times label b directly follows label a.
transition_counts = {s: {s2: 0 for s2 in states} for s in states}
for today, tomorrow in zip(condition[:-1], condition[1:]):
    transition_counts[today][tomorrow] += 1

# Row-normalise the counts. A label that never appears as `today` (it only
# occurs as the very last observation) keeps an all-zero row; the simulation
# loop below handles that case explicitly.
transition_matrix = {}
for s in states:
    total = sum(transition_counts[s].values())
    transition_matrix[s] = {s2: transition_counts[s][s2]/total if total>0 else 0 for s2 in states}

print("\nSample Markov Chain Transition Matrix (first 5 states):")
for s in list(transition_matrix.keys())[:5]:
    print(f"{s}: {transition_matrix[s]}")

# --- Parameters of the per-day random draws ----------------------------------
mean_temp = np.mean(df["temperature_celsius"])
std_temp = np.std(df["temperature_celsius"])
mean_humidity = np.mean(df["humidity"])
std_humidity = np.std(df["humidity"])
# NOTE(review): the Poisson rate for the "rain" draw is the mean of wind_kph,
# floored at 0.5. That looks like a placeholder proxy rather than a rainfall
# measurement -- confirm, and switch to a precipitation column if the dataset
# provides one.
lam_rain_events = max(0.5, np.mean(df["wind_kph"]))
storm_prob = 0.1

# --- Simulate --------------------------------------------------------------
num_days = 30
weather_data = []
current_state = random.choice(states)

for day in range(num_days):
    # Temperature, humidity, rain and thunder are drawn independently of the
    # Markov state; only "condition" follows the fitted chain.
    temp = np.random.normal(mean_temp, std_temp)
    hum = np.random.normal(mean_humidity, std_humidity)
    rain = np.random.poisson(lam_rain_events)
    thunder = np.random.binomial(1, storm_prob)

    weather_data.append({
        "day": day+1,
        "condition": current_state,
        "temperature": round(temp,1),
        "humidity": round(hum,1),
        "rain": rain,
        "thunder": bool(thunder)
    })

    next_states = list(transition_matrix[current_state].keys())
    probs = list(transition_matrix[current_state].values())
    if sum(probs) > 0:
        current_state = random.choices(next_states, weights=probs, k=1)[0]
    else:
        # No observed successor for this state: random.choices() would raise
        # ValueError on all-zero weights, so restart from a uniformly chosen
        # state instead.
        current_state = random.choice(states)

sim_df = pd.DataFrame(weather_data)
print("\nSimulated Weather Data (first 10 days):")
print(sim_df.head(10))


Sample Markov Chain Transition Matrix (first 5 states):
Clear: {'Clear': 0.8181818181818182, 'Partially cloudy': 0.18181818181818182}
Partially cloudy: {'Clear': 0.6666666666666666, 'Partially cloudy': 0.3333333333333333}

Simulated Weather Data (first 10 days):
   day         condition  temperature  humidity  rain  thunder
0    1             Clear         21.1      55.4     7    False
1    2             Clear         20.4      51.4     3    False
2    3  Partially cloudy         20.9      51.3     3    False
3    4             Clear         19.9      55.6     6    False
4    5             Clear         20.3      61.2     5    False
5    6             Clear         20.3      51.3     2    False
6    7             Clear         20.1      59.7     4    False
7    8  Partially cloudy         20.9      66.5     5    False
8    9  Partially cloudy         19.6      54.4     5     True
9   10  Partially cloudy         21.0      55.9     4    False


In [3]:

# Summarise the simulated rain/thunder columns of sim_df.

# Boolean mask of days with at least one rain event; reused for every statistic.
rainy_mask = sim_df['rain'] > 0

total_rain_days = rainy_mask.sum()
total_rainfall = sim_df['rain'].sum()

# Mean rain value over rainy days only (0 when there were none).
avg_rain_rainy_days = sim_df.loc[rainy_mask, 'rain'].mean() if total_rain_days > 0 else 0

# Share of simulated days that had rain.
rain_probability = total_rain_days / len(sim_df)

# Longest run of consecutive rainy days: extend the running count on a rainy
# day, reset it on a dry one, and remember the largest value seen.
longest_streak = 0
current_streak = 0
for is_rainy in rainy_mask:
    current_streak = current_streak + 1 if is_rainy else 0
    if current_streak > longest_streak:
        longest_streak = current_streak

thunder_days = sim_df['thunder'].sum()

summary_lines = [
    "🌧 Rain Event Summary:",
    f"Total Rain Days       : {total_rain_days}",
    f"Total Rainfall        : {total_rainfall}",
    f"Average Rain (Rainy Days) : {avg_rain_rainy_days:.2f}",
    f"Rain Probability      : {rain_probability:.2%}",
    f"Longest Rainy Streak  : {longest_streak} days",
    f"Thunderstorm Days     : {thunder_days}",
]
print("\n".join(summary_lines))


🌧 Rain Event Summary:
Total Rain Days       : 29
Total Rainfall        : 129
Average Rain (Rainy Days) : 4.45
Rain Probability      : 96.67%
Longest Rainy Streak  : 17 days
Thunderstorm Days     : 3
