In [226]:

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Read CSV
df = pd.read_csv("../data/weatherHistory.csv")

# Strip column names to remove extra spaces
df.columns = df.columns.str.strip()

# Create thunderstorm column (synthetic)
df['thunderstorm'] = np.where(
    (df['Humidity'] > 0.85) & 
    (df['Wind Speed (km/h)'] > 25) & 
    (df['Pressure (millibars)'] < 1005),
    1, 0
)

# Drop unnecessary columns
df = df.drop(columns=['Summary', 'Daily Summary', 'Loud Cover'], errors='ignore')

# Fill missing values in 'Precip Type' with mode
if 'Precip Type' in df.columns:
    df['Precip Type'].fillna(df['Precip Type'].mode()[0], inplace=True)
    # One-hot encode
    df = pd.get_dummies(df, columns=['Precip Type'], drop_first=True)

# Display basic info
print(df.columns.tolist())
print(df.info())

# Quick check of thunderstorm counts
print(df['thunderstorm'].value_counts())



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





['Formatted Date', 'Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)', 'Wind Bearing (degrees)', 'Visibility (km)', 'Pressure (millibars)', 'thunderstorm', 'Precip Type_snow']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Formatted Date            96453 non-null  object 
 1   Temperature (C)           96453 non-null  float64
 2   Apparent Temperature (C)  96453 non-null  float64
 3   Humidity                  96453 non-null  float64
 4   Wind Speed (km/h)         96453 non-null  float64
 5   Wind Bearing (degrees)    96453 non-null  float64
 6   Visibility (km)           96453 non-null  float64
 7   Pressure (millibars)      96453 non-null  float64
 8   thunderstorm              96453 non-null  int64  
 9   Precip Type_snow          96453 non-null  bool   
dtypes: bool(1), float64(7)

In [227]:
 (df['thunderstorm'] == 1).sum()

np.int64(248)

In [228]:
# Make sure year column exists
df['year'] = df['Formatted Date'].dt.year

# Train: 2006-2012 (inclusive)
train_df = df[(df['year'] >= 2006) & (df['year'] <= 2012)]

# Validation: 2012-2014
val_df = df[(df['year'] > 2012) & (df['year'] <= 2014)]

# Test: 2014-2016
test_df = df[(df['year'] > 2014) & (df['year'] <= 2016)]

# Quick checks
print("Train:", train_df.shape)
print("Validation:", val_df.shape)
print("Test:", test_df.shape)


AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
# Strip column names to avoid hidden spaces
df.columns = df.columns.str.strip()

# Check the exact column name
print(df.columns)

# Convert 'Formatted Date' to datetime
df['Formatted Date'] = pd.to_datetime(df['Formatted Date'], errors='coerce')

# Optional: check if any rows failed conversion
print(df['Formatted Date'].isna().sum(), "rows could not be converted")

# Now extract features
df['year'] = df['Formatted Date'].dt.year
df['month'] = df['Formatted Date'].dt.month
df['day'] = df['Formatted Date'].dt.day
df['hour'] = df['Formatted Date'].dt.hour

# Quick check
df[['Formatted Date', 'year', 'month', 'day', 'hour']].head()
df

In [None]:
fig=px.histogram(df,x="year",nbins=11)
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
# Shift targets to represent "next hour"
train_df['Wind_Speed_next_hour'] = df['Wind Speed (km/h)'].shift(-1)
train_df['Thunderstorm_next_hour'] = df['thunderstorm'].shift(-1)
test_df['Wind_Speed_next_hour'] = df['Wind Speed (km/h)'].shift(-1)
test_df['Thunderstorm_next_hour'] = df['thunderstorm'].shift(-1)
val_df['Wind_Speed_next_hour'] = df['Wind Speed (km/h)'].shift(-1)
val_df['Thunderstorm_next_hour'] = df['thunderstorm'].shift(-1)


In [None]:
test_df

In [None]:
# Previous hour values
train_df['Wind_Speed_prev_1h'] = train_df['Wind Speed (km/h)'].shift(1)
train_df['Temperature_prev_1h'] = train_df['Temperature (C)'].shift(1)
train_df['Humidity_prev_1h'] = train_df['Humidity'].shift(1)
train_df['Pressure_prev_1h'] = train_df['Pressure (millibars)'].shift(1)
train_df['Thunderstorm_prev_1h'] = train_df['thunderstorm'].shift(1)
# Rolling mean of last 3 hours
train_df['Wind_Speed_roll3h'] = train_df['Wind Speed (km/h)'].shift(1).rolling(3).mean()
train_df['Temperature_roll3h'] = train_df['Temperature (C)'].shift(1).rolling(3).mean()
# Hour
train_df['hour_sin'] = np.sin(2 * np.pi * train_df['hour']/24)
train_df['hour_cos'] = np.cos(2 * np.pi * train_df['hour']/24)

# Month
train_df['month_sin'] = np.sin(2 * np.pi * train_df['month']/12)
train_df['month_cos'] = np.cos(2 * np.pi * train_df['month']/12)
train_df = train_df.dropna().reset_index(drop=True)
# Previous hour values
test_df['Wind_Speed_prev_1h'] = test_df['Wind Speed (km/h)'].shift(1)
test_df['Temperature_prev_1h'] = test_df['Temperature (C)'].shift(1)
test_df['Humidity_prev_1h'] = test_df['Humidity'].shift(1)
test_df['Pressure_prev_1h'] = test_df['Pressure (millibars)'].shift(1)
test_df['Thunderstorm_prev_1h'] = test_df['thunderstorm'].shift(1)
# Rolling mean of last 3 hours
test_df['Wind_Speed_roll3h'] = test_df['Wind Speed (km/h)'].shift(1).rolling(3).mean()
test_df['Temperature_roll3h'] = test_df['Temperature (C)'].shift(1).rolling(3).mean()
# Hour
test_df['hour_sin'] = np.sin(2 * np.pi * test_df['hour']/24)
test_df['hour_cos'] = np.cos(2 * np.pi * test_df['hour']/24)

# Month
test_df['month_sin'] = np.sin(2 * np.pi * test_df['month']/12)
test_df['month_cos'] = np.cos(2 * np.pi * test_df['month']/12)
test_df = test_df.dropna().reset_index(drop=True)





# Previous hour values
val_df['Wind_Speed_prev_1h'] = val_df['Wind Speed (km/h)'].shift(1)
val_df['Temperature_prev_1h'] = val_df['Temperature (C)'].shift(1)
val_df['Humidity_prev_1h'] = val_df['Humidity'].shift(1)
val_df['Pressure_prev_1h'] = val_df['Pressure (millibars)'].shift(1)
val_df['Thunderstorm_prev_1h'] = val_df['thunderstorm'].shift(1)
# Rolling mean of last 3 hours
val_df['Wind_Speed_roll3h'] = val_df['Wind Speed (km/h)'].shift(1).rolling(3).mean()
val_df['Temperature_roll3h'] = val_df['Temperature (C)'].shift(1).rolling(3).mean()
# Hour
val_df['hour_sin'] = np.sin(2 * np.pi * val_df['hour']/24)
val_df['hour_cos'] = np.cos(2 * np.pi * val_df['hour']/24)

# Month
val_df['month_sin'] = np.sin(2 * np.pi * val_df['month']/12)
val_df['month_cos'] = np.cos(2 * np.pi * val_df['month']/12)
val_df = val_df.dropna().reset_index(drop=True)



In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
val_df.info()

In [None]:
numerical_features = [
    'Temperature (C)',
    'Apparent Temperature (C)',
    'Humidity',
    'Pressure (millibars)',
    'Visibility (km)',
    'Wind Bearing (degrees)',
    # Lag features
    'Wind_Speed_prev_1h',
    'Temperature_prev_1h',
    'Humidity_prev_1h',
    'Pressure_prev_1h',
    'Thunderstorm_prev_1h',
    # Rolling averages
    'Wind_Speed_roll3h',
    'Temperature_roll3h',
    # Cyclical time features
    'hour_sin', 'hour_cos', 'month_sin', 'month_cos'
]
categorical_features = [
    'Precip Type_snow'  # already one-hot encoded
]
feature_columns = numerical_features + categorical_features
# For Wind Speed (Regression)
X_wind = train_df[feature_columns]
y_wind = train_df['Wind_Speed_next_hour']

# For Thunderstorm (Classification)
X_thunder = train_df[feature_columns]
y_thunder = train_df['Thunderstorm_next_hour']


In [None]:
# for wind
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
# for thunder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

model_wind=RandomForestRegressor(n_estimators=150,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1)
model_wind.fit(X_wind,y_wind)


In [None]:
model_thunder=RandomForestClassifier(   n_estimators=150,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1)
model_thunder.fit(X_thunder,y_thunder)

In [None]:
val_df['Wind_Speed_next_hour']

In [None]:
pred_wind=model_wind.predict(val_df[feature_columns])
pred
pred_thunder=model_thunder.predict(val_df[feature_columns])

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
mean_squared_error(y_wind,pred_wind,squared=False)