In [24]:
import pandas as pd
import os
# Load the processed training data and parse the timestamp column into
# datetime64 in the same expression; the trailing `df` renders the frame.
df = pd.read_csv("../data/processed/training_data_p1.csv").assign(
    date_time=lambda frame: pd.to_datetime(frame["date_time"])
)
df

Unnamed: 0,date_time,temperature,humidity,pressure,wind_speed,wind_direction,precipitation,cloud_coverage,weather_condition,city,latitude,longitude,country,hour,day,weekday,month
0,2024-05-16 13:15:00,26.6,24,1012.0,13.3,338,0.00,30,partly cloudy,Kabul,34.5200,69.1800,Afghanistan,13,16,3,5
1,2024-05-16 10:45:00,19.0,94,1012.0,11.2,320,0.10,75,partly cloudy,Tirana,41.3300,19.8200,Albania,10,16,3,5
2,2024-05-16 09:45:00,23.0,29,1011.0,15.1,280,0.00,0,sunny,Algiers,36.7600,3.0500,Algeria,9,16,3,5
3,2024-05-16 10:45:00,6.3,61,1007.0,11.9,215,0.30,100,light drizzle,Andorra La Vella,42.5000,1.5200,Andorra,10,16,3,5
4,2024-05-16 09:45:00,26.0,89,1011.0,13.0,150,0.00,50,partly cloudy,Luanda,-8.8400,13.2300,Angola,9,16,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64113,2025-04-11 05:15:00,24.0,89,1012.0,5.8,160,0.01,0,clear,Caracas,10.5000,-66.9167,Venezuela,5,11,4,4
64114,2025-04-11 16:30:00,27.3,79,1003.0,24.8,139,0.00,75,mist,Hanoi,21.0333,105.8500,Vietnam,16,11,4,4
64115,2025-04-11 12:15:00,23.4,16,1011.0,5.4,237,0.00,0,sunny,Sanaa,15.3547,44.2067,Yemen,12,11,4,4
64116,2025-04-11 11:15:00,26.2,56,1013.0,18.4,89,0.12,89,patchy rain nearby,Lusaka,-15.4167,28.2833,Zambia,11,11,4,4


In [25]:
# Show the dtype of every column in df.
df.dtypes

date_time            datetime64[ns]
temperature                 float64
humidity                      int64
pressure                    float64
wind_speed                  float64
wind_direction                int64
precipitation               float64
cloud_coverage                int64
weather_condition            object
city                         object
latitude                    float64
longitude                   float64
country                      object
hour                          int64
day                           int64
weekday                       int64
month                         int64
dtype: object

In [26]:
# Columns predicted by the regression models.
reg_targets = [
    "temperature",
    "precipitation",
]

# Column predicted by the classification model (kept as a list so that
# df[clf_target] yields a DataFrame).
clf_target = [
    "weather_condition",
]

# Input columns shared by both tasks.
feature_cols = [
    # atmospheric readings
    "humidity",
    "pressure",
    "wind_speed",
    "wind_direction",
    "cloud_coverage",
    # location
    "latitude",
    "longitude",
    # calendar fields
    "hour",
    "day",
    "weekday",
    "month",
]

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Slice the feature matrix and the two target frames out of df by column
# label; each selection uses a list of names, so each result is a DataFrame.
X = df.loc[:, feature_cols]
y_reg = df.loc[:, reg_targets]
y_clf = df.loc[:, clf_target]

In [29]:
# Display the regression target frame.
y_reg

Unnamed: 0,temperature,precipitation
0,26.6,0.00
1,19.0,0.10
2,23.0,0.00
3,6.3,0.30
4,26.0,0.00
...,...,...
64113,24.0,0.01
64114,27.3,0.00
64115,23.4,0.00
64116,26.2,0.12


In [30]:
# Display the classification target frame.
y_clf

Unnamed: 0,weather_condition
0,partly cloudy
1,partly cloudy
2,sunny
3,light drizzle
4,partly cloudy
...,...
64113,clear
64114,mist
64115,sunny
64116,patchy rain nearby


In [31]:
from sklearn.preprocessing import LabelEncoder

# Snapshot of the frame taken before the encoded column is added below.
# NOTE(review): df_copy is not referenced again in the cells visible here —
# confirm it is still needed.
df_copy = df.copy()

# Fit the encoder on the raw condition strings and store the resulting
# integer codes next to the original column.
# NOTE(review): y_clf was selected from df before this column existed, so it
# still holds the string labels rather than these codes — confirm that is
# the intended classification target.
le = LabelEncoder()
df["weather_condition_enc"] = le.fit_transform(df["weather_condition"])

# Lookup table from each class label to the integer code the encoder assigns.
weather_condition_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

In [32]:
# Display the label -> integer code mapping built from the fitted LabelEncoder.
weather_condition_mapping

{'blizzard': np.int64(0),
 'blowing snow': np.int64(1),
 'clear': np.int64(2),
 'cloudy': np.int64(3),
 'fog': np.int64(4),
 'freezing drizzle': np.int64(5),
 'freezing fog': np.int64(6),
 'heavy rain': np.int64(7),
 'heavy rain at times': np.int64(8),
 'heavy snow': np.int64(9),
 'light drizzle': np.int64(10),
 'light freezing rain': np.int64(11),
 'light rain': np.int64(12),
 'light rain shower': np.int64(13),
 'light sleet': np.int64(14),
 'light sleet showers': np.int64(15),
 'light snow': np.int64(16),
 'light snow showers': np.int64(17),
 'mist': np.int64(18),
 'moderate or heavy rain in area with thunder': np.int64(19),
 'moderate or heavy rain shower': np.int64(20),
 'moderate or heavy rain with thunder': np.int64(21),
 'moderate or heavy sleet': np.int64(22),
 'moderate or heavy snow in area with thunder': np.int64(23),
 'moderate or heavy snow showers': np.int64(24),
 'moderate rain': np.int64(25),
 'moderate rain at times': np.int64(26),
 'moderate snow': np.int64(27),
 'ove

In [33]:
# 80/20 train/test split for the regression task; the fixed seed makes the
# shuffle reproducible.
(
    X_train_reg,
    X_test_reg,
    y_train_reg,
    y_test_reg,
) = train_test_split(
    X,
    y_reg,
    test_size=0.2,
    random_state=42,
)

In [34]:
# 80/20 train/test split for the classification task; the fixed seed makes
# the shuffle reproducible.
(
    X_train_clf,
    X_test_clf,
    y_train_clf,
    y_test_clf,
) = train_test_split(
    X,
    y_clf,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Persist every train/test split as CSV, one sub-directory per task.
#
# DataFrame.to_csv does not create missing parent directories, so the
# "reg" and "clf" folders are created explicitly. Creating only
# "../data/processed" makes the writes below fail on a fresh checkout.
split_groups = {
    # Regression splits
    "reg": {
        "X_train_reg": X_train_reg,
        "X_test_reg": X_test_reg,
        "y_train_reg": y_train_reg,
        "y_test_reg": y_test_reg,
    },
    # Classification splits
    "clf": {
        "X_train_clf": X_train_clf,
        "X_test_clf": X_test_clf,
        "y_train_clf": y_train_clf,
        "y_test_clf": y_test_clf,
    },
}

for task, task_splits in split_groups.items():
    task_dir = f"../data/processed/{task}"
    # makedirs also creates "../data/processed" itself when it is missing.
    os.makedirs(task_dir, exist_ok=True)
    for split_name, split_frame in task_splits.items():
        # index=False: the row index is not written to the file.
        split_frame.to_csv(f"{task_dir}/{split_name}.csv", index=False)

print("✅ All regression and classification splits saved under ../data/processed/")


✅ All regression and classification splits saved under ../data/processed/
