In [1]:
import pandas as pd

In [2]:
# Step 1: Read the weather CSV into the working DataFrame `df`
source_csv = "../../data/cleaned_weather.csv"
df = pd.read_csv(source_csv, encoding='utf-8')

In [3]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,location_id,weather_code (wmo code),temperature_2m_max (°C),temperature_2m_min (°C),temperature_2m_mean (°C),apparent_temperature_max (°C),apparent_temperature_min (°C),apparent_temperature_mean (°C),daylight_duration (s),sunshine_duration (s),...,et0_fao_evapotranspiration (mm),latitude,longitude,elevation,year,month,day,sunrise_hour,sunset_hour,daylight_hours
0,0,1,30.1,22.6,26.0,34.5,25.0,29.0,42220.2,38905.73,...,4.61,6.924429,79.90725,4,2010,1,1,6,18,12
1,0,51,30.1,23.7,26.3,33.9,26.1,29.7,42225.71,37451.01,...,3.91,6.924429,79.90725,4,2010,2,1,6,18,12
2,0,51,29.6,23.1,26.0,34.5,26.2,29.9,42231.68,33176.43,...,3.66,6.924429,79.90725,4,2010,3,1,6,18,12
3,0,2,28.9,23.1,25.7,31.7,26.1,28.4,42238.11,38289.2,...,3.75,6.924429,79.90725,4,2010,4,1,6,18,12
4,0,1,28.1,21.3,24.6,30.0,22.9,26.2,42244.99,39113.82,...,5.0,6.924429,79.90725,4,2010,5,1,6,18,12


In [4]:
# Step 2: Rename columns for easier use
# Maps the verbose source headers (which embed units) to the short names the
# later cells rely on: 'temperature', 'precipitation', 'rainfall', 'wind_speed'.
# rename() silently skips any header that is not present in the frame.
# A 'date' column, if present, already has the desired name and needs no entry.
column_aliases = {
    'temperature_2m_mean (°C)': 'temperature',
    'precipitation_sum (mm)': 'precipitation',
    'rain_sum (mm)': 'rainfall',
    'wind_speed_10m_max (km/h)': 'wind_speed',
}
df = df.rename(columns=column_aliases)

In [5]:
# Step 3: Assemble a single datetime column from the year/month/day parts.
# Skipped entirely when the frame already carries a 'date' column.
# NOTE(review): in the df.head() preview, consecutive rows step `month` 1..5
# while `day` stays 1 and daylight_duration changes by only ~6 s per row —
# the two columns may be swapped upstream; confirm before trusting `date`.
if 'date' not in df.columns:
    date_parts = df[['year', 'month', 'day']]
    df['date'] = pd.to_datetime(date_parts)

In [6]:
# Step 4: Drop duplicate records
# The frame carries a leftover CSV index column ('Unnamed: 0'); in the preview
# it appears to be a running row number (0, 1, 2, ...). Comparing on it would
# make every row look unique, so it is excluded from the duplicate check.
# The column itself is kept in the frame; only the comparison ignores it.
dedup_cols = [col for col in df.columns if not str(col).startswith('Unnamed:')]
# Fall back to the default (all columns) if nothing but 'Unnamed:' columns exist.
df.drop_duplicates(subset=dedup_cols or None, inplace=True)

In [7]:
# Step 5: Discard every row that lacks a value in any key target column
essential_cols = [
    'temperature',
    'rainfall',
    'wind_speed',
    'precipitation',
]
df.dropna(axis=0, how='any', subset=essential_cols, inplace=True)

In [8]:
# Step 6: Fill whatever gaps remain — forward-fill first, then back-fill so
# leading gaps (which have no earlier value to copy) are covered as well.
# NOTE(review): both fills run over the whole frame in its current row order,
# so a value may be carried across a location_id boundary — confirm this is
# acceptable for this dataset.
for fill_in_place in (df.ffill, df.bfill):
    fill_in_place(inplace=True)

In [9]:
# Step 7: Handle invalid/outlier values
# Replace negative rainfall/precipitation values with 0.
# Series.clip(lower=0) is the vectorised form of "0 if x < 0 else x":
# values below 0 become 0, everything else (including NaN) is left unchanged.
df['rainfall'] = df['rainfall'].clip(lower=0)
df['precipitation'] = df['precipitation'].clip(lower=0)

In [10]:
# Optional: Remove unrealistic temperatures
# Keep only rows whose 'temperature' lies strictly between -50 and 60
# (both bounds exclusive). Rows with a missing temperature fail both
# comparisons and are therefore dropped too.
plausible_temperature = (df['temperature'] > -50) & (df['temperature'] < 60)
# .copy() makes the filtered result an independent frame, so column
# assignments in later cells do not write to a slice of the pre-filter data
# (avoids pandas' SettingWithCopyWarning).
df = df[plausible_temperature].copy()

In [11]:
# Optional: Cast location_id and weather_code to category
# The WMO code column is loaded as 'weather_code (wmo code)' and is not renamed
# by the rename step, so both spellings are checked; absent columns are skipped.
# Note: a CSV file does not store dtypes, so this cast only affects the
# in-memory frame, not the file written by the save step.
for categorical_col in ('location_id', 'weather_code', 'weather_code (wmo code)'):
    if categorical_col in df.columns:
        df[categorical_col] = df[categorical_col].astype('category')

In [12]:
# === Step 8: Write the cleaned frame to disk and report a short summary ===
output_csv = "cleaned_data.csv"
df.to_csv(output_csv, index=False)  # index=False: do not write the row index

first_day, last_day = df['date'].min(), df['date'].max()
print("Step 4: Data Cleaning complete.")
print(f"Shape after cleaning: {df.shape}")
print("Date range:", first_day, "to", last_day)

Step 4: Data Cleaning complete.
Shape after cleaning: (142371, 28)
Date range: 2010-01-01 00:00:00 to 2024-12-05 00:00:00


In [None]:




# NOTE(review): this cell repeats Steps 6-8 from the cells above (fill,
# outlier handling, category cast, save) and has no execution count;
# consider deleting it once confirmed redundant.

# === Step 6: Fill remaining missing values ===
# Forward-fill first, then back-fill to cover leading gaps.
df.ffill(inplace=True)
df.bfill(inplace=True)

# === Step 7: Handle invalid/outlier values ===

# Replace negative rainfall/precipitation values with 0.
# clip(lower=0) is the vectorised form of "0 if x < 0 else x".
df['rainfall'] = df['rainfall'].clip(lower=0)
df['precipitation'] = df['precipitation'].clip(lower=0)

# Optional: Remove unrealistic temperatures (strictly between -50 and 60).
# .copy() keeps the filtered frame independent so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
df = df[(df['temperature'] > -50) & (df['temperature'] < 60)].copy()

# Optional: Cast location_id and weather_code to category.
# The WMO code column is loaded as 'weather_code (wmo code)' and is not
# renamed earlier in the notebook, so both spellings are checked.
for categorical_col in ('location_id', 'weather_code', 'weather_code (wmo code)'):
    if categorical_col in df.columns:
        df[categorical_col] = df[categorical_col].astype('category')

# === Step 8: Save cleaned dataset ===
df.to_csv("cleaned_data.csv", index=False)

# === Summary ===
print("✅ Step 4: Data Cleaning complete.")
print(f"🔢 Shape after cleaning: {df.shape}")
print("📅 Date range:", df['date'].min(), "to", df['date'].max())


✅ Step 4: Data Cleaning complete.
🔢 Shape after cleaning: (142371, 28)
📅 Date range: 2010-01-01 00:00:00 to 2024-12-05 00:00:00
