# Hanoi Weather Data - Data Processing

This notebook handles data cleaning, preprocessing, and preparation for feature engineering and modeling.

## Objectives
1. Load and clean the raw weather data
2. Handle missing values and outliers
3. Convert and validate data types
4. Create base temporal features
5. Run data quality checks and validation
6. Export cleaned data for feature engineering

## 1. Setup and Imports

In [None]:
# Import essential libraries for data processing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
import os

# Notebook-wide configuration:
#   - display.max_columns=None lets pandas show every column of a DataFrame
#   - the 'ignore' filter suppresses all warnings for the rest of the session
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Confirm the imports worked and record the library versions in the output
print(
    "✅ Libraries imported successfully!",
    f"📊 Pandas version: {pd.__version__}",
    f"🔢 NumPy version: {np.__version__}",
    sep="\n",
)

## 2. Data Loading and Initial Cleaning

In [None]:
# Read the raw CSV into a DataFrame; the path is relative to the notebook
df_raw = pd.read_csv('../data/raw/daily_data.csv')

# Headline summary: table dimensions and the extent of the 'datetime' column.
# min()/max() operate on the column exactly as read_csv loaded it (no parsing
# is requested above).
print("📋 RAW DATA OVERVIEW:")
print("=" * 40)
print(f"Shape: {df_raw.shape}")
first_value, last_value = df_raw['datetime'].min(), df_raw['datetime'].max()
print(f"Date range: {first_value} to {last_value}")

# Missing-value report: keep only the columns that have at least one null,
# then show the absolute count and the share of all rows for each of them
print("\n🔍 DATA QUALITY CHECK:")
print("Missing values per column:")
null_counts = df_raw.isnull().sum()
missing_summary = null_counts[null_counts > 0]
if missing_summary.empty:
    print("  ✅ No missing values found!")
else:
    for name, n_missing in missing_summary.items():
        share = (n_missing / len(df_raw)) * 100
        print(f"  {name}: {n_missing} ({share:.1f}%)")

# Number of rows that are exact duplicates of an earlier row
n_duplicates = df_raw.duplicated().sum()
print(f"\nDuplicate records: {n_duplicates}")

# Column count per dtype, listed in the order produced by value_counts()
print("\n📊 DATA TYPES:")
for kind in df_raw.dtypes.value_counts().index:
    n_cols = df_raw.select_dtypes(include=[kind]).shape[1]
    print(f"  {kind}: {n_cols} columns")