In [97]:
import numpy as np
import random
import os

# fix random seed
SEED = 42  

# Python built-in random
random.seed(SEED)

# NumPy random
np.random.seed(SEED) # scikit-learn sẽ tự dùng numpy seed ngầm bên trong)

# Nếu dùng PyTorch hoặc TensorFlow, có thể thêm tương ứng:
# torch.manual_seed(SEED)
# tf.random.set_seed(SEED)

# Đảm bảo các process con (nếu có) cũng dùng cùng seed
os.environ['PYTHONHASHSEED'] = str(SEED)


In [98]:
import joblib
import pandas as pd
import optuna

from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import TimeSeriesSplit

from src import data_preprocessing as dp
from src import pipeline as pl
from src import feature_engineering as fe
from src.model_evaluation import evaluate

In [99]:
# 1. Load raw Data
df = dp.load_data(r"data\raw data\Hanoi Daily 10 years.csv")

 Loaded data with shape: (3660, 33)


In [100]:
# 2. basic preprocessing for all data set
# đang để là có drop description 
df = dp.basic_preprocessing(df=df)
print("Shape:", df.shape)

Dropped column: 'description'
Shape: (3660, 31)


In [101]:
df.head(1)

Unnamed: 0_level_0,name,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,...,solarradiation,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,icon,stations
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-09-20,Hanoi,33.0,26.7,29.3,41.7,30.5,35.4,25.7,81.5,1.4,...,142.1,12.2,5,,2015-09-20 05:44:39,2015-09-20 17:55:33,0.23,"Rain, Partially cloudy",rain,"48820099999,48823099999,48825099999,4883109999..."


In [102]:
# 3. chia train, val, test (tỉ lệ 70/15/15)
target_col = 'temp'
train_size = 0.7
val_size = 0.15
n = len(df)

train_df = df.iloc[:int(train_size*n)]
val_df = df.iloc[int(train_size*n):int((train_size+val_size)*n)]
test_df = df.iloc[int((train_size+val_size)*n):]

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# Chia X, y riêng biệt
X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_val, y_val = val_df.drop(columns=[target_col]), val_df[target_col]
X_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

print(f"Train: {X_train.shape, y_train.shape}, Val: {X_val.shape, y_val.shape}, Test: {X_test.shape, y_test.shape}")




Train: 2562, Val: 549, Test: 549
Train: ((2562, 30), (2562,)), Val: ((549, 30), (549,)), Test: ((549, 30), (549,))


In [103]:
# 4. Fit và transform trên preprocessing pipeline
pipeline1 = pl.build_preprocessing_pipeline()

# chỉ fit trên train (học thông số từ train)
pipeline1.fit(X_train, y_train)

# transform 
X_train_processed = pipeline1.transform(X_train)
X_val_processed = pipeline1.transform(X_val)
X_test_processed = pipeline1.transform(X_test)



In [104]:
X_train_processed.head(1)

Unnamed: 0_level_0,tempmax,tempmin,dew,humidity,precip,precipprob,precipcover,windgust,windspeed,winddir,...,cloudcover,visibility,solarradiation,moonphase,main_station,sunrise,sunset,conditions,icon,stations
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-09-20,33.0,26.7,25.7,81.5,1.4,100,4.17,19.4,15.7,94.8,...,72.2,7.9,142.1,0.23,3,2015-09-20 05:44:39,2015-09-20 17:55:33,5,3,"48820099999,48823099999,48825099999,4883109999..."


In [112]:
X_train_processed.columns

Index(['tempmax', 'tempmin', 'dew', 'humidity', 'precip', 'precipprob',
       'precipcover', 'windgust', 'windspeed', 'winddir', 'sealevelpressure',
       'cloudcover', 'visibility', 'solarradiation', 'moonphase',
       'main_station', 'sunrise', 'sunset', 'conditions', 'icon', 'stations',
       'year', 'month', 'day', 'dayofyear', 'weekday', 'is_weekend',
       'month_sin', 'month_cos', 'dayofyear_sin', 'dayofyear_cos',
       'day_length'],
      dtype='object')

In [113]:
len(X_train_processed.columns)

32

In [114]:
# Kiểm tra kích thước
print(X_train_processed.shape, X_val_processed.shape, X_test_processed.shape)

(2562, 32) (549, 32) (549, 32)


In [115]:
# create time-extract feature
X_extract_time = fe.create_date_features(X_train_processed)
X_Val_extract_time = fe.create_date_features(X_val_processed)
X_test_extract_time = fe.create_date_features(X_test_processed)
print(X_extract_time.shape)


(2562, 32)


In [116]:
X_extract_time.columns

Index(['tempmax', 'tempmin', 'dew', 'humidity', 'precip', 'precipprob',
       'precipcover', 'windgust', 'windspeed', 'winddir', 'sealevelpressure',
       'cloudcover', 'visibility', 'solarradiation', 'moonphase',
       'main_station', 'sunrise', 'sunset', 'conditions', 'icon', 'stations',
       'year', 'month', 'day', 'dayofyear', 'weekday', 'is_weekend',
       'month_sin', 'month_cos', 'dayofyear_sin', 'dayofyear_cos',
       'day_length'],
      dtype='object')

In [109]:
# create lag, rolling feature
## chỗ này đang hơi vấn đề data leak, cứ bỏ qua tạm 

In [110]:
# chỗ này để tạo thêm feature bổ sung sau

### Random Forest

In [111]:
# 1. fit 
model1 = RandomForestRegressor(n_estimators = 100, min_samples_split=2, min_samples_leaf=1,min_impurity_decrease=0, 
                               max_depth= 7, max_features= None, bootstrap= True, oob_score= False, 
                               max_leaf_nodes= None, max_samples= None
                               )
model1.fit(X_extract_time, y_train) # sau này đầy đủ feature engineering thì thay X đầy đủ feature vào

ValueError: could not convert string to float: '48820099999,48823099999,48825099999,48831099999,VVNB'

In [None]:
# 2. predict
y_pred_val = model1.predict(X_Val_extract_time) # sau này đầy đủ feature engineering thì thay X đầy đủ feature vào
y_pred_test = model1.predict(X_test_extract_time)

# Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
mae = mean_absolute_error(y_val, y_pred_val)


In [None]:
# 3. Evaluate on validation
val_metrics = evaluate(y_val, y_pred_val)
print("Validation metrics:", val_metrics)

# 4. Evaluate on test
test_metrics = evaluate(y_test, y_pred_test)
print("Test metrics:", test_metrics)

# 5. Save model 
model_path = r'models/RF_model.pkl' 
joblib.dump(model1, model_path)
print(f"RandomForest model saved to {model_path}")

### Extra Tree

In [None]:
model2 = ExtraTreesClassifier(
    n_estimators=100,    # số cây
    max_features='sqrt', # số feature được xem khi split
    random_state=42
)

### Gradient Boosting (tại model này chưa học, tìm hiểu đã rồi rảnh thì làm random 1 trong 3 đứa :))))