In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import pickle
import os

In [2]:
# Load the cleaned trip table from disk and preview its first rows.
csv_path = "../data/oslobysykkel-2025-cleaned.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,duration,start_station_id,start_station_latitude,start_station_longitude,end_station_id,end_station_latitude,end_station_longitude,start_time,start_dayofweek
0,819,555,59.933703,10.75193,572,59.922269,10.67958,3.0,6.0
1,976,735,59.903213,10.767344,487,59.926929,10.776971,3.0,6.0
2,549,485,59.911453,10.776072,444,59.925265,10.750462,3.0,6.0
3,1095,387,59.914586,10.735453,2330,59.913233,10.749959,3.0,6.0
4,406,623,59.91508,10.730589,2337,59.915915,10.737835,3.0,6.0


In [3]:
# Regression target: log(1 + duration). The training cell inverts this with
# np.expm1 before computing metrics on the original scale.
raw_duration = df['duration']
df['log_duration'] = np.log1p(raw_duration)

In [4]:
# Build a station_id -> coordinates lookup that is stored in the model package.
#
# The lookup used to be derived from the start-station columns only, so a
# station that appears solely as a trip destination had no entry even though
# the models also consume end-station coordinates. End-station rows are now
# appended as well. The inner keys keep their original names
# ('start_station_latitude' / 'start_station_longitude') so existing consumers
# of the dictionary keep working; the result is a superset of the old one.
coord_columns = ['start_station_id', 'start_station_latitude', 'start_station_longitude']
start_coords = df[coord_columns]
end_coords = df[['end_station_id', 'end_station_latitude', 'end_station_longitude']].rename(
    columns={
        'end_station_id': 'start_station_id',
        'end_station_latitude': 'start_station_latitude',
        'end_station_longitude': 'start_station_longitude',
    }
)

# Start rows come first, and drop_duplicates keeps the first occurrence, so
# every station that already had an entry keeps exactly the same coordinates.
station_coords = pd.concat([start_coords, end_coords], ignore_index=True).drop_duplicates(
    subset=['start_station_id']
)
coords_dict = station_coords.set_index('start_station_id').T.to_dict()

In [5]:
# Split the table into the feature matrix X and the target y.
#
# 'duration' and 'log_duration' are the target in raw and log form, so both are
# removed from the features.
#
# Columns whose name starts with "Unnamed:" are what pandas.read_csv assigns to
# header-less columns (typically a row index written out by to_csv). Such a
# column is only a row counter and must not be used as a model feature, so any
# that exist are removed as well; this is a no-op when none are present.
# NOTE(review): the data preview shows an "Unnamed: 0" header - confirm whether
# it is a real column of df or just the rendered index.
index_artifacts = [col for col in df.columns if str(col).startswith('Unnamed:')]
X = df.drop(columns=['duration', 'log_duration', *index_artifacts])
y = df['log_duration']

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [7]:
# Target encoding: replace each station ID by the mean log-duration observed
# for that station in the training split only, so the test split does not
# influence the encoding.
# NOTE(review): each training row's own target contributes to its station mean;
# consider out-of-fold or smoothed encoding if over-fitting on rare stations
# becomes a concern.
start_mean_map = y_train.groupby(X_train['start_station_id']).mean()
end_mean_map = y_train.groupby(X_train['end_station_id']).mean()

# Station IDs that never occur in the training split have no entry in the maps
# and would become NaN after .map(); NaN inputs make LinearRegression fail.
# Such IDs fall back to the overall training mean instead.
global_target_mean = y_train.mean()


def _apply_target_encoding(features):
    """Return a copy of `features` with station IDs replaced by their encodings.

    Adds 'start_target_encoded' and 'end_target_encoded' as the last two
    columns and removes 'start_station_id' and 'end_station_id'. The input
    frame is not modified.
    """
    encoded = features.assign(
        start_target_encoded=features['start_station_id']
        .map(start_mean_map)
        .fillna(global_target_mean),
        end_target_encoded=features['end_station_id']
        .map(end_mean_map)
        .fillna(global_target_mean),
    )
    return encoded.drop(columns=['start_station_id', 'end_station_id'])


# Rebind instead of mutating in place: the frames returned by train_test_split
# are slices of X, and writing into them can trigger SettingWithCopyWarning.
# Re-running this cell still requires re-running the split cell first, because
# the ID columns are gone after encoding.
X_train = _apply_target_encoding(X_train)
X_test = _apply_target_encoding(X_test)

In [8]:
# Sorted lists of the station IDs that have a target encoding; they are stored
# in the model package so the Streamlit app can populate its select boxes.
valid_start_ids, valid_end_ids = (
    sorted(encoding.index.tolist())
    for encoding in (start_mean_map, end_mean_map)
)

In [9]:
# Train each candidate regressor, evaluate it on the held-out split, and save
# it together with everything needed to rebuild its inputs at prediction time.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(max_depth=10, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=15, n_jobs=-1, random_state=42)
}

model_folder = "../models"
# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, without a separate check-then-create step.
os.makedirs(model_folder, exist_ok=True)

# Values that are identical for every model are computed once, outside the loop.
y_actual = np.expm1(y_test)  # undo the log1p transform of the target
feature_names = X_train.columns.tolist()
start_map = start_mean_map.to_dict()
end_map = end_mean_map.to_dict()

for name, model in models.items():
    # Train
    model.fit(X_train, y_train)

    # Evaluate: predictions are made in log space and converted back with
    # expm1, so the metrics are expressed on the original duration scale.
    y_pred_log = model.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    metrics = {
        "MAE": mean_absolute_error(y_actual, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_actual, y_pred)),
        "R2": r2_score(y_actual, y_pred)
    }

    # Bundle the fitted model with its feature order, the station encodings,
    # the selectable station IDs, the evaluation metrics and the coordinates.
    package = {
        "model": model,
        "features": feature_names,
        "start_map": start_map,
        "end_map": end_map,
        "valid_starts": valid_start_ids,
        "valid_ends": valid_end_ids,
        "metrics": metrics,
        "coords_dict": coords_dict
    }

    # Write one pickle per model, e.g. "../models/linear_regression_package.pkl".
    filename = f"{model_folder}/{name.replace(' ', '_').lower()}_package.pkl"
    with open(filename, "wb") as f:
        pickle.dump(package, f)

    print(f"Đã lưu {name} thành công! (MAE: {metrics['MAE']:.2f}s), (RMSE: {metrics['RMSE']:.2f}s), (R2: {metrics['R2']:.4f})")


Đã lưu Linear Regression thành công! (MAE: 344.55s), (RMSE: 660.47s), (R2: 0.0620)


Đã lưu Decision Tree thành công! (MAE: 282.66s), (RMSE: 607.29s), (R2: 0.2069)


Đã lưu Random Forest thành công! (MAE: 239.74s), (RMSE: 573.21s), (R2: 0.2935)
