In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
import os
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import root_mean_squared_error
import sys

In [3]:
# Make the project root importable so the local `Implementation` package
# can be resolved from inside the notebook.
# The root is taken to be two levels above the current working directory;
# os.getcwd() is used because this runs in Jupyter.
root_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Only insert once: re-running this cell would otherwise keep stacking
# duplicate entries at the front of sys.path.
if root_dir not in sys.path:
    sys.path.insert(0, root_dir)

# `timer` is used below as `with timer("label"):`.
# NOTE(review): presumably it reports elapsed time for the block - confirm.
from Implementation.Utils.__utils__ import timer

In [4]:
def reduce_mem_usage(df, use_float16=False):
    """Shrink numeric columns of ``df`` to the narrowest dtype that fits.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected one by one.
    use_float16 : bool, default False
        When True, float columns wider than 32 bits are cast to float32.
        Despite the name, float16 is never used (float32 is kept to avoid
        overflow). When False, float columns are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    * Signed integers are narrowed within int8/int16/int32 and unsigned
      integers within uint8/uint16/uint32, based on the column's min/max.
    * A column is never widened: a cast is applied only when the target
      dtype is strictly smaller than the current one.
    * Every other dtype (object, bool, datetime, categorical, pandas
      extension dtypes, ...) is skipped, so such columns no longer raise
      or get silently converted to float.
    """
    signed_candidates = (np.int8, np.int16, np.int32)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32)
    float32_size = np.dtype(np.float32).itemsize

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, string, ...) are not
        # plain numpy dtypes; leave them exactly as they are.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind in "iu":
            # min()/max() of an empty column is NaN, which cannot be
            # range-checked; nothing to gain from casting it anyway.
            if df[col].empty:
                continue
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_candidates if kind == "i" else unsigned_candidates
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    # First (narrowest) dtype that holds the data wins,
                    # but only cast when it actually saves memory.
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        elif kind == "f":
            # Only ever narrow floats; float32/float16 input is kept as is.
            if use_float16 and col_type.itemsize > float32_size:
                df[col] = df[col].astype(np.float32)
        # Any other kind (bool, datetime, timedelta, object, ...) is skipped.
    return df


In [5]:
# Read the pre-processed train/test frames from their feather files.
train = pd.read_feather("../Processed_Data/train_processed.feather")
test = pd.read_feather("../Processed_Data/test_processed.feather")

# Integer-encode every object-dtype column found in `train`.
# The encoder is fitted on the stringified union of train and test values,
# so both frames share one label mapping per column.
for cat_col in train.select_dtypes(include=['object']).columns:
    combined_values = pd.concat([train[cat_col], test[cat_col]], axis=0).astype(str)
    encoder = LabelEncoder().fit(combined_values)
    train[cat_col] = encoder.transform(train[cat_col].astype(str))
    test[cat_col] = encoder.transform(test[cat_col].astype(str))

# Downcast numeric columns of both frames (train first, then test),
# then ask the garbage collector to run.
train, test = (
    reduce_mem_usage(frame, use_float16=True) for frame in (train, test)
)
gc.collect()

# Target is modelled on the log1p scale; features are everything else.
y = np.log1p(train["meter_reading"])
X = train.drop(columns=["meter_reading"])

# 5-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# LightGBM training parameters.
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
}

# Training callbacks: early stopping with a 50-round patience,
# and an evaluation log line every 25 rounds.
callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=25),
]

# Output folder for the per-fold model files (created if missing).
os.makedirs("./Saved_Model", exist_ok=True)

# Collects one validation RMSE per fold.
rmse_scores = list()

### Model Training and Evaluation

In [6]:
# Start from an empty score list so that re-running this cell (for example
# after an interrupted run) does not mix stale fold scores into the average.
rmse_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Training Fold {fold + 1}...")

    # Positional split of features and (log1p-transformed) target.
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # LightGBM datasets; the validation set references the training set.
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Build the callbacks fresh for every fold so no early-stopping state
    # can be carried over from the previous fold's training run.
    fold_callbacks = [
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=25),
    ]

    with timer("Training the LGB model"):
        model = lgb.train(
            params,
            train_data,
            num_boost_round=1000,
            valid_sets=[train_data, val_data],
            callbacks=fold_callbacks,
        )

    with timer("Prediction and Model Evaluation"):
        # RMSE is computed against y_val, i.e. on the log1p scale of the target.
        y_pred = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        rmse_scores.append(rmse)
        print(f"Fold {fold + 1} RMSE: {rmse:.4f}")

    with timer("Saving the model"):
        model.save_model(f"./Saved_Model/lgbm_model_fold{fold + 1}.txt")

    # Release the per-fold objects before the next fold; kept outside the
    # timer block so cleanup is not counted as model-saving time.
    del X_train, X_val, y_train, y_val, train_data, val_data
    gc.collect()

Training Fold 1...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.184916 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6583
[LightGBM] [Info] Number of data points in the train set: 3314212, number of used features: 39
[LightGBM] [Info] Start training from score 3.925595
Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.802075	valid_1's rmse: 0.805881
[50]	training's rmse: 0.492071	valid_1's rmse: 0.501103


KeyboardInterrupt: 

In [None]:
# Report the mean validation RMSE over the folds that finished.
# If no fold completed (e.g. training was interrupted), np.mean([]) would
# emit a RuntimeWarning and print "nan" followed by a misleading success
# message, so that case is reported explicitly instead.
if len(rmse_scores) > 0:
    avg_rmse = np.mean(rmse_scores)
    print(f"\nAverage RMSE across folds: {avg_rmse:.4f}")

    print("Training complete! Models saved.")
else:
    # Keep `avg_rmse` defined with the same value the mean of an empty list yields.
    avg_rmse = float("nan")
    print("\nNo fold finished training; there is no RMSE to report.")