In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Loading Data
data = pd.read_csv('D:\Master_Folder\Data Science Course\Projects\StockMarket\stock_data\SUZLON.NS_2023-01-01_to_2024-11-21_ML.csv')
data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)
data.set_index('Date', inplace=True)
data['Next_1_day_close'] = data['Close'].shift(-1)
data.dropna(inplace=True)

  data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)


In [3]:
# Define features and target
features = ['Close', 'Upward_Downward_Probability', 'Temporal_Features', 'Cluster', 'Anomaly']
target = 'Next_1_day_close'

In [4]:
# Train Test Split
train_size = int(len(data) * 0.7)
train_data = data[:train_size]
test_data = data[train_size:]

X_train = train_data[features]
y_train = train_data[target]

X_test = test_data[features]
y_test = test_data[target]

In [5]:
# Scale the features
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
def train_and_evaluate_model(model, param_grid, X_train, y_train, X_test, y_test, model_name): 
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='r2', verbose=1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mape = (np.abs((y_test - y_pred) / y_test).mean()) * 100
    
    print(f"\n--- {model_name} Results ---")
    print("Best Parameters:", grid_search.best_params_)
    print(f"R²: {r2:.4f}")
    print(f"MAPE: {mape:.2f}%")
    print(f"MSE: {mse:.4f}")
    return best_model

In [10]:
# XGBoost

xgb_param_grid = {'n_estimators': [50, 75, 100], 'learning_rate': [0.1, 0.2, 0.3, 1.0], 'max_depth': [3, 5, 7], 'subsample': [0.6, 0.8, 1.0]}
xgb_model = XGBRegressor(random_state=42)
best_xgb = train_and_evaluate_model(xgb_model, xgb_param_grid, X_train_scaled, y_train, X_test_scaled, y_test, 'XGBoost')

Fitting 3 folds for each of 108 candidates, totalling 324 fits

--- XGBoost Results ---
Best Parameters: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
R²: -2.0664
MAPE: 24.42%
MSE: 437.9451


In [12]:
# LightGBM
# lgbm_param_grid = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'num_leaves': [31, 50, 100],
#     'subsample': [0.6, 0.8, 1.0]
# }
# lgbm_model = LGBMRegressor(random_state=42)
# best_lgbm = train_and_evaluate_model(lgbm_model, lgbm_param_grid, X_train_scaled, y_train, X_test_scaled, y_test, "LightGBM")


In [13]:
# CatBoost
catboost_param_grid = {
    'iterations': [200, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': [4, 6, 8]
}
catboost_model = CatBoostRegressor(random_state=42, verbose=0)
best_catboost = train_and_evaluate_model(catboost_model, catboost_param_grid, X_train_scaled, y_train, X_test_scaled, y_test, "CatBoost")

Fitting 3 folds for each of 18 candidates, totalling 54 fits

--- CatBoost Results ---
Best Parameters: {'depth': 4, 'iterations': 500, 'learning_rate': 0.1}
R²: -4.4861
MAPE: 34.16%
MSE: 783.5160
