In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import logging
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import zscore
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from datetime import timedelta

In [3]:
file_path_for_train_data = './rossmann-store-sales/train_data.csv'
file_path_for_test_data = './rossmann-store-sales/test_data.csv'
file_path_for_submission = './rossmann-store-sales/sample_submission.csv'

In [4]:
Data_for_train = pd.read_csv(file_path_for_train_data,low_memory=False)

In [5]:
Data_for_test = pd.read_csv(file_path_for_test_data,low_memory=False)

In [6]:
Data_for_train.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'Year', 'Month', 'Day', 'WeekOfYear',
       'IsHoliday'],
      dtype='object')

In [7]:
Data_for_train.isna().sum()

Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
Year             0
Month            0
Day              0
WeekOfYear       0
IsHoliday        0
dtype: int64

In [8]:
Data_for_test.isna().sum()

Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
Year              0
Month             0
Day               0
WeekOfYear        0
IsHoliday         0
dtype: int64

In [9]:
# Handling missing values in 'Open' column (Test set)
Data_for_test["Open"] = Data_for_test["Open"].fillna(1) # Assuming stores are open if missing

In [10]:
Data_for_test.columns

Index(['Id', 'Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'Year', 'Month', 'Day', 'WeekOfYear', 'IsHoliday'],
      dtype='object')

In [11]:
Data_for_train.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'Year', 'Month', 'Day', 'WeekOfYear',
       'IsHoliday'],
      dtype='object')

In [12]:
# Convert 'Date' column to datetime format
Data_for_train["Date"] = pd.to_datetime(Data_for_train["Date"])
Data_for_test["Date"] = pd.to_datetime(Data_for_test["Date"], errors='coerce')

In [13]:
# Extract datetime features
def extract_date_features(df):
    df["Weekday"] = df["Date"].dt.weekday  # 0 = Monday, 6 = Sunday
    df["MonthStart"] = (df["Date"].dt.day <= 7).astype(int)
    df["MonthMid"] = ((df["Date"].dt.day > 7) & (df["Date"].dt.day <= 21)).astype(int)
    df["MonthEnd"] = (df["Date"].dt.day > 21).astype(int)
    return df

In [14]:
train_df = extract_date_features(Data_for_train)
test_df = extract_date_features(Data_for_test)

In [15]:
train_df.head(5)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,WeekOfYear,IsHoliday,Weekday,MonthStart,MonthMid,MonthEnd
0,1,5,2015-07-31,5263,555,1,1,0,1,2015,7,31,31,1,4,0,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1,2015,7,31,31,1,4,0,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1,2015,7,31,31,1,4,0,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1,2015,7,31,31,1,4,0,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1,2015,7,31,31,1,4,0,0,1


In [16]:
categorical_cols_train = [col for col in Data_for_train.select_dtypes(include=["object", "category"])]
numerical_cols_for_train = [col for col in Data_for_train.select_dtypes(include=["number"])]

In [17]:
categorical_cols_train,numerical_cols_for_train

(['StateHoliday'],
 ['Store',
  'DayOfWeek',
  'Sales',
  'Customers',
  'Open',
  'Promo',
  'SchoolHoliday',
  'Year',
  'Month',
  'Day',
  'WeekOfYear',
  'IsHoliday',
  'Weekday',
  'MonthStart',
  'MonthMid',
  'MonthEnd'])

In [18]:
# Remove "Sales" from the list
numerical_cols_for_train = [col for col in numerical_cols_for_train if col != "Sales"]

print("Updated Numerical Columns:", numerical_cols_for_train)

Updated Numerical Columns: ['Store', 'DayOfWeek', 'Customers', 'Open', 'Promo', 'SchoolHoliday', 'Year', 'Month', 'Day', 'WeekOfYear', 'IsHoliday', 'Weekday', 'MonthStart', 'MonthMid', 'MonthEnd']


In [19]:
categorical_cols_test = [col for col in Data_for_test.select_dtypes(include=["object", "category"])]
numerical_cols_for_test = [col for col in Data_for_test.select_dtypes(include=["number"])]

In [20]:
categorical_cols_test,numerical_cols_for_test

(['StateHoliday'],
 ['Id',
  'Store',
  'DayOfWeek',
  'Open',
  'Promo',
  'SchoolHoliday',
  'Year',
  'Month',
  'Day',
  'WeekOfYear',
  'IsHoliday',
  'Weekday',
  'MonthStart',
  'MonthMid',
  'MonthEnd'])

In [21]:
Data_for_test.columns

Index(['Id', 'Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'Year', 'Month', 'Day', 'WeekOfYear', 'IsHoliday',
       'Weekday', 'MonthStart', 'MonthMid', 'MonthEnd'],
      dtype='object')

In [22]:
test_df.columns

Index(['Id', 'Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'Year', 'Month', 'Day', 'WeekOfYear', 'IsHoliday',
       'Weekday', 'MonthStart', 'MonthMid', 'MonthEnd'],
      dtype='object')

Code to get the future dates for next 6 weeks prediction...

In [23]:
Data_for_train.dtypes

Store                     int64
DayOfWeek                 int64
Date             datetime64[ns]
Sales                     int64
Customers                 int64
Open                      int64
Promo                     int64
StateHoliday             object
SchoolHoliday             int64
Year                      int64
Month                     int64
Day                       int64
WeekOfYear                int64
IsHoliday                 int64
Weekday                   int32
MonthStart                int64
MonthMid                  int64
MonthEnd                  int64
dtype: object

In [24]:
Data_for_train.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'Year', 'Month', 'Day', 'WeekOfYear',
       'IsHoliday', 'Weekday', 'MonthStart', 'MonthMid', 'MonthEnd'],
      dtype='object')

In [25]:
# Get the last available date from training data
last_date = Data_for_train["Date"].max()

In [26]:
last_date

Timestamp('2015-07-31 00:00:00')

In [27]:
# Generate future dates for the next 6 weeks (42 days)
future_dates = [last_date + timedelta(days=i) for i in range(1, 43)]

In [28]:
future_dates

[Timestamp('2015-08-01 00:00:00'),
 Timestamp('2015-08-02 00:00:00'),
 Timestamp('2015-08-03 00:00:00'),
 Timestamp('2015-08-04 00:00:00'),
 Timestamp('2015-08-05 00:00:00'),
 Timestamp('2015-08-06 00:00:00'),
 Timestamp('2015-08-07 00:00:00'),
 Timestamp('2015-08-08 00:00:00'),
 Timestamp('2015-08-09 00:00:00'),
 Timestamp('2015-08-10 00:00:00'),
 Timestamp('2015-08-11 00:00:00'),
 Timestamp('2015-08-12 00:00:00'),
 Timestamp('2015-08-13 00:00:00'),
 Timestamp('2015-08-14 00:00:00'),
 Timestamp('2015-08-15 00:00:00'),
 Timestamp('2015-08-16 00:00:00'),
 Timestamp('2015-08-17 00:00:00'),
 Timestamp('2015-08-18 00:00:00'),
 Timestamp('2015-08-19 00:00:00'),
 Timestamp('2015-08-20 00:00:00'),
 Timestamp('2015-08-21 00:00:00'),
 Timestamp('2015-08-22 00:00:00'),
 Timestamp('2015-08-23 00:00:00'),
 Timestamp('2015-08-24 00:00:00'),
 Timestamp('2015-08-25 00:00:00'),
 Timestamp('2015-08-26 00:00:00'),
 Timestamp('2015-08-27 00:00:00'),
 Timestamp('2015-08-28 00:00:00'),
 Timestamp('2015-08-

In [29]:
# Create future dataset with only the columns in test data
future_test_data = []
for store in Data_for_train["Store"].unique():
    for date in future_dates:
        future_test_data.append([store, date.year, date.month, date.day, date])

In [30]:
# Convert to DataFrame
future_test_df = pd.DataFrame(future_test_data, columns=["Store", "Year", "Month", "Day", "Date"])

In [31]:
future_test_df.head(5)

Unnamed: 0,Store,Year,Month,Day,Date
0,1,2015,8,1,2015-08-01
1,1,2015,8,2,2015-08-02
2,1,2015,8,3,2015-08-03
3,1,2015,8,4,2015-08-04
4,1,2015,8,5,2015-08-05


In [32]:
# Ensure 'Date' is in proper format
future_test_df["Date"] = pd.to_datetime(future_test_df["Date"])

In [33]:
# Compute missing columns
future_test_df["Id"] = range(1, len(future_test_df) + 1)  # Assign unique ID
future_test_df["DayOfWeek"] = future_test_df["Date"].dt.weekday + 1  # Monday=1, Sunday=7

future_test_df["Open"] = 1  # Assume stores are open unless stated otherwise
future_test_df["Promo"] = 0  # Assume no promotion unless mapped
future_test_df["StateHoliday"] = "0"  # Assume no state holiday (change if dataset is available)
future_test_df["SchoolHoliday"] = 0  # Assume no school holiday (change if dataset is available)

future_test_df["WeekOfYear"] = future_test_df["Date"].dt.isocalendar().week  # ISO Week
future_test_df["Weekday"] = future_test_df["Date"].dt.weekday  # 0=Monday, 6=Sunday
future_test_df["MonthStart"] = (future_test_df["Day"] == 1).astype(int)
future_test_df["MonthMid"] = ((future_test_df["Day"] >= 10) & (future_test_df["Day"] <= 20)).astype(int)
future_test_df["MonthEnd"] = (future_test_df["Day"] >= 25).astype(int)

In [34]:
# Handle future holidays (assume no holidays for now, but can be mapped)
future_test_df["IsHoliday"] = 0  # Can be updated with real holiday data if available

In [35]:
future_test_df.shape

(46830, 17)

In [36]:
future_test_df.columns

Index(['Store', 'Year', 'Month', 'Day', 'Date', 'Id', 'DayOfWeek', 'Open',
       'Promo', 'StateHoliday', 'SchoolHoliday', 'WeekOfYear', 'Weekday',
       'MonthStart', 'MonthMid', 'MonthEnd', 'IsHoliday'],
      dtype='object')

In [37]:
# Ensure future dataset has the same columns as the test dataset
# Get available feature columns from test dataset
test_columns = Data_for_test.columns.tolist()

In [38]:
test_columns

['Id',
 'Store',
 'DayOfWeek',
 'Date',
 'Open',
 'Promo',
 'StateHoliday',
 'SchoolHoliday',
 'Year',
 'Month',
 'Day',
 'WeekOfYear',
 'IsHoliday',
 'Weekday',
 'MonthStart',
 'MonthMid',
 'MonthEnd']

In [39]:
future_test_df.columns

Index(['Store', 'Year', 'Month', 'Day', 'Date', 'Id', 'DayOfWeek', 'Open',
       'Promo', 'StateHoliday', 'SchoolHoliday', 'WeekOfYear', 'Weekday',
       'MonthStart', 'MonthMid', 'MonthEnd', 'IsHoliday'],
      dtype='object')

In [40]:
# Identify missing columns
missing_columns = [col for col in test_columns if col not in future_test_df.columns]

In [41]:
missing_columns

[]

In [42]:
future_test_df.head(5)

Unnamed: 0,Store,Year,Month,Day,Date,Id,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,WeekOfYear,Weekday,MonthStart,MonthMid,MonthEnd,IsHoliday
0,1,2015,8,1,2015-08-01,1,6,1,0,0,0,31,5,1,0,0,0
1,1,2015,8,2,2015-08-02,2,7,1,0,0,0,31,6,0,0,0,0
2,1,2015,8,3,2015-08-03,3,1,1,0,0,0,32,0,0,0,0,0
3,1,2015,8,4,2015-08-04,4,2,1,0,0,0,32,1,0,0,0,0
4,1,2015,8,5,2015-08-05,5,3,1,0,0,0,32,2,0,0,0,0


In [43]:
# Reorder columns to match test dataset structure
future_test_df = future_test_df[test_columns]

In [44]:
future_test_df.head(5)

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,WeekOfYear,IsHoliday,Weekday,MonthStart,MonthMid,MonthEnd
0,1,1,6,2015-08-01,1,0,0,0,2015,8,1,31,0,5,1,0,0
1,2,1,7,2015-08-02,1,0,0,0,2015,8,2,31,0,6,0,0,0
2,3,1,1,2015-08-03,1,0,0,0,2015,8,3,32,0,0,0,0,0
3,4,1,2,2015-08-04,1,0,0,0,2015,8,4,32,0,1,0,0,0
4,5,1,3,2015-08-05,1,0,0,0,2015,8,5,32,0,2,0,0,0


In [45]:
future_test_df.columns

Index(['Id', 'Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'Year', 'Month', 'Day', 'WeekOfYear', 'IsHoliday',
       'Weekday', 'MonthStart', 'MonthMid', 'MonthEnd'],
      dtype='object')

In [46]:
future_test_df.isna().sum()

Id               0
Store            0
DayOfWeek        0
Date             0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
Year             0
Month            0
Day              0
WeekOfYear       0
IsHoliday        0
Weekday          0
MonthStart       0
MonthMid         0
MonthEnd         0
dtype: int64

In [47]:
# Save future dataset separately
future_test_df.to_csv("./rossmann-store-sales/future_6_weeks_data.csv", index=False)

In [48]:
future_test_df

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,WeekOfYear,IsHoliday,Weekday,MonthStart,MonthMid,MonthEnd
0,1,1,6,2015-08-01,1,0,0,0,2015,8,1,31,0,5,1,0,0
1,2,1,7,2015-08-02,1,0,0,0,2015,8,2,31,0,6,0,0,0
2,3,1,1,2015-08-03,1,0,0,0,2015,8,3,32,0,0,0,0,0
3,4,1,2,2015-08-04,1,0,0,0,2015,8,4,32,0,1,0,0,0
4,5,1,3,2015-08-05,1,0,0,0,2015,8,5,32,0,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46825,46826,1115,1,2015-09-07,1,0,0,0,2015,9,7,37,0,0,0,0,0
46826,46827,1115,2,2015-09-08,1,0,0,0,2015,9,8,37,0,1,0,0,0
46827,46828,1115,3,2015-09-09,1,0,0,0,2015,9,9,37,0,2,0,0,0
46828,46829,1115,4,2015-09-10,1,0,0,0,2015,9,10,37,0,3,0,1,0


In [49]:
print("Separate future dataset (6 weeks) created successfully, with correct categorical replacements!")

Separate future dataset (6 weeks) created successfully, with correct categorical replacements!


---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

*** Encode and Scale the Future Data ***

In [50]:
categorical_col_for_future = [col for col in future_test_df.select_dtypes(include=["object", "category"])]
numerical_cols_for_future = [col for col in future_test_df.select_dtypes(include=["number"])]

In [51]:
categorical_col_for_future,numerical_cols_for_future

(['StateHoliday'],
 ['Id',
  'Store',
  'DayOfWeek',
  'Open',
  'Promo',
  'SchoolHoliday',
  'Year',
  'Month',
  'Day',
  'WeekOfYear',
  'IsHoliday',
  'Weekday',
  'MonthStart',
  'MonthMid',
  'MonthEnd'])

*** Scale the numerical column ***

In [52]:
# Initialize StandardScaler
scaler = StandardScaler()
future_test_df[numerical_cols_for_future] = scaler.fit_transform(future_test_df[numerical_cols_for_future])

*** Encode the Categorical column ***

In [53]:
# Apply Label Encoding to categorical columns
label_encoders = {}
for col in categorical_col_for_future:
    le = LabelEncoder()
    future_test_df[col] = le.fit_transform(future_test_df[col].astype(str))
    label_encoders[col] = le

In [54]:
# Drop 'Date' column
future_test_df.drop(["Date"], axis=1, inplace=True)

In [55]:
# Save future dataset separately
future_test_df.to_csv("./rossmann-store-sales/future_6_weeks_data_scaled.csv", index=False)

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [56]:
# Apply Label Encoding to categorical columns
label_encoders = {}
for col in categorical_cols_train:
    le = LabelEncoder()
    Data_for_train[col] = le.fit_transform(Data_for_train[col].astype(str))
    Data_for_test[col] = le.fit_transform(Data_for_test[col].astype(str))
    label_encoders[col] = le

In [57]:
# Apply StandardScaler to numerical features
scaler = StandardScaler()
Data_for_train[numerical_cols_for_train] = scaler.fit_transform(Data_for_train[numerical_cols_for_train])
Data_for_test[numerical_cols_for_test] = scaler.fit_transform(Data_for_test[numerical_cols_for_test])

In [58]:
# Drop 'Date' column
Data_for_train.drop(["Date"], axis=1, inplace=True)
Data_for_test.drop(["Date"], axis=1, inplace=True)

In [60]:
# Save processed data if needed
Data_for_train.to_csv("./rossmann-store-sales/processed_train_new.csv", index=False)
Data_for_test.to_csv("./rossmann-store-sales/processed_test.csv", index=False)

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# # Define target variable (y) and features (X)
# X = Data_for_train.drop(columns=["Sales","Customers"])  # Features
# Y = Data_for_train["Sales"]  # Target

In [None]:
# Linear Models
# models = {
#     "Linear Regression": LinearRegression(),
#     "Ridge Regression": Ridge(alpha=1.0),
#     "Lasso Regression": Lasso(alpha=0.1)
# }

In [None]:
# Tree-Based Models
# models.update({
#     "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
#     "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
#     "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
# })

In [None]:
# X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Train and evaluate each model
# results = {}
# for name, model in models.items():
#     model.fit(X_train, Y_train)
#     Y_pred = model.predict(X_val)
    
#     mae = mean_absolute_error(Y_val, Y_pred)
#     r2 = r2_score(Y_val, Y_pred)
    
#     results[name] = {"MAE": mae, "R²": r2}
#     print(f"{name}: MAE = {mae:.2f}, R² = {r2:.3f}")

In [None]:
# Show results sorted by R²
# sorted_results = sorted(results.items(), key=lambda x: x[1]["R²"], reverse=True)

# print("\n🔹 Model Performance (Sorted by Highest R²):")
# for name, scores in sorted_results:
#     print(f"{name}: MAE = {scores['MAE']:.2f}, R² = {scores['R²']:.3f}")

In [None]:
# from sklearn.svm import SVR

# svr = SVR(kernel='rbf')  # Uses distance-based radial basis function (RBF)
# svr.fit(X_train, Y_train)
# svr_predictions = svr.predict(X_val)

In [None]:
# from sklearn.cluster import KMeans

# kmeans = KMeans(n_clusters=5, random_state=42)
# kmeans.fit(X_train)
# cluster_labels = kmeans.predict(X_val)

In [None]:
# Define the hyperparameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300, 500],  # Number of trees
#     'max_depth': [10, 20, 30, None],  # Maximum depth of trees
#     'min_samples_split': [2, 5, 10],  # Minimum samples required to split
#     'min_samples_leaf': [1, 2, 4],  # Minimum samples per leaf
#     'max_features': ['sqrt', 'log2', None]  # Remove 'auto'
# }

In [None]:
# Initialize the model
# rf = RandomForestRegressor(random_state=42)

In [None]:
# Perform Randomized Search
# rf_random = RandomizedSearchCV(
#     estimator=rf, 
#     param_distributions=param_grid, 
#     n_iter=10, cv=3, 
#     scoring='r2', verbose=2, 
#     n_jobs=-1, random_state=42
# )

In [None]:
# Train using RandomizedSearchCV
# rf_random.fit(X, Y)

In [None]:
# Print best parameters
# print(f"🔹 Best Parameters: {rf_random.best_params_}")

In [None]:
# Get the best model
# best_rf = rf_random.best_estimator_

In [None]:
# Train the best model on full data
# best_rf.fit(X, Y)

In [None]:
# To improve R2
# Feature Engineering
# X["is_weekend"] = (X["DayOfWeek"] >= 6).astype(int)  # 1 if Saturday/Sunday
# X["is_start_of_month"] = (X["Day"] <= 5).astype(int)  # Start of month effect
# X["is_end_of_month"] = (X["Day"] >= 25).astype(int)  # End of month effect

In [None]:
# Get feature importances from best RF model
# importances = best_rf.feature_importances_
# feature_names = X.columns
# feature_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
# feature_df = feature_df.sort_values(by="Importance", ascending=False)

In [None]:
# Plot feature importance
# plt.figure(figsize=(10, 6))
# plt.barh(feature_df["Feature"][:10], feature_df["Importance"][:10])
# plt.xlabel("Importance Score")
# plt.ylabel("Feature Name")
# plt.title("Top 10 Most Important Features")
# plt.gca().invert_yaxis()
# plt.show()

Hyperparameter Tuning for Random Forest

In [None]:
# Further refine hyperparameters
# param_grid = {
#     'n_estimators': [300, 400],  # Reduce to avoid memory issues
#     'max_depth': [10, 20],  # Avoid unrestricted depth
#     'min_samples_split': [2, 3],  # Reduce split threshold
#     'min_samples_leaf': [1, 2],  # Avoid very small leaf nodes
#     'max_features': ['sqrt']
# }

In [None]:
# grid_search = GridSearchCV(
#     estimator=RandomForestRegressor(random_state=42),
#     param_grid=param_grid,
#     cv=3,  # Reduce cross-validation folds
#     scoring='r2',
#     n_jobs=4,  # Reduce parallelism
#     verbose=2
# )

In [None]:
# grid_search.fit(X, Y)

In [None]:
# print(f"🔹 Best Parameters: {grid_search.best_params_}")

# best_rf = grid_search.best_estimator_

In [None]:
# xgb = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=10, random_state=42)
# xgb.fit(X, Y)

In [None]:
# r2_xgb = xgb.score(X_val, Y_val)
# print(f"🔹 XGBoost R²: {r2_xgb:.3f}")

In [None]:
# ensemble_model = VotingRegressor(
#     estimators=[("rf", best_rf), ("xgb", xgb)]
# )

In [None]:
# ensemble_model.fit(X, Y)

In [None]:
# r2_ensemble = ensemble_model.score(X_val, Y_val)
# print(f"🔹 Ensemble R²: {r2_ensemble:.3f}")

  Train the final model using the best parameters

In [None]:
# Retrieve the best parameters from the RandomizedSearchCV results
# best_params = rf_random.best_params_

In [None]:
# Train the final RandomForestRegressor model using the best parameters
# best_rf = RandomForestRegressor(
#     n_estimators=best_params['n_estimators'],
#     max_depth=best_params['max_depth'],
#     min_samples_split=best_params['min_samples_split'],
#     min_samples_leaf=best_params['min_samples_leaf'],
#     max_features=best_params['max_features'],
#     random_state=42
# )

In [None]:
# Fit the model on the full training dataset
# best_rf.fit(X, Y)

In [None]:
#Display confirmation message
# print("✅ Final Random Forest model trained successfully with the best parameters!")