# In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
from datetime import datetime
import pandas as pd

# Task 2.1: Preprocessing
# Read the raw CSV files. low_memory=False makes pandas parse train.csv in one
# pass instead of in chunks, so each column gets a single inferred dtype.
train_df = pd.read_csv("../data/rossmann_store_sales/train.csv", low_memory=False)
store_df = pd.read_csv("../data/rossmann_store_sales/store.csv")
sample_submission_df = pd.read_csv("../data/rossmann_store_sales/sample_submission.csv")

# Attach the per-store attributes to every training row. The left join keeps
# all training rows, even if a Store id has no match in store.csv.
train_df = pd.merge(train_df, store_df, how='left', on='Store')

# Extract datetime features
train_df['Date'] = pd.to_datetime(train_df['Date'])
dates = train_df['Date']

# pandas convention: Monday=0 ... Sunday=6. This replaces any DayOfWeek column
# that was already present in the loaded data.
train_df['DayOfWeek'] = dates.dt.dayofweek
# isocalendar() yields the nullable UInt32 extension dtype; cast to a plain
# NumPy integer so the column matches the other numeric features.
# NOTE(review): the cast raises if Date contains missing values (NaT) —
# assumes every row has a valid date; confirm.
train_df['WeekOfYear'] = dates.dt.isocalendar().week.astype('int64')
train_df['Month'] = dates.dt.month
train_df['Year'] = dates.dt.year

# Saturday (5) and Sunday (6). A vectorized comparison replaces the per-row
# lambda; the resulting 0/1 values are unchanged.
train_df['IsWeekend'] = (train_df['DayOfWeek'] >= 5).astype('int64')

# Split each month into three boolean flags by day of month:
# days 1-10, days 11-20 (inclusive), and day 21 onwards.
day_of_month = dates.dt.day
train_df['IsBeginningOfMonth'] = day_of_month <= 10
train_df['IsMidMonth'] = day_of_month.between(11, 20)
train_df['IsEndOfMonth'] = day_of_month >= 21

# Encode categorical variables
# One LabelEncoder per categorical column, stored under the column name.
label_encoders = {
    name: LabelEncoder()
    for name in ('StateHoliday', 'StoreType', 'Assortment', 'PromoInterval')
}
for col, le in label_encoders.items():
    # Cast to str first so the encoder sees a single type; missing values
    # become the string 'nan' and are encoded as their own category.
    train_df[col] = le.fit_transform(train_df[col].astype(str))

# Scale the data
scaler = StandardScaler()
numeric_cols = ['Customers', 'CompetitionDistance', 'Promo2SinceWeek', 'Promo2SinceYear', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear']

# Fill gaps before scaling. StandardScaler disregards NaN when fitting and
# passes it through unchanged on transform, so without this step missing
# values would survive into the model features (RandomForestRegressor rejects
# NaN on scikit-learn releases before 1.4).
# NOTE(review): the column median is used as a neutral placeholder; presumably
# a missing value means "no competitor" / "no Promo2" — confirm whether a
# sentinel value would represent that better.
train_df[numeric_cols] = train_df[numeric_cols].fillna(train_df[numeric_cols].median())

# NOTE(review): the scaler (and the median above) is fitted on all rows before
# the train/test split, so held-out rows influence the statistics. The model
# pipeline defined further down also applies its own StandardScaler, which
# makes this step redundant for modelling — consider dropping one of the two.
train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])

# Task 2.2: Build models with sklearn pipelines and hyperparameter tuning
# Target is Sales; the features are every other column except the raw Date.
y = train_df['Sales']
X = train_df.drop(columns=['Sales', 'Date'])

# Split data: hold out 20% of the rows, with a fixed seed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define the pipeline with hyperparameter tuning using GridSearchCV.
# The step names 'scaler' and 'regressor' are the prefixes used in parameter
# grids (e.g. 'regressor__n_estimators').
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

# param_grid = {
#     'regressor__n_estimators': [100, 200],
#     'regressor__max_depth': [None, 10, 20],
#     'regressor__min_samples_split': [2, 5],
#     'regressor__min_samples_leaf': [1, 2]
# }

# grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error')
# grid_search.fit(X_train, y_train)

# best_pipeline = grid_search.best_estimator_

# # Evaluate model with best parameters
# y_pred = best_pipeline.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# mae = mean_absolute_error(y_test, y_pred)

# print(f"Best Parameters: {grid_search.best_params_}")
# print(f"Mean Squared Error: {mse}")
# print(f"Mean Absolute Error: {mae}")

# # Task 2.4: Post Prediction Analysis
# feature_importances = best_pipeline.named_steps['regressor'].feature_importances_
# plt.figure(figsize=(12, 6))
# plt.bar(X.columns, feature_importances)
# plt.title("Feature Importances")
# plt.xticks(rotation=90)
# plt.show()

# # Display feature importance values as text output
# feature_importance_dict = dict(zip(X.columns, feature_importances))
# sorted_feature_importance = dict(sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True))
# print("\n--- Feature Importances ---")
# for feature, importance in sorted_feature_importance.items():
#     print(f"{feature}: {importance:.4f}")

# # Confidence intervals
# pred_std = np.std([tree.predict(X_test) for tree in best_pipeline.named_steps['regressor'].estimators_], axis=0)
# conf_interval = 1.96 * pred_std

# plt.figure(figsize=(10, 5))
# plt.plot(y_pred[:100], label="Predictions")
# plt.fill_between(range(100), y_pred[:100] - conf_interval[:100], y_pred[:100] + conf_interval[:100], color='b', alpha=0.2)
# plt.legend()
# plt.title("Confidence Interval for Predictions")
# plt.show()

# # Task 2.5: Serialize models
# timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
# model_filename = f"rossmann_model_{timestamp}.pkl"
# joblib.dump(best_pipeline, model_filename)