In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import logging

In [2]:
# Configure logging once for the notebook: INFO and above, with each record
# rendered as "<timestamp> - <level> - <message>".
log_format = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=log_format, level=logging.INFO)

# Logger used by the cells below.
logger = logging.getLogger(__name__)

In [4]:
# Folder holding the Rossmann CSV files.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will not run elsewhere without editing it.
data_dir = 'C:\\Users\\Abdilala\\Documents\\GitHub\\Data-weak4\\Data\\rossmann-store-sales'

# low_memory=False makes pandas read each file in one pass rather than in
# chunks, which avoids mixed-dtype guesses within a column.
read_options = {'low_memory': False}
train_data = pd.read_csv(data_dir + '\\train.csv', **read_options)
test_data = pd.read_csv(data_dir + '\\test.csv', **read_options)

In [5]:
# Emit an INFO record marking the start of the preprocessing steps.
logger.info("Starting data preprocessing.")

2025-01-03 11:39:26,922 - INFO - Starting data preprocessing.


In [7]:
# Step 1: Handle NaN values
# Each missing entry takes the last valid value above it in the same column.
# Missing entries with no valid value above them (e.g. in the first row) are
# left as NaN by a forward fill.
logger.info("Handling NaN values.")
train_data = train_data.ffill()

2025-01-03 11:41:08,845 - INFO - Handling NaN values.


In [8]:
# Feature extraction from datetime columns
# Emit an INFO record before the date-derived features are built.
logger.info("Extracting features from datetime columns.")

2025-01-03 11:41:44,518 - INFO - Extracting features from datetime columns.


In [9]:
# Parse the 'Date' column into pandas datetimes so the .dt accessor can be
# used on it in the cells below.
train_data['Date'] = train_data['Date'].pipe(pd.to_datetime)

In [10]:
# Create new features derived from the parsed 'Date' column.
sale_dates = train_data['Date']
train_data['Weekday'] = sale_dates.dt.weekday  # 0 = Monday, 6 = Sunday
train_data['Is_Weekend'] = (train_data['Weekday'] >= 5).astype(int)

# Holiday distance features, measured against Christmas (25 December).
#
# The earlier version anchored both columns to the single literal date
# 2023-12-25. That made 'Days_to_Holiday' a plain reversed day counter and
# forced 'Days_After_Holiday' to 0 for every row dated before that day.
# NOTE(review): assumes the training dates fall before 2023 -- confirm
# against the data.
#
# The anchor is now chosen per row:
#   * Days_to_Holiday    -> days until the next 25 December on or after the date
#   * Days_After_Holiday -> days since the most recent 25 December on or before it
# Both are 0 on 25 December itself. The computation is vectorised instead of
# calling a Python lambda for every row.
sale_years = sale_dates.dt.year
christmas_prev_year, christmas_same_year, christmas_next_year = (
    pd.to_datetime(pd.DataFrame({'year': sale_years + year_shift, 'month': 12, 'day': 25}))
    for year_shift in (-1, 0, 1)
)

# Pick this year's Christmas when it is still ahead (or today), otherwise next year's.
upcoming_christmas = christmas_same_year.where(
    sale_dates <= christmas_same_year, christmas_next_year
)
# Pick this year's Christmas when it has already happened (or is today), otherwise last year's.
latest_christmas = christmas_same_year.where(
    sale_dates >= christmas_same_year, christmas_prev_year
)

train_data['Days_to_Holiday'] = (upcoming_christmas - sale_dates).dt.days
train_data['Days_After_Holiday'] = (sale_dates - latest_christmas).dt.days

In [11]:
# Beginning, mid, and end of the month
# Three 0/1 indicator columns based on the day of the month:
# days 1-10, days 11-20, and day 21 onwards.
day_of_month = train_data['Date'].dt.day
train_data['Beginning_of_Month'] = day_of_month.le(10).astype(int)
train_data['Mid_of_Month'] = day_of_month.between(11, 20).astype(int)  # both bounds inclusive
train_data['End_of_Month'] = day_of_month.gt(20).astype(int)


In [12]:
# Expose the calendar month and year of each date as separate feature columns.
date_parts = train_data['Date'].dt
train_data['Month'] = date_parts.month  # Month as a feature
train_data['Year'] = date_parts.year

In [13]:
# Emit an INFO record once the date-derived feature columns have been added.
logger.info("Feature extraction completed.")

2025-01-03 12:16:13,309 - INFO - Feature extraction completed.


In [14]:
logger.info("Scaling the data.")

# Engineered columns handed to the scaler, in the order they are passed.
features_to_scale = [
    'Weekday',
    'Is_Weekend',
    'Days_to_Holiday',
    'Days_After_Holiday',
    'Beginning_of_Month',
    'Mid_of_Month',
    'End_of_Month',
    'Month',
    'Year',
]

2025-01-03 12:16:16,887 - INFO - Scaling the data.


In [15]:
# Fit a StandardScaler on the selected columns of the training frame and
# write the transformed values back over the original columns.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(train_data[features_to_scale])
train_data[features_to_scale] = scaled_values

In [16]:
# Emit an INFO record once the scaled values have been written back.
logger.info("Data scaling completed.")

2025-01-03 12:16:21,923 - INFO - Data scaling completed.
