In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import logging

In [36]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [37]:
train_data = pd.read_csv('C:\\Users\\Abdilala\\Documents\\GitHub\\Data-weak4\\Data\\rossmann-store-sales\\train.csv', low_memory=False)
test_data = pd.read_csv('C:\\Users\\Abdilala\\Documents\\GitHub\\Data-weak4\\Data\\rossmann-store-sales\\test.csv', low_memory=False)

In [38]:
logger.info("Starting data preprocessing.")

2025-01-04 13:29:00,155 - INFO - Starting data preprocessing.


In [39]:
#task 2.1 preprocessing
def preprocess_data(data):
    # Handle NaN values
    data.ffill(inplace=True)  # Forward fill; adjust as necessary

    # Convert 'Date' to datetime
    data['Date'] = pd.to_datetime(data['Date'])

    # Feature extraction
    data['Weekday'] = data['Date'].dt.weekday
    data['Is_Weekend'] = (data['Weekday'] >= 5).astype(int)
    data['Days_to_Holiday'] = data['Date'].apply(lambda x: (pd.to_datetime('2023-12-25') - x).days)
    data['Days_After_Holiday'] = data['Date'].apply(lambda x: (x - pd.to_datetime('2023-12-25')).days if x >= pd.to_datetime('2023-12-25') else 0)
    data['Beginning_of_Month'] = (data['Date'].dt.day <= 10).astype(int)
    data['Mid_of_Month'] = ((data['Date'].dt.day > 10) & (data['Date'].dt.day <= 20)).astype(int)
    data['End_of_Month'] = (data['Date'].dt.day > 20).astype(int)
    data['Month'] = data['Date'].dt.month
    data['Year'] = data['Date'].dt.year

    return data

In [40]:
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

In [41]:
logger.info("Data preprocessing completed.")

2025-01-04 14:13:27,415 - INFO - Data preprocessing completed.


In [42]:
train_data['Date'] = pd.to_datetime(train_data['Date']).apply(lambda x: x.toordinal())
test_data['Date'] = pd.to_datetime(test_data['Date']).apply(lambda x: x.toordinal())

In [43]:
test_data_encod = test_data.copy()

In [44]:
#StateHoliday have catagorical data need to encode into numeric 
test_data = pd.DataFrame({'StateHoliday': [0, 'b', 'c', 'a', 0]})

In [45]:
test_data_encoded = test_data.copy()

In [46]:
holiday_mapping = {
    0: 0,
    'a': 1,
    'b': 2,
   'c': 3
}


In [47]:
# Apply the mapping to both train and test data
train_data['StateHoliday'] = train_data['StateHoliday'].map(holiday_mapping)
#test_data['StateHoliday'] = test_data['StateHoliday'].map(holiday_mapping)
test_data_encoded['StateHoliday'] = test_data_encoded['StateHoliday'].map(holiday_mapping)

In [48]:
print("Original test_data:")
print(test_data)
print("\nEncoded test_data:")
print(test_data_encoded)
print("original data")
print(test_data_encod)

Original test_data:
  StateHoliday
0            0
1            b
2            c
3            a
4            0

Encoded test_data:
   StateHoliday
0             0
1             2
2             3
3             1
4             0
original data
          Id  Store  DayOfWeek    Date  Open  Promo StateHoliday  \
0          1      1          4  735858   1.0      1            0   
1          2      3          4  735858   1.0      1            0   
2          3      7          4  735858   1.0      1            0   
3          4      8          4  735858   1.0      1            0   
4          5      9          4  735858   1.0      1            0   
...      ...    ...        ...     ...   ...    ...          ...   
41083  41084   1111          6  735811   1.0      0            0   
41084  41085   1112          6  735811   1.0      0            0   
41085  41086   1113          6  735811   1.0      0            0   
41086  41087   1114          6  735811   1.0      0            0   
41087  41088

In [49]:
# Check the results
print("Training Data:")
print(train_data)

print("\nTest Data:")
print(test_data)
print("\nTrain copy")
print(test_data_encod)

Training Data:
         Store  DayOfWeek    Date  Sales  Customers  Open  Promo  \
0            1          5  735810   5263        555     1      1   
1            2          5  735810   6064        625     1      1   
2            3          5  735810   8314        821     1      1   
3            4          5  735810  13995       1498     1      1   
4            5          5  735810   4822        559     1      1   
...        ...        ...     ...    ...        ...   ...    ...   
1017204   1111          2  734869      0          0     0      0   
1017205   1112          2  734869      0          0     0      0   
1017206   1113          2  734869      0          0     0      0   
1017207   1114          2  734869      0          0     0      0   
1017208   1115          2  734869      0          0     0      0   

         StateHoliday  SchoolHoliday  Weekday  Is_Weekend  Days_to_Holiday  \
0                 NaN              1        4           0             3069   
1           

In [50]:
print("Training Data Columns:")
print(train_data.columns.tolist())
print(test_data.columns.tolist())
print(test_data_encoded.columns.tolist())
print(test_data_encod.columns.tolist())

Training Data Columns:
['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'Weekday', 'Is_Weekend', 'Days_to_Holiday', 'Days_After_Holiday', 'Beginning_of_Month', 'Mid_of_Month', 'End_of_Month', 'Month', 'Year']
['StateHoliday']
['StateHoliday']
['Id', 'Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'Weekday', 'Is_Weekend', 'Days_to_Holiday', 'Days_After_Holiday', 'Beginning_of_Month', 'Mid_of_Month', 'End_of_Month', 'Month', 'Year']


In [23]:
X_train = train_data.drop(columns=['Sales'])

In [24]:
X_train = train_data.drop(columns=['Sales']) 
y_train = train_data['Sales']

In [56]:
test_data_encod.drop(columns=['Id'])

Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,Weekday,Is_Weekend,Days_to_Holiday,Days_After_Holiday,Beginning_of_Month,Mid_of_Month,End_of_Month,Month,Year
0,1,4,735858,1.0,1,0,0,3,0,3021,0,0,1,0,9,2015
1,3,4,735858,1.0,1,0,0,3,0,3021,0,0,1,0,9,2015
2,7,4,735858,1.0,1,0,0,3,0,3021,0,0,1,0,9,2015
3,8,4,735858,1.0,1,0,0,3,0,3021,0,0,1,0,9,2015
4,9,4,735858,1.0,1,0,0,3,0,3021,0,0,1,0,9,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41083,1111,6,735811,1.0,0,0,0,5,1,3068,0,1,0,0,8,2015
41084,1112,6,735811,1.0,0,0,0,5,1,3068,0,1,0,0,8,2015
41085,1113,6,735811,1.0,0,0,0,5,1,3068,0,1,0,0,8,2015
41086,1114,6,735811,1.0,0,0,0,5,1,3068,0,1,0,0,8,2015


In [57]:
X_test = test_data_encod

In [26]:
#Create a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standard scaling
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42))  # Random Forest Regressor
])

In [27]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standard scaling
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42))  # Random Forest Regressor
])

In [28]:
#Train the model
logger.info("Training the Random Forest Regressor.")
pipeline.fit(X_train, y_train)

2025-01-04 12:34:14,445 - INFO - Training the Random Forest Regressor.


In [63]:
print("X_train columns:", X_train.columns.tolist())
X_test.drop(columns=['Id'])
print("X_test columns:", X_test.columns.tolist())

X_train columns: ['Store', 'DayOfWeek', 'Date', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'Weekday', 'Is_Weekend', 'Days_to_Holiday', 'Days_After_Holiday', 'Beginning_of_Month', 'Mid_of_Month', 'End_of_Month', 'Month', 'Year']
X_test columns: ['Id', 'Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'Weekday', 'Is_Weekend', 'Days_to_Holiday', 'Days_After_Holiday', 'Beginning_of_Month', 'Mid_of_Month', 'End_of_Month', 'Month', 'Year']


In [64]:
#Make predictions
logger.info("Making predictions on the test set.")
y_pred = pipeline.predict(X_test)

2025-01-04 14:23:08,172 - INFO - Making predictions on the test set.


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Id
Feature names seen at fit time, yet now missing:
- Customers


In [31]:
#Evaluate the model
mse = mean_squared_error(y_test, y_pred)
logger.info(f"Mean Squared Error: {mse:.2f}")

NameError: name 'y_test' is not defined

In [None]:
#Save the pipeline for future use
import joblib
joblib.dump(pipeline, 'random_forest_model_pipeline.pkl')
logger.info("Pipeline saved as 'random_forest_model_pipeline.pkl'.")