Build Model

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the sales table from a CSV file located in the working directory.
sales_per_day_per_store = pd.read_csv("sales_per_day_per_store.csv")

In [None]:
# Derive calendar features from 'date', discard the raw column, then one-hot
# encode the categorical columns. Written as a single chain so the name is
# rebound once with the finished frame.
sales_per_day_per_store = (
    sales_per_day_per_store
    # Parse 'date' so the .dt accessor can be used in the next step.
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        day=lambda frame: frame['date'].dt.day,
        month=lambda frame: frame['date'].dt.month,
        weekday=lambda frame: frame['date'].dt.weekday,
    )
    # The raw timestamp is removed once its parts have been extracted.
    .drop(columns=['date'])
    # Encoded columns: 'city', 'state', 'type' and 'is_holiday';
    # drop_first=True omits the first level of each of them.
    .pipe(
        pd.get_dummies,
        columns=['city', 'state', 'type', 'is_holiday'],
        drop_first=True,
    )
)

In [None]:
# Target is the 'sales' column; the features are every other column except
# 'transactions'.
y = sales_per_day_per_store['sales']
X = sales_per_day_per_store.drop(columns=['sales', 'transactions'])

# Stage 1: hold out 30% of the rows; the remaining 70% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Stage 2: halve the hold-out into validation and test sets
# (15% of all rows each).
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted scaler then transforms the validation and test splits,
# so the held-out rows do not influence the scaling parameters.
# (StandardScaler is imported in the notebook's import cell.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a linear regression on the standardized training features.
# (LinearRegression is already imported in the notebook's import cell.)
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predictions for the validation split.
y_valid_pred = model.predict(X_valid_scaled)

In [None]:
# Predict on each split with the fitted model, in train / validation / test
# order.
y_train_pred, y_valid_pred, y_test_pred = (
    model.predict(features)
    for features in (X_train_scaled, X_valid_scaled, X_test_scaled)
)

In [None]:
def rmsle(y_true, y_pred):
    """
    Calculate Root Mean Squared Logarithmic Error (RMSLE).

    RMSLE = sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2))

    Parameters:
    y_true (array-like): Actual values
    y_pred (array-like): Predicted values

    Both inputs must contain values greater than -1 for the logarithm to be
    finite.

    Returns:
    float: RMSLE score
    """
    # Convert inputs to numpy arrays
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # np.log1p(x) already evaluates log(1 + x), so the values are passed
    # unchanged; adding 1 beforehand would compute log(2 + x) instead.
    log_true = np.log1p(y_true)
    log_pred = np.log1p(y_pred)

    # Root of the mean squared difference of the logarithms
    return np.sqrt(np.mean((log_pred - log_true) ** 2))

In [None]:
# Evaluate on the validation set: RMSLE and R-squared.
# Negative predictions are clipped to zero before scoring, since rmsle()
# takes a logarithm of the predictions.
y_valid_pred = np.maximum(0, y_valid_pred)
rmsle_valid = rmsle(y_valid, y_valid_pred)
# Legacy name kept so any later cell that reads it keeps working; the value
# is an RMSLE, not a mean squared error.
mse_valid = rmsle_valid
r2_valid = r2_score(y_valid, y_valid_pred)

print(f'Validation RMSLE: {rmsle_valid}')
print(f'Validation R-squared: {r2_valid}')

Validation Mean Squared Error: 2.455173073447924
Validation R-squared: 0.5161357508648614


In [None]:
# Make predictions on the test data
y_test_pred = model.predict(X_test_scaled)
# Negative predictions are clipped to zero before scoring, since rmsle()
# takes a logarithm of the predictions.
y_test_pred = np.maximum(0, y_test_pred)

# Evaluate on the test set: RMSLE and R-squared.
rmsle_test = rmsle(y_test, y_test_pred)
# Legacy name kept so any later cell that reads it keeps working; the value
# is an RMSLE, not a mean squared error.
mse_test = rmsle_test
r2_test = r2_score(y_test, y_test_pred)

print(f'Test RMSLE: {rmsle_test}')
print(f'Test R-squared: {r2_test}')

Test Mean Squared Error: 2.4484117626693247
Test R-squared: 0.515085619222155
