In [24]:
# Step 1: Setting Up the Environment
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import pickle
import joblib

In [56]:
# Step 2: Load the Dataset
# The CSV location lives in a named variable so the input file is easy to spot and swap.
dataset_path = "/content/validation_for students (1).csv"
data = pd.read_csv(dataset_path)

In [57]:
# Step 3: Data Preprocessing and Feature Engineering

In [58]:
# Count the missing entries in every column (isna is the canonical name for isnull)
null_mask = data.isna()
missing_values = null_mask.sum()
missing_values

index                  0
store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
dtype: int64

In [59]:
# List the columns held as object/category dtypes; these need parsing or encoding later on
non_numeric = data.select_dtypes(include=['object', 'category'])
categorical_columns = list(non_numeric.columns)
print(categorical_columns)

['date', 'state_holiday']


# **WHY AM I USING THIS PIECE OF CODE?**

**Numerical Columns:** Using the median value is especially useful when dealing with outliers. The median is less sensitive to extreme values than the mean, making it a suitable choice for imputing missing values in numerical data.

**Categorical/Binary Columns**: Filling with the mode (the most frequent value) keeps every imputed entry a valid, already-observed category and largely preserves the column's distribution when only a few values are missing.

In [61]:
# Impute missing values by assigning the filled column back to the frame.
# The earlier form, `data[col].fillna(value, inplace=True)`, works on a column selection:
# pandas 2.x emits a chained-assignment warning for it, and under Copy-on-Write it no longer
# modifies `data` at all.

# Handling missing values for numerical columns (median is robust to outliers)
numerical_cols = ['nb_customers_on_day']  # here for numerical values
for col in numerical_cols:
    median_value = data[col].median()
    data[col] = data[col].fillna(median_value)

# Handling missing values for categorical/binary columns (most frequent value)
categorical_cols = ['open', 'promotion', 'state_holiday', 'school_holiday']  # here for categorical values
for col in categorical_cols:
    modes = data[col].mode()
    if modes.empty:
        # Column is entirely missing: there is no mode to fill with, so leave it untouched
        continue
    mode_value = modes.iloc[0]
    data[col] = data[col].fillna(mode_value)


In [62]:
# Re-count the missing entries per column to confirm the imputation left no gaps
remaining_missing = data.isna().sum(axis=0)
remaining_missing

index                  0
store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
dtype: int64

In [63]:
# One-hot encode the categorical variables
# (only 'state_holiday' for now; append further column names to the list if needed)
categorical_cols = ['state_holiday']
data_encoded = pd.get_dummies(data=data, columns=categorical_cols)

In [65]:
# Parse the day/month/year strings into timestamps (explicit format avoids ambiguous parsing)
data_encoded['date'] = pd.to_datetime(data_encoded['date'], format='%d/%m/%Y')

# Extracting features from the date column
# Seconds since the Unix epoch, computed with timedelta arithmetic. The earlier
# `astype(int) // 10**9` only gave seconds when the column was stored at nanosecond
# resolution and when the platform `int` was 64-bit; this form depends on neither.
unix_epoch = pd.Timestamp('1970-01-01')
data_encoded['date_numeric'] = (data_encoded['date'] - unix_epoch) // pd.Timedelta(seconds=1)

# Calendar components taken from the parsed timestamp
data_encoded['month'] = data_encoded['date'].dt.month
data_encoded['day'] = data_encoded['date'].dt.day
data_encoded['year'] = data_encoded['date'].dt.year
data_encoded['weekday'] = data_encoded['date'].dt.weekday  # Monday=0 ... Sunday=6

In [66]:
# Interaction feature: the customer count multiplied element-wise by the 'open' column
customer_counts = data_encoded['nb_customers_on_day']
open_flag = data_encoded['open']
data_encoded['customers_open_interaction'] = customer_counts.mul(open_flag)

In [67]:
from sklearn.preprocessing import MinMaxScaler

# Log-transformed copy of the customer count, useful if its distribution is skewed.
# It must be derived BEFORE the column is rescaled below; re-running this cell on an
# already-scaled frame would take the log of the scaled values instead.
raw_customers = data_encoded['nb_customers_on_day']
data_encoded['log_customers'] = np.log1p(raw_customers)

# Min-max scale the customer count; `scaler` stays in scope for later cells.
# NOTE(review): the scaler is fitted on whatever frame is currently loaded — if this is the
# validation file, the scaling will differ from the one seen at training time; confirm intent.
scaler = MinMaxScaler()
scaler.fit(data_encoded[['nb_customers_on_day']])
data_encoded['nb_customers_on_day'] = scaler.transform(data_encoded[['nb_customers_on_day']])

In [68]:
# Remove the parsed timestamp column in place now that its derived features exist
del data_encoded['date']

In [None]:
# Step 4: Data Splitting
#X = data_encoded
#y = data_encoded['sales']  # Target variable

# Splitting the dataset into 80% training and 20% testing
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [38]:
# Step 5: Model Building. Option 1
# model = RandomForestRegressor(n_estimators=50, max_depth=10)

In [39]:
# model.fit(X_train, y_train)

I need to run everything again, except the step that drops the `sales` column.

## GRADIENT BOOSTING

In [40]:
# Step 5: Model Building. Option 2
from sklearn.ensemble import GradientBoostingRegressor

# n_estimators / max_depth can be adjusted depending on how long training may take.
# random_state pins the estimator's internal randomness so repeated runs build the same
# model; 0 matches the random_state used for the train/test split in Step 4.
model = GradientBoostingRegressor(n_estimators=30, max_depth=10, random_state=0)

# NOTE(review): X_train / y_train come from the Step 4 split, which is currently commented
# out — on a fresh kernel this line raises NameError until that split is restored.
model.fit(X_train, y_train)

## K-NEAREST NEIGHBORS (KNN)

In [41]:
# Step 5: Model Building - Using k-Nearest Neighbors Regressor. Option 3
#from sklearn.neighbors import KNeighborsRegressor

#model_knn = KNeighborsRegressor(n_neighbors=5)
#model_knn.fit(X_train, y_train)

In [49]:
# Step 6: Model Evaluation and Saving
# Predict on the held-out features, then report the coefficient of determination
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R2 Score: {r2}")

R2 Score: 0.9585409680436666


In [50]:
# Expected sales range, kept as named constants for reference.
# They are intentionally NOT used to transform the predictions (see below).
min_sales = 0
max_sales = 10000

# Per the Step 4 split (`y = data_encoded['sales']`) the target is the raw sales column and
# is never scaled, so `y_pred` is already expressed in sales units.
# The earlier version of this cell passed the predictions through `scaler.inverse_transform`,
# but `scaler` was fitted on 'nb_customers_on_day': that maps the values onto the
# customer-count range and yields numbers with no meaning as sales. No inverse scaling applies.
# NOTE(review): if the target is ever scaled before training, fit a dedicated scaler on the
# target and invert with that one here.
sales_predictions = np.asarray(y_pred).ravel().copy()

In [51]:
# Persist the fitted model to disk as a pickle file
with open('sales_prediction_model.pkl', mode='wb') as pickle_out:
    pickle.dump(model, pickle_out)

In [52]:
# Build the output table: the row index of X_test next to the matching sales prediction
output_columns = {'index': X_test.index, 'sales_prediction': y_pred}
output = pd.DataFrame(output_columns)

# Write the table to CSV without the frame's own index (the 'index' column already carries it)
output.to_csv('sales_predictions.csv', index=False)

WITH THE VALIDATION DATA SET - AFTER 4PM

In [None]:
# load the pickle model
# predict using the model and the data set
# then save the predictions.

In [73]:
# Load the model with pickle — the same library that wrote the file (see the pickle.dump
# cell above) — rather than joblib, so the save and load paths use one serialization format.
# NOTE: unpickling can execute arbitrary code; only load model files you created yourself.
with open('sales_prediction_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [74]:
# Run the loaded model on X_test to obtain the predictions.
# NOTE(review): no live cell in this notebook defines X_test (the Step 4 split is commented
# out), and this section is headed "WITH THE VALIDATION DATA SET" — presumably the validation
# features prepared in `data_encoded` were intended here; confirm before relying on the output.
predictions = loaded_model.predict(X_test)

In [77]:
# Wrap the predictions in a single-column frame that shares the row index of X_test
predicted_sales = pd.Series(predictions, index=X_test.index, name='Predicted_sales')
predictions_df = predicted_sales.to_frame()

# Write the frame to CSV; the index is written out as the first column
predictions_df.to_csv('predicted_sales.csv')

In [53]:
!pip install pipreqs



In [54]:
# Shell command: run pipreqs against the /content directory.
# The next cell reads /content/requirements.txt, presumably the file this command produces.
!pipreqs /content



In [55]:
# Shell command: print the contents of /content/requirements.txt for inspection
!cat /content/requirements.txt


