In [20]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [21]:
# Load the persisted model and the feature-name list used later to align
# the prediction matrix with the model's expected columns.
MODEL_PATH = 'random_forest_feature_selection_model.pkl'
FEATURE_NAMES_PATH = 'feature_names.pkl'
model = joblib.load(MODEL_PATH)
feature_names = joblib.load(FEATURE_NAMES_PATH)

In [22]:
# Read the future-period dataset from df_future.csv
FUTURE_DATA_PATH = 'df_future.csv'
df_future = pd.read_csv(FUTURE_DATA_PATH)

In [23]:
# Convert the 'date' column to datetime values
df_future = df_future.assign(date=pd.to_datetime(df_future['date']))

In [24]:
# Keep only the rows dated 2020-11-01 through 2020-11-14 (both bounds inclusive).
# .copy() makes the selection an independent DataFrame, so the columns added to
# df_future_filtered in later cells are not written onto a slice of df_future
# (which is what triggers pandas' SettingWithCopyWarning).
# NOTE(review): the upper bound compares against 2020-11-14 00:00:00; if 'date'
# carries a time-of-day component, later rows on the 14th are excluded — confirm.
forecast_window = (df_future['date'] >= '2020-11-01') & (df_future['date'] <= '2020-11-14')
df_future_filtered = df_future[forecast_window].copy()

In [25]:
# Drop the 'Unnamed: 0' column when it is present - this data is not necessary.
# errors='ignore' makes the drop a no-op when the column is absent, replacing the
# manual membership check; the result is always a new DataFrame.
df_future_filtered = df_future_filtered.drop(columns=['Unnamed: 0'], errors='ignore')

In [26]:
# Derive calendar features from 'date': day of week, weekend flag, day of month,
# month, ISO week of year and year.
# The columns are added with .assign(), which returns a new DataFrame instead of
# writing into a frame that may be a slice of df_future, and the weekend flag is
# computed with a vectorized comparison rather than a row-wise .apply(lambda ...).
future_dates = df_future_filtered['date']
future_day_of_week = future_dates.dt.dayofweek
df_future_filtered = df_future_filtered.assign(
    day_of_week=future_day_of_week,
    # dayofweek is 0 (Monday) .. 6 (Sunday), so >= 5 marks Saturday and Sunday;
    # cast to int64 to keep the 0/1 integer encoding of the flag.
    is_weekend=(future_day_of_week >= 5).astype('int64'),
    day_of_month=future_dates.dt.day,
    month=future_dates.dt.month,
    week_of_year=future_dates.dt.isocalendar().week,
    year=future_dates.dt.year,
)

In [27]:
# One-hot encode the categorical variables.
# NOTE(review): the encoder is fitted on the future data itself, so the generated
# columns only cover categories present in this window; presumably the encoder
# fitted during training should be persisted and reused here — confirm.
categorical_features = [
    'category_id',
    'day_of_week',
    'is_weekend',
    'day_of_month',
    'month',
    'week_of_year',
    'year',
]
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
categorical_frame = df_future_filtered[categorical_features]
encoded_features = encoder.fit_transform(categorical_frame)
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_features),
)

In [28]:
# Put the one-hot columns next to the filtered data (both re-indexed from 0 so the
# rows line up positionally), then remove the original categorical columns.
frames_to_join = [
    df_future_filtered.reset_index(drop=True),
    encoded_df.reset_index(drop=True),
]
df_future_encoded = pd.concat(frames_to_join, axis=1).drop(columns=categorical_features)

In [29]:
# Drop the 'sales_quantity' column, when present, before the remaining columns are
# imputed and scaled. errors='ignore' makes the drop a no-op when the column is
# absent, replacing the manual membership check.
df_future_encoded = df_future_encoded.drop(columns=['sales_quantity'], errors='ignore')

In [30]:
# Remove every datetime64 column before filling in missing values
datetime_only = df_future_encoded.select_dtypes(include=['datetime64'])
datetime_columns = datetime_only.columns
df_future_encoded = df_future_encoded.drop(columns=datetime_columns)

In [31]:
# Fill missing values with each column's mean.
# keep_empty_features=True keeps columns that consist entirely of missing values
# (they are filled with 0) instead of discarding them; without it the imputed array
# can have fewer columns than df_future_encoded, and the column labels applied to
# the array afterwards would no longer match.
# NOTE(review): the imputer is fitted on the future data itself; if an imputer was
# fitted during training it should presumably be persisted and reused — confirm.
imputer = SimpleImputer(strategy='mean', keep_empty_features=True)
df_future_imputed = imputer.fit_transform(df_future_encoded)

In [32]:
# Standardize the imputed features.
# NOTE(review): the scaler is fitted on the future data itself; if a scaler was
# fitted during training it should presumably be persisted and reused — confirm.
scaler = StandardScaler()
scaler.fit(df_future_imputed)
X_future = scaler.transform(df_future_imputed)

In [33]:
# Wrap the scaled array in a DataFrame labelled with the encoded frame's columns
scaled_column_labels = df_future_encoded.columns
X_future_df = pd.DataFrame(data=X_future, columns=scaled_column_labels)

In [34]:
# Align the columns with feature_names: columns not listed there are dropped and
# listed columns missing from the data are added and filled with 0.
X_future_aligned = X_future_df.reindex(feature_names, axis='columns', fill_value=0)

In [35]:
# Run the loaded model on the aligned feature matrix to obtain the predictions
future_predictions = model.predict(X_future_aligned)

In [36]:
# Collect the predictions in test_df next to the 'date' and 'sku_id' of each row
output_keys = ['date', 'sku_id']
test_df = df_future_filtered[output_keys].assign(
    predicted_sales_quantity=future_predictions,
)

In [37]:
# Write test_df to test_df.csv without the index column
PREDICTIONS_PATH = 'test_df.csv'
test_df.to_csv(PREDICTIONS_PATH, index=False)

In [38]:
# Show the first 5 rows of the result
preview_rows = test_df.head(n=5)
print(preview_rows)

        date  sku_id  predicted_sales_quantity
0 2020-11-01    1045                  1.544798
1 2020-11-02    1045                  1.689202
2 2020-11-03    1045                  1.689202
3 2020-11-04    1045                  1.689202
4 2020-11-05    1045                  1.689202
