In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the raw sales transactions. The file may carry a leftover index column
# named 'Unnamed: 0'; errors='ignore' keeps the cell runnable when it is
# absent, matching how the temperature readings are loaded further down.
sales_df = pd.read_csv('sales.csv').drop(columns=['Unnamed: 0'], errors='ignore')
sales_df.head()

In [None]:
# Load the stock-level sensor readings. The file may carry a leftover index
# column named 'Unnamed: 0'; errors='ignore' keeps the cell runnable when it is
# absent, matching how the temperature readings are loaded further down.
stock_df = pd.read_csv('sensor_stock_levels.csv').drop(columns=['Unnamed: 0'], errors='ignore')
stock_df.head()

In [None]:
# Read the storage-temperature sensor readings; the 'Unnamed: 0' column is
# removed when present and silently skipped otherwise.
temp_df = (
    pd.read_csv('sensor_storage_temperature.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
temp_df.head()

In [None]:
# Print column dtypes and non-null counts for each raw table in turn.
for frame in (sales_df, stock_df, temp_df):
    frame.info()

In [None]:
# Number of missing values in each column of the sales table.
sales_df.isnull().sum()

In [None]:
# Number of missing values in each column of the stock-level table.
stock_df.isnull().sum()

In [None]:
# Number of missing values in each column of the temperature table.
temp_df.isnull().sum()

In [None]:
def convert_to_datetime(data: pd.DataFrame = None, column: str = None,
                        fmt: str = '%Y-%m-%d %H:%M:%S'):
    """Return a copy of ``data`` whose ``column`` is parsed into datetimes.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the column to convert.
    column : str
        Name of the column holding the timestamp values.
    fmt : str, optional
        ``strftime``-style pattern handed to ``pd.to_datetime``. Defaults to
        ``'%Y-%m-%d %H:%M:%S'``, the pattern this function previously
        hard-coded. Pass ``None`` to let pandas infer the format.

    Returns
    -------
    pd.DataFrame
        Copy of ``data`` with ``column`` converted by ``pd.to_datetime``.
    """
    converted = data.copy()
    converted[column] = pd.to_datetime(converted[column], format=fmt)
    return converted

In [None]:
# Parse the 'timestamp' column of every table into real datetimes.
sales_df = convert_to_datetime(sales_df, 'timestamp')
stock_df = convert_to_datetime(stock_df, 'timestamp')
temp_df = convert_to_datetime(temp_df, 'timestamp')

# Only the last expression of a cell is rendered, so the dtypes of all three
# tables are combined into a single frame (NaN marks a column that a given
# table does not have) instead of evaluating `.dtypes` three times.
pd.concat(
    {
        'sales_df': sales_df.dtypes,
        'stock_df': stock_df.dtypes,
        'temp_df': temp_df.dtypes,
    },
    axis=1,
)

In [None]:
from datetime import datetime

def convert_to_hourly(data: pd.DataFrame = None, column: str = None):
    """Return a copy of ``data`` with ``column`` truncated to the hour.

    Minutes and seconds are zeroed (e.g. ``09:51:38`` becomes ``09:00:00``).
    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the timestamp column.
    column : str
        Name of the column to truncate. It is passed through
        ``pd.to_datetime`` first, so an already-parsed datetime column is
        used as is.

    Returns
    -------
    pd.DataFrame
        Copy of ``data`` whose ``column`` holds hour-floored datetimes.
        Missing timestamps (``NaT``) are kept as ``NaT``.
    """
    hourly = data.copy()
    # Vectorised floor replaces a per-row strftime/strptime round trip, which
    # was slow and raised on NaT values.
    hourly[column] = pd.to_datetime(hourly[column]).dt.floor('h')
    return hourly

In [None]:
# Truncate every sales timestamp to the start of its hour.
sales_df = sales_df.pipe(convert_to_hourly, column='timestamp')
sales_df.head()

In [None]:
# Truncate every stock-level timestamp to the start of its hour.
stock_df = stock_df.pipe(convert_to_hourly, column='timestamp')
stock_df.head()

In [None]:
# Truncate every temperature timestamp to the start of its hour.
temp_df = temp_df.pipe(convert_to_hourly, column='timestamp')
temp_df.head()

In [None]:
# Total quantity sold for each (timestamp, product_id) pair.
sales_agg = (
    sales_df
    .groupby(['timestamp', 'product_id'])['quantity']
    .sum()
    .reset_index()
)
sales_agg.head()

In [None]:
# Mean estimated stock percentage for each (timestamp, product_id) pair.
stock_agg = (
    stock_df
    .groupby(['timestamp', 'product_id'])['estimated_stock_pct']
    .mean()
    .reset_index()
)
stock_agg.head()

In [None]:
# Mean storage temperature for each timestamp.
temp_agg = (
    temp_df
    .groupby('timestamp')['temperature']
    .mean()
    .reset_index()
)
temp_agg.head()

In [None]:
# Keep every stock observation and attach the matching sales quantity;
# rows without a matching sale get NaN in 'quantity'.
merged_df = pd.merge(
    stock_agg,
    sales_agg,
    how='left',
    on=['timestamp', 'product_id'],
)
merged_df.head()

In [None]:
# Attach the aggregated temperature of the same timestamp to every row.
merged_df = pd.merge(
    merged_df,
    temp_agg,
    how='left',
    on=['timestamp'],
)
merged_df.head()

In [None]:
# A missing quantity comes from the left join finding no sale, so it is
# recorded as zero units.
merged_df = merged_df.assign(quantity=merged_df['quantity'].fillna(0))
merged_df.info()

In [None]:
# Per-product lookup tables used to enrich the merged frame.
# De-duplicate on the join key itself: dropping duplicates on the
# (product_id, value) pair would leave several rows for a product whose
# category or unit price is not constant, and the later left merges on
# 'product_id' would then multiply rows. The first observed value is kept.
product_categories = (
    sales_df[['product_id', 'category']]
    .drop_duplicates(subset='product_id')
)

product_price = (
    sales_df[['product_id', 'unit_price']]
    .drop_duplicates(subset='product_id')
)

In [None]:
# Attach each product's category to the merged rows.
merged_df = pd.merge(
    merged_df,
    product_categories,
    how='left',
    on='product_id',
)
merged_df.head()

In [None]:
# Attach each product's unit price to the merged rows.
merged_df = pd.merge(
    merged_df,
    product_price,
    how='left',
    on='product_id',
)
merged_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the category labels with the integer codes produced by LabelEncoder.
le = LabelEncoder()
merged_df = merged_df.assign(category=le.fit_transform(merged_df['category']))
merged_df.head()

In [None]:
# Derive numeric calendar features from the timestamp, then drop the raw
# datetime column.
merged_df = (
    merged_df
    .assign(
        timestamp_day_of_month=lambda frame: frame['timestamp'].dt.day,
        timestamp_day_of_week=lambda frame: frame['timestamp'].dt.dayofweek,
        timestamp_day_of_hour=lambda frame: frame['timestamp'].dt.hour,
    )
    .drop(columns=['timestamp'])
)
merged_df.head()

In [None]:
# Remove the product identifier column from the modelling frame.
merged_df = merged_df.drop(columns='product_id')
merged_df.head()

In [None]:
# Target: estimated stock percentage. Features: every remaining column.
y = merged_df['estimated_stock_pct']
X = merged_df.drop('estimated_stock_pct', axis='columns')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest so the fitted trees, and therefore the score and the feature
# importances reported below, are identical on every Restart & Run All.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features using statistics learned from the training split
# only, then apply the same transform to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The model was fitted on the unscaled features before this cell runs, while
# the prediction and scoring cells use the scaled X_test. Refit on the scaled
# training data so training and evaluation share one feature space.
model = model.fit(X_train, y_train)

In [None]:
# Predicted stock percentage for every held-out row.
y_pred = model.predict(X=X_test)

In [None]:
# Score of the fitted regressor on the held-out split.
model.score(X=X_test, y=y_test)

In [None]:
from sklearn.metrics import mean_absolute_error

# Repeated hold-out evaluation: every fold draws its own train/test split,
# standardises with statistics from that fold's training rows, fits a fresh
# forest and records the mean absolute error on the held-out rows.
# Previously the loop recomputed one and the same MAE ten times, and the
# append/print statements sat outside the loop, so only a single value was
# ever stored and reported.
# Fold-local names are used so the `model`, `X_test` and `y_pred` objects
# defined in earlier cells are left untouched.
accuracy = []
for fold in range(0, 10):
    fold_X_train, fold_X_test, fold_y_train, fold_y_test = train_test_split(
        X, y, train_size=0.75, random_state=fold
    )

    fold_scaler = StandardScaler().fit(fold_X_train)
    fold_model = RandomForestRegressor(random_state=fold)
    fold_model.fit(fold_scaler.transform(fold_X_train), fold_y_train)
    fold_pred = fold_model.predict(fold_scaler.transform(fold_X_test))

    mae = mean_absolute_error(y_true=fold_y_test, y_pred=fold_pred)
    accuracy.append(mae)
    print(f"Fold {fold + 1}: MAE = {mae:.3f}")

print(f"Average MAE: {(sum(accuracy) / len(accuracy)):.2f}")

In [None]:
# Rank the features by the fitted model's importance scores, ascending, so the
# most important feature ends up at the top of the horizontal bar chart.
features = [name.split("__")[0] for name in X.columns]
importances = model.feature_importances_
indices = np.argsort(importances)

# Draw on the explicit Axes object instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title('Feature Importances')
ax.barh(range(len(indices)), importances[indices], color='b', align='center')
ax.set_yticks(range(len(indices)))
ax.set_yticklabels([features[i] for i in indices])
ax.set_xlabel('Relative Importance')
plt.show()