**Import Necessary Library**

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as mpl 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from ydata_profiling import ProfileReport
import holidays 

**Loading dataset**

In [2]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

**Data profiling**

In [3]:
#view data
train_data.head(5)

Unnamed: 0,ID,date,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category,food_waste_kg
0,0,2022-12-19,196,13,27.887273,45.362854,0,0,7.740587,intermediate,dairy,28.946465
1,1,2023-11-21,244,15,10.317872,64.430475,1,0,42.311779,,MeAt,51.549053
2,4,2022-02-01,148,16,27.7143,69.046113,1,0,41.184305,Beginner,MeAt,53.008323
3,5,2023-03-19,157,19,19.173902,46.292823,6,0,41.543492,Beginner,MeAt,48.621527
4,6,2022-07-18,297,10,26.375233,79.741064,0,0,26.525097,Intermediate,MEAT,44.156984


In [5]:
# train and test data information
print(f"train data info: {train_data.info()}")
print(f"test data info: {test_data.info()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                911 non-null    int64  
 1   date              911 non-null    object 
 2   meals_served      911 non-null    int64  
 3   kitchen_staff     911 non-null    int64  
 4   temperature_C     911 non-null    float64
 5   humidity_percent  911 non-null    float64
 6   day_of_week       911 non-null    int64  
 7   special_event     911 non-null    int64  
 8   past_waste_kg     911 non-null    float64
 9   staff_experience  747 non-null    object 
 10  waste_category    911 non-null    object 
 11  food_waste_kg     911 non-null    float64
dtypes: float64(4), int64(5), object(3)
memory usage: 85.5+ KB
train data info: None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  


In [6]:
# train and test data describtion
print(f"train data describe: {train_data.describe()}")

train data describe:                 ID  meals_served  kitchen_staff  temperature_C  \
count   911.000000    911.000000     911.000000     911.000000   
mean    528.327113    375.405049      11.900110      22.189280   
std     305.072794    502.812717       4.285153       8.922389   
min       0.000000    100.000000       5.000000     -10.372207   
25%     266.000000    211.000000       8.000000      15.684585   
50%     531.000000    306.000000      12.000000      22.115040   
75%     795.500000    407.000000      15.000000      28.804294   
max    1049.000000   4730.000000      19.000000      60.000000   

       humidity_percent  day_of_week  special_event  past_waste_kg  \
count        911.000000   911.000000     911.000000     911.000000   
mean          60.761313     3.014270       0.085620      27.015691   
std           17.330821     2.009542       0.279956      12.774223   
min           30.121111     0.000000       0.000000       5.008394   
25%           46.017835     1.0000

In [7]:
train_data.isnull().sum()

ID                    0
date                  0
meals_served          0
kitchen_staff         0
temperature_C         0
humidity_percent      0
day_of_week           0
special_event         0
past_waste_kg         0
staff_experience    164
waste_category        0
food_waste_kg         0
dtype: int64

In [8]:
train_data.nunique()

ID                  911
date                867
meals_served        373
kitchen_staff        15
temperature_C       892
humidity_percent    867
day_of_week           7
special_event         2
past_waste_kg       867
staff_experience      4
waste_category        5
food_waste_kg       867
dtype: int64

In [38]:
train_data.duplicated().sum()

0

In [12]:
# Generate a profiling report
profile = ProfileReport(train_data, title="Train Data Profiling Report")

In [20]:
profile.to_file("train_data_profile.html")

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
profile.to_notebook_iframe()

**Data Cleaning**

# Structure fix

In [9]:
# Handling inconsistent data entries
train_data["staff_experience"] = train_data["staff_experience"].str.lower()

In [None]:
# Drop irrelevant column 
train_data = train_data.drop(columns = ["ID", "day_of_week"], axis=1)

# Content fix

In [5]:
# Handling missing value 
train_data["staff_experience"] = train_data["staff_experience"].fillna(train_data["staff_experience"].mode()[0])

In [6]:
# Handle outlier 
train_data["temperature_C"] = train_data["temperature_C"].clip(lower=10, upper=40)

# Feature engineering and prepartaion

In [33]:
train_data.head(10)

Unnamed: 0,date,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category,...,year,month,day,dayofweek,is_weekend,dayofweek_sin,dayofweek_cos,month_sin,month_cos,is_holiday
0,2022-12-19,196,13,27.887273,45.362854,0,0,7.740587,intermediate,dairy,...,2022,12,19,0,0,0.0,1.0,-2.449294e-16,1.0,0
1,2023-11-21,244,15,10.317872,64.430475,1,0,42.311779,Beginner,MeAt,...,2023,11,21,1,0,0.781831,0.62349,-0.5,0.8660254,0
2,2022-02-01,148,16,27.7143,69.046113,1,0,41.184305,Beginner,MeAt,...,2022,2,1,1,0,0.781831,0.62349,0.8660254,0.5,0
3,2023-03-19,157,19,19.173902,46.292823,6,0,41.543492,Beginner,MeAt,...,2023,3,19,6,1,-0.781831,0.62349,1.0,6.123234000000001e-17,0
4,2022-07-18,297,10,26.375233,79.741064,0,0,26.525097,Intermediate,MEAT,...,2022,7,18,0,0,0.0,1.0,-0.5,-0.8660254,0
5,2023-03-02,241,18,16.863506,79.285919,3,0,11.834878,Intermediate,dairy,...,2023,3,2,3,0,0.433884,-0.900969,1.0,6.123234000000001e-17,0
6,2022-04-18,443,16,19.888627,77.328136,0,0,22.862659,Beginner,Vegetables,...,2022,4,18,0,0,0.0,1.0,0.8660254,-0.5,0
7,2023-12-16,416,16,18.559591,75.786502,5,1,34.599442,Intermediate,MeAt,...,2023,12,16,5,1,-0.974928,-0.222521,-2.449294e-16,1.0,0
8,2023-07-07,439,18,24.111027,43.395803,4,0,17.459149,EXPERT,dairy,...,2023,7,7,4,0,-0.433884,-0.900969,-0.5,-0.8660254,0
9,2023-11-07,267,7,25.412493,89.405183,1,0,23.067392,EXPERT,GRAINS,...,2023,11,7,1,0,0.781831,0.62349,-0.5,0.8660254,0


In [34]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              911 non-null    datetime64[ns]
 1   meals_served      911 non-null    int64         
 2   kitchen_staff     911 non-null    int64         
 3   temperature_C     911 non-null    float64       
 4   humidity_percent  911 non-null    float64       
 5   day_of_week       911 non-null    int64         
 6   special_event     911 non-null    int64         
 7   past_waste_kg     911 non-null    float64       
 8   staff_experience  911 non-null    object        
 9   waste_category    911 non-null    object        
 10  food_waste_kg     911 non-null    float64       
 11  year              911 non-null    int32         
 12  month             911 non-null    int32         
 13  day               911 non-null    int32         
 14  dayofweek         911 non-

In [None]:
# 1 Feature extraction from date
# compontent extraction
train_data["date"] = pd.to_datetime(train_data["date"])
train_data["year"] = train_data["date"].dt.year
train_data["month"] = train_data["date"].dt.month
train_data["day"] = train_data["date"].dt.day
train_data["dayofweek"] = train_data["date"].dt.dayofweek
train_data["is_weekend"] = (train_data["date"].dt.dayofweek >=5).astype(int)



In [20]:
# Cyclical encoding
# for Day of week
train_data["dayofweek_sin"] = np.sin(2 * np.pi * train_data["dayofweek"]/7)
train_data["dayofweek_cos"] = np.cos(2 * np.pi * train_data["dayofweek"]/7)

# for month
train_data["month_sin"] = np.sin(2 * np.pi * train_data["month"]/12)
train_data["month_cos"] = np.cos(2 * np.pi * train_data["month"]/12)

In [28]:
# Holiday indicator
us_holidays = holidays.US(years=train_data['date'].dt.year.unique())
train_data['is_holiday'] = train_data['date'].dt.date.apply(lambda x: 1 if x in us_holidays else 0)

In [None]:
# 2 Feature translation/Encode
