# Loading Dataset

In [5]:
import pandas as pd
import numpy as np

# Load the two Excel workbooks: per-pizza ingredient quantities and the sales records
INGREDIENTS_XLSX = '../Datasets/Pizza_ingredients.xlsx'
SALES_XLSX = '../Datasets/Pizza_Sale.xlsx'

ingredients = pd.read_excel(INGREDIENTS_XLSX)
sales = pd.read_excel(SALES_XLSX)

In [6]:
# Preview the first 5 rows of the ingredients data
ingredients.head(n=5)

Unnamed: 0,pizza_name_id,pizza_name,pizza_ingredients,Items_Qty_In_Grams
0,bbq_ckn_l,The Barbecue Chicken Pizza,Barbecued Chicken,40.0
1,bbq_ckn_l,The Barbecue Chicken Pizza,Red Peppers,15.0
2,bbq_ckn_l,The Barbecue Chicken Pizza,Green Peppers,20.0
3,bbq_ckn_l,The Barbecue Chicken Pizza,Tomatoes,30.0
4,bbq_ckn_l,The Barbecue Chicken Pizza,Red Onions,60.0


In [7]:
# Preview the first 5 rows of the sales data
sales.head(n=5)

Unnamed: 0,pizza_id,order_id,pizza_name_id,quantity,order_date,order_time,unit_price,total_price,pizza_size,pizza_category,pizza_ingredients,pizza_name
0,1,1,hawaiian_m,1,2015-01-01 00:00:00,11:38:36,13.25,13.25,M,Classic,"Sliced Ham, Pineapple, Mozzarella Cheese",The Hawaiian Pizza
1,2,2,classic_dlx_m,1,2015-01-01 00:00:00,11:57:40,16.0,16.0,M,Classic,"Pepperoni, Mushrooms, Red Onions, Red Peppers,...",The Classic Deluxe Pizza
2,3,2,five_cheese_l,1,2015-01-01 00:00:00,11:57:40,18.5,18.5,L,Veggie,"Mozzarella Cheese, Provolone Cheese, Smoked Go...",The Five Cheese Pizza
3,4,2,ital_supr_l,1,2015-01-01 00:00:00,11:57:40,20.75,20.75,L,Supreme,"Calabrese Salami, Capocollo, Tomatoes, Red Oni...",The Italian Supreme Pizza
4,5,2,mexicana_m,1,2015-01-01 00:00:00,11:57:40,16.0,16.0,M,Veggie,"Tomatoes, Red Peppers, Jalapeno Peppers, Red O...",The Mexicana Pizza


# Changing Datatypes & Feature Engineering

In [8]:
# Store the three text columns of the ingredients table as pandas categoricals
ingredients = ingredients.astype({
    'pizza_name_id': 'category',
    'pizza_name': 'category',
    'pizza_ingredients': 'category',
})

In [9]:
# Display the dtype of every column in the ingredients table
ingredients.dtypes

pizza_name_id         category
pizza_name            category
pizza_ingredients     category
Items_Qty_In_Grams     float64
dtype: object

In [10]:
# Parse order_date (values that cannot be parsed become NaT) and sort rows by date
sales['order_date'] = pd.to_datetime(sales['order_date'], errors='coerce')
sales = sales.sort_values('order_date')
# Parse order_time using the HH:MM:SS layout
sales['order_time'] = pd.to_datetime(sales['order_time'], format='%H:%M:%S')

# Split both timestamps into separate calendar / clock feature columns,
# then discard the raw order_date and order_time columns
date_parts = sales['order_date'].dt
time_parts = sales['order_time'].dt
sales = sales.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    Day=date_parts.day,
    Hour=time_parts.hour,
    Minute=time_parts.minute,
    Second=time_parts.second,
).drop(columns=['order_date', 'order_time'])

In [11]:
# Identifier and label columns of the sales table are stored as categoricals
categorical_columns = [
    'pizza_id',
    'order_id',
    'pizza_name_id',
    'pizza_category',
    'pizza_name',
    'pizza_size',
]
sales = sales.astype({name: 'category' for name in categorical_columns})

# Coerce total_price to numeric; entries that cannot be parsed become NaN
sales['total_price'] = pd.to_numeric(sales['total_price'], errors='coerce')

In [12]:
# Display the dtype of every column in the sales table
sales.dtypes

pizza_id             category
order_id             category
pizza_name_id        category
quantity                int64
unit_price            float64
total_price           float64
pizza_size           category
pizza_category       category
pizza_ingredients      object
pizza_name           category
Year                    int32
Month                   int32
Day                     int32
Hour                    int32
Minute                  int32
Second                  int32
dtype: object

# Merging Datasets & Handling Missing Values

In [13]:
# Remove fully duplicated rows from both tables
ingredients = ingredients.drop_duplicates()
sales = sales.drop_duplicates()

In [14]:
# Show the (rows, columns) of both tables
sales.shape, ingredients.shape

((48620, 16), (518, 4))

In [15]:
# Turn the comma-separated ingredient text into a list per row
ingredient_lists = sales['pizza_ingredients'].str.split(', ')
sales['pizza_ingredients'] = ingredient_lists

# One output row per (sales row, ingredient), relabelled with a fresh 0..n-1 index
exploded_sales = sales.explode('pizza_ingredients', ignore_index=True)

print(exploded_sales.shape)

(267518, 16)


In [16]:
# Attach the ingredients table to every exploded sales row.
# The inner join keeps only rows whose key triple is present in both tables.
merge_keys = ['pizza_name_id', 'pizza_name', 'pizza_ingredients']
df = exploded_sales.merge(ingredients, on=merge_keys, how='inner')

df.shape

(265090, 17)

In [17]:
# Free memory held by the pre-merge dataframes
del sales
del ingredients
del exploded_sales

In [18]:
# Fill missing values
# NOTE(review): interpolate() and ffill() borrow values from neighbouring rows, which
# may belong to a different pizza or ingredient — confirm this imputation is intended.

# A missing total_price is recomputed from unit price and quantity
recomputed_total = df['unit_price'] * df['quantity']
df['total_price'] = df['total_price'].fillna(recomputed_total)

# Gram quantities: linear interpolation first, then the column mean for anything still missing
grams = df['Items_Qty_In_Grams'].interpolate(method='linear')
df['Items_Qty_In_Grams'] = grams.fillna(grams.mean())

# A missing category takes the most recent preceding non-missing value
df['pizza_category'] = df['pizza_category'].ffill()

df['pizza_ingredients'] = df['pizza_ingredients'].astype('category')

In [19]:
# Count the rows in each pizza category
df['pizza_category'].value_counts()

pizza_category
Veggie     72161
Supreme    67282
Chicken    66305
Classic    59342
Name: count, dtype: int64

# Removing Outliers

In [20]:
from scipy.stats import zscore  # Importing the zscore function
import numpy as np  # Importing the numpy library

# Function to remove outliers
def remove_outliers(df, threshold=3):
    """Drop rows whose numeric values lie too far from their column mean.

    A z-score ``|x - mean| / std`` is computed for every float and int64
    column; a row is kept only if all of its scored values are strictly
    below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    threshold : float, default 3
        Maximum absolute z-score a value may have for its row to be kept.

    Returns
    -------
    pandas.DataFrame
        A copy of the retained rows with all original columns.

    Notes
    -----
    * Columns whose standard deviation is zero or undefined (constant
      column, single-row frame) are not scored. Dividing by such a std
      yields NaN z-scores, which would otherwise discard every row.
    * A row with a missing value in a scored column is dropped, because a
      NaN z-score never compares below the threshold.
    * Only float and int64 columns are scored; other integer widths
      (e.g. int32) are left out of the check.
    """
    # Select only the float / int64 columns
    numerical_df = df.select_dtypes(include=['float', 'int64'])
    if numerical_df.empty:
        # Nothing to score (no such columns, or no rows): keep everything
        return df.copy()

    # Keep only columns with a usable spread; `NaN > 0` is False, so an
    # undefined std is excluded together with a zero std
    col_std = numerical_df.std()
    scored_cols = col_std[col_std > 0].index
    if len(scored_cols) == 0:
        return df.copy()

    # Absolute z-scores for the columns being scored
    scored = numerical_df[scored_cols]
    z_scores = ((scored - scored.mean()) / col_std[scored_cols]).abs()

    # Keep rows where every scored z-score is below the threshold
    mask = (z_scores < threshold).all(axis=1)

    # Apply the mask to the original frame so non-numeric columns are retained
    return df[mask].copy()

# Filter the merged frame `df` and report its shape before and after
shape_before = df.shape
print("Before removing outliers of ingredients:", shape_before)
df = remove_outliers(df)
shape_after = df.shape
print("After removing outliers of ingredients:", shape_after)

Before removing outliers of ingredients: (265090, 17)
After removing outliers of ingredients: (249779, 17)


# Encoding Categorical Values

In [21]:
# Names of every column currently stored with the category dtype
cat_cols = list(df.select_dtypes(include='category').columns)

In [22]:
from sklearn.preprocessing import LabelEncoder

# Fit a separate encoder for each categorical column and keep it, so the integer
# codes can be mapped back to their original labels later with
# label_encoders[col].inverse_transform(...). Refitting one shared encoder would
# retain only the mapping of the last column processed.
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head()

Unnamed: 0,pizza_id,order_id,pizza_name_id,quantity,unit_price,total_price,pizza_size,pizza_category,pizza_ingredients,pizza_name,Year,Month,Day,Hour,Minute,Second,Items_Qty_In_Grams
0,0,0,27,1,13.25,13.25,1,1,55,12,2015,1,1,11,38,36,33.402357
1,0,0,27,1,13.25,13.25,1,1,46,12,2015,1,1,11,38,36,30.0
2,0,0,27,1,13.25,13.25,1,1,36,12,2015,1,1,11,38,36,20.0
3,96,40,54,1,16.5,16.5,1,2,26,19,2015,1,1,17,28,9,60.0
4,96,40,54,1,16.5,16.5,1,2,13,19,2015,1,1,17,28,9,100.0


In [23]:
# Save the processed frame to CSV, leaving out the row index
df.to_csv(path_or_buf='processed_data.csv', index=False)