In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os 


In [2]:
# Build the path to the SQL export relative to the current working directory.
# NOTE(review): the slice drops the last 21 characters of the cwd, presumably the
# notebook's own sub-folder, so the result points at the project root — confirm.
NOTEBOOK_DIR_SUFFIX_LEN = 21
cwd = os.getcwd()  # named `cwd` rather than `dir` so the built-in dir() is not shadowed
FILE = cwd[:-NOTEBOOK_DIR_SUFFIX_LEN] + "SQL/SQL_output.csv"

In [3]:
# Load the SQL export into a DataFrame.
df = pd.read_csv(FILE)

# Null check: count missing values per column and report only the affected
# columns, so a problem points straight at where the gaps are.
missing_per_column = df.isna().sum()
if missing_per_column.any():
    print(missing_per_column[missing_per_column > 0])
else:
    print("No missing values")



No missing values


In [4]:
# Preview the loaded frame (the notebook renders a truncated view of it).
df

Unnamed: 0,quantity,date,time,size,pizza_type_id,price,category,ingredients
0,1,2015-01-03,14:22:10,M,hawaiian,13.25,Classic,"Sliced Ham, Pineapple, Mozzarella Cheese"
1,1,2015-01-03,14:32:51,XL,the_greek,25.50,Classic,"Kalamata Olives, Feta Cheese, Tomatoes, Garlic..."
2,1,2015-01-03,14:40:42,S,mediterraneo,12.00,Veggie,"Spinach, Artichokes, Kalamata Olives, Sun-drie..."
3,1,2015-01-03,14:48:45,M,spinach_fet,16.00,Veggie,"Spinach, Mushrooms, Red Onions, Feta Cheese, G..."
4,1,2015-01-03,14:49:58,M,pepperoni,12.50,Classic,"Mozzarella Cheese, Pepperoni"
...,...,...,...,...,...,...,...,...
48615,1,2015-12-31,14:40:24,L,southw_ckn,20.75,Chicken,"Chicken, Tomatoes, Red Peppers, Red Onions, Ja..."
48616,1,2015-12-31,14:40:24,M,southw_ckn,16.75,Chicken,"Chicken, Tomatoes, Red Peppers, Red Onions, Ja..."
48617,1,2015-12-31,14:40:24,S,spicy_ital,12.50,Supreme,"Capocollo, Tomatoes, Goat Cheese, Artichokes, ..."
48618,1,2015-12-31,14:43:46,L,napolitana,20.50,Classic,"Tomatoes, Anchovies, Green Olives, Red Onions,..."


In [5]:
# Rename `pizza_type_id` to the more descriptive `pizza_flavor`.
# Reassigning the result (instead of inplace=True) keeps the step chainable and
# follows the usual pandas style of producing a new frame.
df = df.rename(columns={'pizza_type_id': 'pizza_flavor'})


In [6]:
# Share of order rows per category: the number of rows in each category divided
# by the total number of rows (a fraction between 0 and 1; rows are counted,
# `quantity` is not summed). Looks balanced, with Classic the most popular.
(
    df.groupby('category')['quantity']
    .count()
    .div(df['quantity'].count())
)

category
Chicken    0.222439
Classic    0.299856
Supreme    0.242225
Veggie     0.235479
Name: quantity, dtype: float64

In [7]:
# Number of distinct pizza flavors offered within each category.
unique_flavors_per_category = (
    df.groupby('category')['pizza_flavor']
    .nunique()
)
unique_flavors_per_category

# Distribution of flavors per category: Veggie and Supreme have the most variety.

# Case 1: A higher variety of pizzas could lead to higher sales,
# but that does not seem to hold here: Veggie and Supreme
# have the highest variety but not the highest sales.

# Possible explanations:
#   - Veggie and Supreme are more expensive
#   - Veggie and Supreme are less popular
#   - Veggie and Supreme are not as good as Classic

# Check whether Supreme and Veggie are priced higher than Classic; if so, reduce
# the price to see if sales increase.

# If Supreme and Veggie are priced lower than or equal to Classic, then the pizzas
# are probably not as good and those categories need less variety.

category
Chicken    6
Classic    8
Supreme    9
Veggie     9
Name: pizza_flavor, dtype: int64

In [8]:
# Keep only HH:MM from the time column (drops the seconds).
# Slicing from the left to a fixed width makes the cell safe to re-run; the
# earlier `str[:-3]` removed three more characters on every execution.
# NOTE(review): assumes zero-padded HH:MM:SS strings, as in the preview above — confirm.
df['time'] = df['time'].str[:5]

In [9]:
# Split the date into month and day-of-month columns.
# The guard makes the cell safe to re-run once `date` has already been dropped.
if 'date' in df.columns:
    order_dates = pd.to_datetime(df['date'])  # parse once, reuse for both parts
    df['month'] = order_dates.dt.month
    df['day'] = order_dates.dt.day

    # The year is not kept: it does not change across the data, so the whole
    # `date` column is dropped once month and day have been extracted.
    df = df.drop(columns=['date'])


In [18]:
# Fix separator inconsistencies in the 'ingredients' column so every entry reads "A,B,C".
# A single regex pass removes any run of whitespace after a comma. The earlier
# two-step literal replace handled ", " first, which turned ",  " into ", " and
# left the second replace with nothing to match.
df['ingredients'] = df['ingredients'].str.replace(r',\s+', ',', regex=True)



In [23]:
# Spot-check the cleanup: print only the first 11 entries rather than every row.
for ingredient in df['ingredients'].head(11):
    print(ingredient)

Sliced Ham,Pineapple,Mozzarella Cheese
Kalamata Olives,Feta Cheese,Tomatoes,Garlic,Beef Chuck Roast,Red Onions
Spinach,Artichokes,Kalamata Olives,Sun-dried Tomatoes,Feta Cheese,Plum Tomatoes,Red Onions
Spinach,Mushrooms,Red Onions,Feta Cheese,Garlic
Mozzarella Cheese,Pepperoni
Chicken,Tomatoes,Red Peppers,Red Onions,Jalapeno Peppers,Corn,Cilantro,Chipotle Sauce
Spinach,Red Onions,Pepperoni,Tomatoes,Artichokes,Kalamata Olives,Garlic,Asiago Cheese
Sliced Ham,Pineapple,Mozzarella Cheese
Spinach,Mushrooms,Red Onions,Feta Cheese,Garlic
Chicken,Pineapple,Tomatoes,Red Peppers,Thai Sweet Chilli Sauce
Kalamata Olives,Feta Cheese,Tomatoes,Garlic,Beef Chuck Roast,Red Onions


In [20]:
# One-hot encode 'ingredients': one 0/1 indicator column per distinct
# comma-separated value.
# NOTE(review): the resulting columns include near-duplicates such as
# "Artichoke"/"Artichokes" — likely typos in the source data worth normalising first.
ingredients_dummies = df['ingredients'].str.get_dummies(sep=',')

# Attach the indicator columns to the frame. Indicator columns left over from an
# earlier run of this cell are dropped first, so re-running does not duplicate them.
df = pd.concat(
    [df.drop(columns=ingredients_dummies.columns, errors='ignore'), ingredients_dummies],
    axis=1,
)

In [21]:
# Lift pandas' column-display cap so every column is visible when checking the encoding.
pd.options.display.max_columns = None

In [22]:

# Inspect the frame to check that the one-hot encoding worked.
# NOTE(review): the original trailing note read "Mozzarella Cheese Mozzarella Cheese" —
# presumably a reminder to check that ingredient's column; confirm.
df

Unnamed: 0,quantity,time,size,pizza_flavor,price,category,ingredients,month,day,Alfredo Sauce,Anchovies,Artichoke,Artichokes,Arugula,Asiago Cheese,Bacon,Barbecue Sauce,Barbecued Chicken,Beef Chuck Roast,Blue Cheese,Brie Carre Cheese,Calabrese Salami,Capocollo,Caramelized Onions,Chicken,Chipotle Sauce,Chorizo Sausage,Cilantro,Coarse Sicilian Salami,Corn,Eggplant,Feta Cheese,Fontina Cheese,Friggitello Peppers,Garlic,Genoa Salami,Goat Cheese,Gorgonzola Piccante Cheese,Gouda Cheese,Green Olives,Green Peppers,Italian Sausage,Jalapeno Peppers,Kalamata Olives,Luganega Sausage,Mozzarella Cheese,Mushrooms,Onions,Oregano,Pancetta,Parmigiano Reggiano Cheese,Pears,Peperoncini verdi,Pepperoni,Pesto Sauce,Pineapple,Plum Tomatoes,Prosciutto,Prosciutto di San Daniele,Provolone Cheese,Red Onions,Red Peppers,Ricotta Cheese,Romano Cheese,Sliced Ham,Smoked Gouda Cheese,Soppressata Salami,Spinach,Sun-dried Tomatoes,Thai Sweet Chilli Sauce,Thyme,Tomatoes,Zucchini,�Nduja Salami
0,1,14:22,M,hawaiian,13.25,Classic,"Sliced Ham,Pineapple,Mozzarella Cheese",1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,1,14:32,XL,the_greek,25.50,Classic,"Kalamata Olives,Feta Cheese,Tomatoes,Garlic,Be...",1,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,14:40,S,mediterraneo,12.00,Veggie,"Spinach,Artichokes,Kalamata Olives,Sun-dried T...",1,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0
3,1,14:48,M,spinach_fet,16.00,Veggie,"Spinach,Mushrooms,Red Onions,Feta Cheese,Garlic",1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,14:49,M,pepperoni,12.50,Classic,"Mozzarella Cheese,Pepperoni",1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48615,1,14:40,L,southw_ckn,20.75,Chicken,"Chicken,Tomatoes,Red Peppers,Red Onions,Jalape...",12,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
48616,1,14:40,M,southw_ckn,16.75,Chicken,"Chicken,Tomatoes,Red Peppers,Red Onions,Jalape...",12,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
48617,1,14:40,S,spicy_ital,12.50,Supreme,"Capocollo,Tomatoes,Goat Cheese,Artichokes,Pepe...",12,31,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
48618,1,14:43,L,napolitana,20.50,Classic,"Tomatoes,Anchovies,Green Olives,Red Onions,Garlic",12,31,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
