# Snack Analysis

### Set Up PySpark Environment

In [None]:
pip install pyspark

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col

In [6]:
# Obtain the SparkSession for this notebook: getOrCreate() hands back an
# existing session if one is already running, otherwise it starts a new one
# registered under the application name "Food Analysis".
spark = (
    SparkSession.builder
    .appName("Food Analysis")
    .getOrCreate()
)

### Load CSV Files

In [9]:

# Every file is read the same way: the first row supplies the column names
# and Spark infers each column's type from the data.
csv_options = {"header": True, "inferSchema": True}

menu_df = spark.read.csv("menu.csv", **csv_options)
starbucks_drinks_df = spark.read.csv("starbucks-menu-nutrition-drinks.csv", **csv_options)
starbucks_food_df = spark.read.csv("starbucks-menu-nutrition-food.csv", **csv_options)
starbucks_drinkMenu_expanded_df = spark.read.csv("starbucks_drinkMenu_expanded.csv", **csv_options)

### Inspect and Clean Data 

In [11]:
# Print the column names and the types Spark inferred for menu.csv.
menu_df.printSchema()


root
 |-- Category: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Serving Size: string (nullable = true)
 |-- Calories: integer (nullable = true)
 |-- Calories from Fat: integer (nullable = true)
 |-- Total Fat: double (nullable = true)
 |-- Total Fat (% Daily Value): integer (nullable = true)
 |-- Saturated Fat: double (nullable = true)
 |-- Saturated Fat (% Daily Value): integer (nullable = true)
 |-- Trans Fat: double (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- Cholesterol (% Daily Value): integer (nullable = true)
 |-- Sodium: integer (nullable = true)
 |-- Sodium (% Daily Value): integer (nullable = true)
 |-- Carbohydrates: integer (nullable = true)
 |-- Carbohydrates (% Daily Value): integer (nullable = true)
 |-- Dietary Fiber: integer (nullable = true)
 |-- Dietary Fiber (% Daily Value): integer (nullable = true)
 |-- Sugars: integer (nullable = true)
 |-- Protein: integer (nullable = true)
 |-- Vitamin A (% Daily Value): integer (nul

In [12]:
# Preview the first five rows of the menu data.
menu_df.show(5)

+---------+--------------------+--------------+--------+-----------------+---------+-------------------------+-------------+-----------------------------+---------+-----------+---------------------------+------+----------------------+-------------+-----------------------------+-------------+-----------------------------+------+-------+-------------------------+-------------------------+-----------------------+--------------------+
| Category|                Item|  Serving Size|Calories|Calories from Fat|Total Fat|Total Fat (% Daily Value)|Saturated Fat|Saturated Fat (% Daily Value)|Trans Fat|Cholesterol|Cholesterol (% Daily Value)|Sodium|Sodium (% Daily Value)|Carbohydrates|Carbohydrates (% Daily Value)|Dietary Fiber|Dietary Fiber (% Daily Value)|Sugars|Protein|Vitamin A (% Daily Value)|Vitamin C (% Daily Value)|Calcium (% Daily Value)|Iron (% Daily Value)|
+---------+--------------------+--------------+--------+-----------------+---------+-------------------------+-------------+------

In [13]:
# Discard every row that contains a null in any column.
menu_df = menu_df.dropna()

### Perform Analysis

#### Compare Average Calories

In [17]:
# Mean of the Calories column over the whole McDonald's menu.
# The aggregation yields a single row; its only field is the average.
menu_avg_calories = menu_df.agg(avg("Calories")).first()[0]
print(f"McDonald's Avg Calories: {menu_avg_calories}")



McDonald's Avg Calories: 368.2692307692308


In [None]:

# Reload the Starbucks CSV files before inspecting them.
#
# menu.csv is deliberately NOT re-read here: menu_df has already been loaded
# and had its null rows dropped in the cleaning step, and loading it again
# would silently replace that cleaned frame with the raw data when the
# notebook is run top to bottom.
#
# The food file is still read with the default encoding at this point; the
# cells below inspect the resulting column names and then re-read it as UTF-16.
starbucks_drinks_df = spark.read.csv("starbucks-menu-nutrition-drinks.csv", header=True, inferSchema=True)
starbucks_food_df = spark.read.csv("starbucks-menu-nutrition-food.csv", header=True, inferSchema=True)
starbucks_drinkMenu_expanded_df = spark.read.csv("starbucks_drinkMenu_expanded.csv", header=True, inferSchema=True)

In [20]:
# Print the column names and inferred types of the Starbucks food data.
starbucks_food_df.printSchema()


root
 |-- ��: string (nullable = true)
 |--    C a l o r i e s : string (nullable = true)
 |--    F a t   ( g ) : string (nullable = true)
 |--    C a r b .   ( g ) : string (nullable = true)
 |--    F i b e r   ( g ) : string (nullable = true)
 |--    P r o t e i n   ( g ) : string (nullable = true)



In [21]:
# Attempt to rename the calories column to a clean "Calories".
# NOTE: the column listing printed after this cell still shows the original
# NUL-padded names, i.e. no column is literally named " Calories " and this
# rename leaves the frame unchanged (withColumnRenamed is a no-op when the
# source name is not in the schema).
starbucks_food_df = starbucks_food_df.withColumnRenamed(" Calories ", "Calories")


In [23]:
# Print the column names as a Python list so that non-printable characters
# show up as escape sequences.
print(starbucks_food_df.columns)


['��', '\x00 \x00C\x00a\x00l\x00o\x00r\x00i\x00e\x00s\x00', '\x00 \x00F\x00a\x00t\x00 \x00(\x00g\x00)\x00', '\x00 \x00C\x00a\x00r\x00b\x00.\x00 \x00(\x00g\x00)\x00', '\x00 \x00F\x00i\x00b\x00e\x00r\x00 \x00(\x00g\x00)\x00', '\x00 \x00P\x00r\x00o\x00t\x00e\x00i\x00n\x00 \x00(\x00g\x00)\x00']


In [26]:
# Attempt to clean the column names by round-tripping them through UTF-8
# (dropping anything that cannot be encoded) and stripping whitespace.
# NOTE: the column listing printed after this cell is unchanged. The embedded
# "\x00" characters are valid UTF-8, so the round-trip keeps them, and
# strip() does not remove the space because each name starts with "\x00".
clean_columns = [col_name.encode('utf-8' , 'ignore').decode('utf-8').strip() for col_name in starbucks_food_df.columns]
# Apply the (effectively unchanged) names positionally to the frame.
starbucks_food_df = starbucks_food_df.toDF(*clean_columns)

In [27]:
# Check whether the cleanup attempt changed the column names.
print(starbucks_food_df.columns)

['��', '\x00 \x00C\x00a\x00l\x00o\x00r\x00i\x00e\x00s\x00', '\x00 \x00F\x00a\x00t\x00 \x00(\x00g\x00)\x00', '\x00 \x00C\x00a\x00r\x00b\x00.\x00 \x00(\x00g\x00)\x00', '\x00 \x00F\x00i\x00b\x00e\x00r\x00 \x00(\x00g\x00)\x00', '\x00 \x00P\x00r\x00o\x00t\x00e\x00i\x00n\x00 \x00(\x00g\x00)\x00']


In [28]:
# Load the food file again, this time decoding it as UTF-16. The first row
# is still used as the header and column types are still inferred.
starbucks_food_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("encoding", "utf-16")
    .csv("starbucks-menu-nutrition-food.csv")
)


In [40]:
# Replacement names for the six food columns, listed in column order.
# toDF assigns them positionally, so the order here must match the frame.
corrected_columns = [
    "Unknown",      # first column of the file
    "Calories",
    "Fat (g)", "Carb (g)", "Fiber (g)", "Protein (g)",
]
starbucks_food_df = starbucks_food_df.toDF(*corrected_columns)


In [41]:
# Confirm the food frame now carries the corrected column names.
print(starbucks_food_df.columns)


['Unknown', 'Calories', 'Fat (g)', 'Carb (g)', 'Fiber (g)', 'Protein (g)']


In [42]:
# Mean of the Calories column over all Starbucks food items.
starbucks_food_avg_calories = starbucks_food_df.agg(avg("Calories")).first()[0]
print(f"Starbucks Food Avg Calories: {starbucks_food_avg_calories}")


Starbucks Food Avg Calories: 356.6371681415929


In [43]:
# List the column names of the Starbucks drinks data.
print(starbucks_drinks_df.columns)


['_c0', 'Calories', 'Fat (g)', 'Carb. (g)', 'Fiber (g)', 'Protein', 'Sodium']


In [44]:
# Trim leading and trailing whitespace from every drinks column name, then
# apply the trimmed names positionally.
cleaned_columns = list(map(str.strip, starbucks_drinks_df.columns))
starbucks_drinks_df = starbucks_drinks_df.toDF(*cleaned_columns)

In [45]:
# Show the drinks column names after trimming.
print(starbucks_drinks_df.columns)

['_c0', 'Calories', 'Fat (g)', 'Carb. (g)', 'Fiber (g)', 'Protein', 'Sodium']


In [46]:
# Drop the "_c0" column from the drinks data. No columns are renamed here.
# NOTE(review): "_c0" is the name Spark gives a column whose header is empty;
# presumably it holds the drink names — confirm it is safe to discard.
starbucks_drinks_df = starbucks_drinks_df.drop("_c0")

In [47]:
# Confirm that "_c0" is gone from the drinks columns.
print(starbucks_drinks_df.columns)

['Calories', 'Fat (g)', 'Carb. (g)', 'Fiber (g)', 'Protein', 'Sodium']


In [49]:
# `col` and `avg` come from the import cell at the top of the notebook, so
# they are not imported again here.

# Cast Calories to a numeric type (float) so that it can be averaged.
starbucks_drinks_df = starbucks_drinks_df.withColumn("Calories", col("Calories").cast("float"))

# Mean of the Calories column over all Starbucks drinks.
starbucks_drinks_avg_calories = starbucks_drinks_df.agg(avg("Calories")).first()[0]
print(f"Starbucks Drinks Avg Calories: {starbucks_drinks_avg_calories}")

Starbucks Drinks Avg Calories: 135.16304347826087


In [50]:
# Side-by-side summary: mean Calories for each of the three datasets.
# Each aggregation yields one row whose single field is the average.

# McDonald's menu
menu_avg_calories = menu_df.agg(avg("Calories")).first()[0]
print(f"McDonald's Avg Calories: {menu_avg_calories}")

# Starbucks drinks
starbucks_drinks_avg_calories = starbucks_drinks_df.agg(avg("Calories")).first()[0]
print(f"Starbucks Drinks Avg Calories: {starbucks_drinks_avg_calories}")

# Starbucks food
starbucks_food_avg_calories = starbucks_food_df.agg(avg("Calories")).first()[0]
print(f"Starbucks Food Avg Calories: {starbucks_food_avg_calories}")


McDonald's Avg Calories: 368.2692307692308
Starbucks Drinks Avg Calories: 135.16304347826087
Starbucks Food Avg Calories: 356.6371681415929


In [51]:
# Combine the Starbucks drinks and food datasets into a single frame.
#
# The drinks frame used to be unioned with itself, which left out every food
# item and duplicated every drink. The food frame names some columns
# differently ("Carb (g)", "Protein (g)") and has no Sodium column, so it is
# first projected onto the drinks schema and then appended by column name.
# The result keeps the drinks frame's column names, order, and types.
food_to_drinks_columns = {
    "Calories": "Calories",
    "Fat (g)": "Fat (g)",
    "Carb (g)": "Carb. (g)",
    "Fiber (g)": "Fiber (g)",
    "Protein (g)": "Protein",
}

# Cast each food column to the type of its drinks counterpart so that both
# sides of the union have identical column types.
drinks_types = dict(starbucks_drinks_df.dtypes)
starbucks_food_aligned_df = starbucks_food_df.select(
    [
        # Backticks make Spark treat the whole string as one column name.
        col(f"`{food_name}`").cast(drinks_types[drinks_name]).alias(drinks_name)
        for food_name, drinks_name in food_to_drinks_columns.items()
    ]
)

# Sodium is missing from the food data; allowMissingColumns fills it with nulls.
starbucks_combined_df = starbucks_drinks_df.unionByName(
    starbucks_food_aligned_df, allowMissingColumns=True
)

In [53]:
# Keep only the items with fewer than 300 calories from each menu.
under_300_calories = col("Calories") < 300

healthy_mcd_items = menu_df.where(under_300_calories)
healthy_starbucks_items = starbucks_combined_df.where(under_300_calories)

In [56]:
# Print the column names and types of the combined Starbucks frame.
starbucks_combined_df.printSchema()


root
 |-- Calories: float (nullable = true)
 |-- Fat (g): string (nullable = true)
 |-- Carb. (g): string (nullable = true)
 |-- Fiber (g): string (nullable = true)
 |-- Protein: string (nullable = true)
 |-- Sodium: string (nullable = true)



In [60]:
# List the column names of the combined Starbucks frame.
print(starbucks_combined_df.columns)


['Calories', 'Fat (g)', 'Carb. (g)', 'Fiber (g)', 'Protein', 'Sodium']


In [69]:
# Rename the carbohydrate column to "Carb_g", a name without a dot, space,
# or parentheses.
# NOTE(review): presumably done so that col("Carb_g") in the following cast
# resolves without the dot being read as a nested-field separator — confirm.
starbucks_combined_df = starbucks_combined_df.withColumnRenamed("Carb. (g)", "Carb_g")


In [70]:
# Carb_g was read as a string column; replace it with its float conversion
# so it can be used in numeric comparisons and aggregations.
carb_as_float = col("Carb_g").astype("float")
starbucks_combined_df = starbucks_combined_df.withColumn("Carb_g", carb_as_float)


In [71]:
# Confirm the carbohydrate column now appears as "Carb_g".
print(starbucks_combined_df.columns)


['Calories', 'Fat (g)', 'Carb_g', 'Fiber (g)', 'Protein', 'Sodium']
