# Day 13: New Milkshake Flavor Selection for Launch

You are a Product Analyst working with the Shake Shack R&D team to evaluate customer ratings for experimental milkshake flavors. Your team has collected ratings data from a small sampling test. Your task is to systematically analyze and clean the ratings data to identify top-performing flavors.

In [None]:
import pandas as pd
import numpy as np

# Raw ratings from the sampling test, one record per customer response.
# Missing ratings are written as None (Python has no `null` literal); pandas
# converts them to NaN when the DataFrame is built.
milkshake_ratings_data = [
    {"flavor": "Classic Chocolate", "rating": 4.5, "customer_id": "CUST001", "rating_date": "2024-07-05"},
    {"flavor": "Strawberry Swirl", "rating": 3.8, "customer_id": "CUST002", "rating_date": "2024-07-10"},
    {"flavor": "Vanilla Bean", "rating": 4.2, "customer_id": "CUST003", "rating_date": "2024-07-15"},
    {"flavor": "Caramel Delight", "rating": 3.5, "customer_id": "CUST004", "rating_date": "2024-07-20"},
    {"flavor": "Mocha Bean", "rating": None, "customer_id": "CUST005", "rating_date": "2024-07-25"},
    # Exact repeat of the first record (the duplicate the exercises remove).
    {"flavor": "Classic Chocolate", "rating": 4.5, "customer_id": "CUST001", "rating_date": "2024-07-05"},
    {"flavor": "Classic Chocolate", "rating": 5, "customer_id": "CUST006", "rating_date": "2024-08-01"},
    {"flavor": "Strawberry Swirl", "rating": 4, "customer_id": "CUST007", "rating_date": "2024-08-02"},
    {"flavor": "Vanilla Bean", "rating": 3.9, "customer_id": "CUST008", "rating_date": "2024-08-03"},
    {"flavor": "Caramel Delight", "rating": 4.8, "customer_id": "CUST009", "rating_date": "2024-10-04"},
    {"flavor": "Mocha Bean", "rating": 2.5, "customer_id": "CUST010", "rating_date": "2024-09-05"},
    {"flavor": "Classic Chocolate", "rating": 4.7, "customer_id": "CUST011", "rating_date": "2024-10-06"},
    {"flavor": "Strawberry Swirl", "rating": None, "customer_id": "CUST012", "rating_date": "2024-10-07"},
    {"flavor": "Vanilla Bean", "rating": 4.3, "customer_id": "CUST013", "rating_date": "2024-10-08"},
    {"flavor": "Caramel Delight", "rating": 4.9, "customer_id": "CUST014", "rating_date": "2024-10-09"},
    {"flavor": "Mocha Bean", "rating": 3.3, "customer_id": "CUST015", "rating_date": "2024-08-10"},
    {"flavor": "Classic Chocolate", "rating": 1, "customer_id": "CUST016", "rating_date": "2024-08-11"},
    {"flavor": "Strawberry Swirl", "rating": 6, "customer_id": "CUST017", "rating_date": "2024-08-12"},
    {"flavor": "Vanilla Bean", "rating": 3, "customer_id": "CUST018", "rating_date": "2024-08-13"},
    {"flavor": "Caramel Delight", "rating": 4.2, "customer_id": "CUST019", "rating_date": "2024-08-14"},
    {"flavor": "Mocha Bean", "rating": 4.1, "customer_id": "CUST020", "rating_date": "2024-08-15"},
    {"flavor": "Classic Chocolate", "rating": 3.7, "customer_id": "CUST021", "rating_date": "2024-08-16"},
    {"flavor": "Strawberry Swirl", "rating": 3.9, "customer_id": "CUST022", "rating_date": "2024-08-17"},
    {"flavor": "Vanilla Bean", "rating": 4.4, "customer_id": "CUST023", "rating_date": "2024-08-18"},
    {"flavor": "Caramel Delight", "rating": 3.6, "customer_id": "CUST024", "rating_date": "2024-08-19"},
    {"flavor": "Mocha Bean", "rating": None, "customer_id": "CUST025", "rating_date": "2024-08-20"},
    {"flavor": "Classic Chocolate", "rating": 4.8, "customer_id": "CUST026", "rating_date": "2024-08-21"},
    {"flavor": "Strawberry Swirl", "rating": 4.6, "customer_id": "CUST027", "rating_date": "2024-08-22"},
    {"flavor": "Vanilla Bean", "rating": 4, "customer_id": "CUST028", "rating_date": "2024-08-23"},
    {"flavor": "Caramel Delight", "rating": 4.4, "customer_id": "CUST029", "rating_date": "2024-08-24"},
    {"flavor": "Mocha Bean", "rating": 3.2, "customer_id": "CUST030", "rating_date": "2024-11-25"},
    {"flavor": "Classic Chocolate", "rating": 4.9, "customer_id": "CUST031", "rating_date": "2024-11-26"},
    {"flavor": "Strawberry Swirl", "rating": 4.1, "customer_id": "CUST032", "rating_date": "2024-11-27"},
    {"flavor": "Vanilla Bean", "rating": 3.3, "customer_id": "CUST033", "rating_date": "2024-11-28"},
    {"flavor": "Caramel Delight", "rating": 3.8, "customer_id": "CUST034", "rating_date": "2024-11-29"},
    {"flavor": "Mocha Bean", "rating": 4, "customer_id": "CUST035", "rating_date": "2024-11-30"},
    {"flavor": "Classic Chocolate", "rating": 4.3, "customer_id": "CUST036", "rating_date": "2024-12-01"},
    {"flavor": "Strawberry Swirl", "rating": None, "customer_id": "CUST037", "rating_date": "2024-12-02"},
    {"flavor": "Vanilla Bean", "rating": 3.7, "customer_id": "CUST038", "rating_date": "2024-12-03"},
    {"flavor": "Caramel Delight", "rating": 4.5, "customer_id": "CUST039", "rating_date": "2024-12-04"},
    {"flavor": "Mocha Bean", "rating": 3.9, "customer_id": "CUST040", "rating_date": "2024-12-05"},
    {"flavor": "Classic Chocolate", "rating": 4.4, "customer_id": "CUST041", "rating_date": "2024-12-06"},
    {"flavor": "Strawberry Swirl", "rating": 3.5, "customer_id": "CUST042", "rating_date": "2024-12-07"},
    {"flavor": "Vanilla Bean", "rating": 4.6, "customer_id": "CUST043", "rating_date": "2024-12-08"},
    {"flavor": "Caramel Delight", "rating": 4.2, "customer_id": "CUST044", "rating_date": "2025-02-09"},
    {"flavor": "Mocha Bean", "rating": 3.4, "customer_id": "CUST045", "rating_date": "2025-02-10"},
    {"flavor": "Classic Chocolate", "rating": None, "customer_id": "CUST046", "rating_date": "2025-02-11"},
    {"flavor": "Strawberry Swirl", "rating": 4, "customer_id": "CUST047", "rating_date": "2025-02-12"},
    {"flavor": "Vanilla Bean", "rating": 4.1, "customer_id": "CUST048", "rating_date": "2025-02-13"},
    {"flavor": "Caramel Delight", "rating": 4.3, "customer_id": "CUST049", "rating_date": "2025-04-14"},
    {"flavor": "Mocha Bean", "rating": 3.7, "customer_id": "CUST050", "rating_date": "2025-04-15"},
    {"flavor": "Classic Chocolate", "rating": 4.6, "customer_id": "CUST051", "rating_date": "2025-04-16"},
    {"flavor": "Strawberry Swirl", "rating": 4.3, "customer_id": "CUST052", "rating_date": "2025-04-17"},
    {"flavor": "Vanilla Bean", "rating": 3.8, "customer_id": "CUST053", "rating_date": "2025-04-18"},
    {"flavor": "Caramel Delight", "rating": None, "customer_id": "CUST054", "rating_date": "2025-06-19"},
    {"flavor": "Mocha Bean", "rating": 4.7, "customer_id": "CUST055", "rating_date": "2025-06-20"},
    {"flavor": "Classic Chocolate", "rating": 4, "customer_id": "CUST056", "rating_date": "2025-06-21"},
    {"flavor": "Strawberry Swirl", "rating": 4.2, "customer_id": "CUST057", "rating_date": "2025-06-22"},
    {"flavor": "Vanilla Bean", "rating": 3.6, "customer_id": "CUST058", "rating_date": "2025-06-23"},
    {"flavor": "Caramel Delight", "rating": 4, "customer_id": "CUST059", "rating_date": "2025-06-24"},
]

# One row per record; columns follow the dict keys (flavor, rating,
# customer_id, rating_date).
milkshake_ratings = pd.DataFrame(milkshake_ratings_data)


## Question 1

There was an error in our data collection process, and we unknowingly introduced duplicate rows into our data. Remove any duplicate entries in the customer ratings data to ensure the accuracy of the analysis.

In [None]:
import pandas as pd

# Survey responses as [flavor, rating, customer_id, rating_date] rows;
# None marks a response with no rating recorded.
data = [
    ["Classic Chocolate", 4.5, "CUST001", "2024-07-05"],
    ["Strawberry Swirl", 3.8, "CUST002", "2024-07-10"],
    ["Vanilla Bean", 4.2, "CUST003", "2024-07-15"],
    ["Caramel Delight", 3.5, "CUST004", "2024-07-20"],
    ["Mocha Bean", None, "CUST005", "2024-07-25"],
    ["Classic Chocolate", 4.5, "CUST001", "2024-07-05"],  # exact repeat of row 0
    ["Classic Chocolate", 5, "CUST006", "2024-08-01"],
    ["Strawberry Swirl", 4, "CUST007", "2024-08-02"],
    ["Vanilla Bean", 3.9, "CUST008", "2024-08-03"],
    ["Caramel Delight", 4.8, "CUST009", "2024-10-04"],
    ["Mocha Bean", 2.5, "CUST010", "2024-09-05"],
    ["Classic Chocolate", 4.7, "CUST011", "2024-10-06"],
    ["Strawberry Swirl", None, "CUST012", "2024-10-07"],
    ["Vanilla Bean", 4.3, "CUST013", "2024-10-08"],
    ["Caramel Delight", 4.9, "CUST014", "2024-10-09"],
    ["Mocha Bean", 3.3, "CUST015", "2024-08-10"],
    ["Classic Chocolate", 1, "CUST016", "2024-08-11"],
    ["Strawberry Swirl", 6, "CUST017", "2024-08-12"],
    ["Vanilla Bean", 3, "CUST018", "2024-08-13"],
    ["Caramel Delight", 4.2, "CUST019", "2024-08-14"],
    ["Mocha Bean", 4.1, "CUST020", "2024-08-15"],
    ["Classic Chocolate", 3.7, "CUST021", "2024-08-16"],
    ["Strawberry Swirl", 3.9, "CUST022", "2024-08-17"],
    ["Vanilla Bean", 4.4, "CUST023", "2024-08-18"],
    ["Caramel Delight", 3.6, "CUST024", "2024-08-19"],
    ["Mocha Bean", None, "CUST025", "2024-08-20"],
    ["Classic Chocolate", 4.8, "CUST026", "2024-08-21"],
    ["Strawberry Swirl", 4.6, "CUST027", "2024-08-22"],
    ["Vanilla Bean", 4, "CUST028", "2024-08-23"],
    ["Caramel Delight", 4.4, "CUST029", "2024-08-24"],
    ["Mocha Bean", 3.2, "CUST030", "2024-11-25"],
    ["Classic Chocolate", 4.9, "CUST031", "2024-11-26"],
    ["Strawberry Swirl", 4.1, "CUST032", "2024-11-27"],
    ["Vanilla Bean", 3.3, "CUST033", "2024-11-28"],
    ["Caramel Delight", 3.8, "CUST034", "2024-11-29"],
    ["Mocha Bean", 4, "CUST035", "2024-11-30"],
    ["Classic Chocolate", 4.3, "CUST036", "2024-12-01"],
    ["Strawberry Swirl", None, "CUST037", "2024-12-02"],
    ["Vanilla Bean", 3.7, "CUST038", "2024-12-03"],
    ["Caramel Delight", 4.5, "CUST039", "2024-12-04"],
    ["Mocha Bean", 3.9, "CUST040", "2024-12-05"],
    ["Classic Chocolate", 4.4, "CUST041", "2024-12-06"],
    ["Strawberry Swirl", 3.5, "CUST042", "2024-12-07"],
    ["Vanilla Bean", 4.6, "CUST043", "2024-12-08"],
    ["Caramel Delight", 4.2, "CUST044", "2025-02-09"],
    ["Mocha Bean", 3.4, "CUST045", "2025-02-10"],
    ["Classic Chocolate", None, "CUST046", "2025-02-11"],
    ["Strawberry Swirl", 4, "CUST047", "2025-02-12"],
    ["Vanilla Bean", 4.1, "CUST048", "2025-02-13"],
    ["Caramel Delight", 4.3, "CUST049", "2025-04-14"],
    ["Mocha Bean", 3.7, "CUST050", "2025-04-15"],
    ["Classic Chocolate", 4.6, "CUST051", "2025-04-16"],
    ["Strawberry Swirl", 4.3, "CUST052", "2025-04-17"],
    ["Vanilla Bean", 3.8, "CUST053", "2025-04-18"],
    ["Caramel Delight", None, "CUST054", "2025-06-19"],
    ["Mocha Bean", 4.7, "CUST055", "2025-06-20"],
    ["Classic Chocolate", 4, "CUST056", "2025-06-21"],
    ["Strawberry Swirl", 4.2, "CUST057", "2025-06-22"],
    ["Vanilla Bean", 3.6, "CUST058", "2025-06-23"],
    ["Caramel Delight", 4, "CUST059", "2025-06-24"],
]

# Load the rows into a DataFrame with named columns.
df = pd.DataFrame(
    data,
    columns=["flavor", "rating", "customer_id", "rating_date"],
)

# Keep the first occurrence of every row and discard later rows that match
# it in all four columns.
df_no_duplicates = df[~df.duplicated(keep="first")]

# Report the row counts before and after, then show the cleaned table.
print("Original number of rows:", df.shape[0])
print("Number of rows after removing duplicates:", df_no_duplicates.shape[0])
print("\nCleaned dataset:")
print(df_no_duplicates)

## Question 2

For each milkshake flavor, calculate the average customer rating and append this as a new column to the milkshake_ratings DataFrame. Don't forget to clean the DataFrame first by dropping duplicate values.

In [None]:
import pandas as pd

# Survey responses as [flavor, rating, customer_id, rating_date] rows;
# None marks a response with no rating recorded.
data = [
    ["Classic Chocolate", 4.5, "CUST001", "2024-07-05"],
    ["Strawberry Swirl", 3.8, "CUST002", "2024-07-10"],
    ["Vanilla Bean", 4.2, "CUST003", "2024-07-15"],
    ["Caramel Delight", 3.5, "CUST004", "2024-07-20"],
    ["Mocha Bean", None, "CUST005", "2024-07-25"],
    ["Classic Chocolate", 4.5, "CUST001", "2024-07-05"],  # duplicate
    ["Classic Chocolate", 5, "CUST006", "2024-08-01"],
    ["Strawberry Swirl", 4, "CUST007", "2024-08-02"],
    ["Vanilla Bean", 3.9, "CUST008", "2024-08-03"],
    ["Caramel Delight", 4.8, "CUST009", "2024-10-04"],
    ["Mocha Bean", 2.5, "CUST010", "2024-09-05"],
    ["Classic Chocolate", 4.7, "CUST011", "2024-10-06"],
    ["Strawberry Swirl", None, "CUST012", "2024-10-07"],
    ["Vanilla Bean", 4.3, "CUST013", "2024-10-08"],
    ["Caramel Delight", 4.9, "CUST014", "2024-10-09"],
    ["Mocha Bean", 3.3, "CUST015", "2024-08-10"],
    ["Classic Chocolate", 1, "CUST016", "2024-08-11"],
    ["Strawberry Swirl", 6, "CUST017", "2024-08-12"],
    ["Vanilla Bean", 3, "CUST018", "2024-08-13"],
    ["Caramel Delight", 4.2, "CUST019", "2024-08-14"],
    ["Mocha Bean", 4.1, "CUST020", "2024-08-15"],
    ["Classic Chocolate", 3.7, "CUST021", "2024-08-16"],
    ["Strawberry Swirl", 3.9, "CUST022", "2024-08-17"],
    ["Vanilla Bean", 4.4, "CUST023", "2024-08-18"],
    ["Caramel Delight", 3.6, "CUST024", "2024-08-19"],
    ["Mocha Bean", None, "CUST025", "2024-08-20"],
    ["Classic Chocolate", 4.8, "CUST026", "2024-08-21"],
    ["Strawberry Swirl", 4.6, "CUST027", "2024-08-22"],
    ["Vanilla Bean", 4, "CUST028", "2024-08-23"],
    ["Caramel Delight", 4.4, "CUST029", "2024-08-24"],
    ["Mocha Bean", 3.2, "CUST030", "2024-11-25"],
    ["Classic Chocolate", 4.9, "CUST031", "2024-11-26"],
    ["Strawberry Swirl", 4.1, "CUST032", "2024-11-27"],
    ["Vanilla Bean", 3.3, "CUST033", "2024-11-28"],
    ["Caramel Delight", 3.8, "CUST034", "2024-11-29"],
    ["Mocha Bean", 4, "CUST035", "2024-11-30"],
    ["Classic Chocolate", 4.3, "CUST036", "2024-12-01"],
    ["Strawberry Swirl", None, "CUST037", "2024-12-02"],
    ["Vanilla Bean", 3.7, "CUST038", "2024-12-03"],
    ["Caramel Delight", 4.5, "CUST039", "2024-12-04"],
    ["Mocha Bean", 3.9, "CUST040", "2024-12-05"],
    ["Classic Chocolate", 4.4, "CUST041", "2024-12-06"],
    ["Strawberry Swirl", 3.5, "CUST042", "2024-12-07"],
    ["Vanilla Bean", 4.6, "CUST043", "2024-12-08"],
    ["Caramel Delight", 4.2, "CUST044", "2025-02-09"],
    ["Mocha Bean", 3.4, "CUST045", "2025-02-10"],
    ["Classic Chocolate", None, "CUST046", "2025-02-11"],
    ["Strawberry Swirl", 4, "CUST047", "2025-02-12"],
    ["Vanilla Bean", 4.1, "CUST048", "2025-02-13"],
    ["Caramel Delight", 4.3, "CUST049", "2025-04-14"],
    ["Mocha Bean", 3.7, "CUST050", "2025-04-15"],
    ["Classic Chocolate", 4.6, "CUST051", "2025-04-16"],
    ["Strawberry Swirl", 4.3, "CUST052", "2025-04-17"],
    ["Vanilla Bean", 3.8, "CUST053", "2025-04-18"],
    ["Caramel Delight", None, "CUST054", "2025-06-19"],
    ["Mocha Bean", 4.7, "CUST055", "2025-06-20"],
    ["Classic Chocolate", 4, "CUST056", "2025-06-21"],
    ["Strawberry Swirl", 4.2, "CUST057", "2025-06-22"],
    ["Vanilla Bean", 3.6, "CUST058", "2025-06-23"],
    ["Caramel Delight", 4, "CUST059", "2025-06-24"],
]

# Create DataFrame
df = pd.DataFrame(data, columns=["flavor", "rating", "customer_id", "rating_date"])

# 1. Remove exact duplicates. The explicit .copy() makes df_clean an
#    independent frame, so adding a column below is not a write to a possible
#    view of df (which emits SettingWithCopyWarning on pandas versions without
#    Copy-on-Write).
df_clean = df.drop_duplicates().copy()

# 2. Calculate the average rating for each flavor. mean() skips NaN ratings by
#    default; dropna=False only controls whether a missing *flavor* would form
#    its own group instead of being dropped.
avg_ratings = df_clean.groupby("flavor", dropna=False)["rating"].mean()

# 3. Append the flavor's average as a new column by looking up each row's
#    flavor in the per-flavor averages.
df_clean["average_rating"] = df_clean["flavor"].map(avg_ratings)

print(df_clean)

## Question 3

For each row in the dataset, calculate the difference between that customer's rating and the average rating for the flavor. Don't forget to clean the DataFrame first by dropping duplicate values.

In [None]:
import pandas as pd

# Survey responses as [flavor, rating, customer_id, rating_date] rows;
# None marks a response with no rating recorded.
data = [
    ["Classic Chocolate", 4.5, "CUST001", "2024-07-05"],
    ["Strawberry Swirl", 3.8, "CUST002", "2024-07-10"],
    ["Vanilla Bean", 4.2, "CUST003", "2024-07-15"],
    ["Caramel Delight", 3.5, "CUST004", "2024-07-20"],
    ["Mocha Bean", None, "CUST005", "2024-07-25"],
    ["Classic Chocolate", 4.5, "CUST001", "2024-07-05"],  # exact repeat of row 0
    ["Classic Chocolate", 5, "CUST006", "2024-08-01"],
    ["Strawberry Swirl", 4, "CUST007", "2024-08-02"],
    ["Vanilla Bean", 3.9, "CUST008", "2024-08-03"],
    ["Caramel Delight", 4.8, "CUST009", "2024-10-04"],
    ["Mocha Bean", 2.5, "CUST010", "2024-09-05"],
    ["Classic Chocolate", 4.7, "CUST011", "2024-10-06"],
    ["Strawberry Swirl", None, "CUST012", "2024-10-07"],
    ["Vanilla Bean", 4.3, "CUST013", "2024-10-08"],
    ["Caramel Delight", 4.9, "CUST014", "2024-10-09"],
    ["Mocha Bean", 3.3, "CUST015", "2024-08-10"],
    ["Classic Chocolate", 1, "CUST016", "2024-08-11"],
    ["Strawberry Swirl", 6, "CUST017", "2024-08-12"],
    ["Vanilla Bean", 3, "CUST018", "2024-08-13"],
    ["Caramel Delight", 4.2, "CUST019", "2024-08-14"],
    ["Mocha Bean", 4.1, "CUST020", "2024-08-15"],
    ["Classic Chocolate", 3.7, "CUST021", "2024-08-16"],
    ["Strawberry Swirl", 3.9, "CUST022", "2024-08-17"],
    ["Vanilla Bean", 4.4, "CUST023", "2024-08-18"],
    ["Caramel Delight", 3.6, "CUST024", "2024-08-19"],
    ["Mocha Bean", None, "CUST025", "2024-08-20"],
    ["Classic Chocolate", 4.8, "CUST026", "2024-08-21"],
    ["Strawberry Swirl", 4.6, "CUST027", "2024-08-22"],
    ["Vanilla Bean", 4, "CUST028", "2024-08-23"],
    ["Caramel Delight", 4.4, "CUST029", "2024-08-24"],
    ["Mocha Bean", 3.2, "CUST030", "2024-11-25"],
    ["Classic Chocolate", 4.9, "CUST031", "2024-11-26"],
    ["Strawberry Swirl", 4.1, "CUST032", "2024-11-27"],
    ["Vanilla Bean", 3.3, "CUST033", "2024-11-28"],
    ["Caramel Delight", 3.8, "CUST034", "2024-11-29"],
    ["Mocha Bean", 4, "CUST035", "2024-11-30"],
    ["Classic Chocolate", 4.3, "CUST036", "2024-12-01"],
    ["Strawberry Swirl", None, "CUST037", "2024-12-02"],
    ["Vanilla Bean", 3.7, "CUST038", "2024-12-03"],
    ["Caramel Delight", 4.5, "CUST039", "2024-12-04"],
    ["Mocha Bean", 3.9, "CUST040", "2024-12-05"],
    ["Classic Chocolate", 4.4, "CUST041", "2024-12-06"],
    ["Strawberry Swirl", 3.5, "CUST042", "2024-12-07"],
    ["Vanilla Bean", 4.6, "CUST043", "2024-12-08"],
    ["Caramel Delight", 4.2, "CUST044", "2025-02-09"],
    ["Mocha Bean", 3.4, "CUST045", "2025-02-10"],
    ["Classic Chocolate", None, "CUST046", "2025-02-11"],
    ["Strawberry Swirl", 4, "CUST047", "2025-02-12"],
    ["Vanilla Bean", 4.1, "CUST048", "2025-02-13"],
    ["Caramel Delight", 4.3, "CUST049", "2025-04-14"],
    ["Mocha Bean", 3.7, "CUST050", "2025-04-15"],
    ["Classic Chocolate", 4.6, "CUST051", "2025-04-16"],
    ["Strawberry Swirl", 4.3, "CUST052", "2025-04-17"],
    ["Vanilla Bean", 3.8, "CUST053", "2025-04-18"],
    ["Caramel Delight", None, "CUST054", "2025-06-19"],
    ["Mocha Bean", 4.7, "CUST055", "2025-06-20"],
    ["Classic Chocolate", 4, "CUST056", "2025-06-21"],
    ["Strawberry Swirl", 4.2, "CUST057", "2025-06-22"],
    ["Vanilla Bean", 3.6, "CUST058", "2025-06-23"],
    ["Caramel Delight", 4, "CUST059", "2025-06-24"],
]

# Load the rows into a DataFrame with named columns.
df = pd.DataFrame(
    data,
    columns=["flavor", "rating", "customer_id", "rating_date"],
)

# Discard rows that repeat an earlier row in every column.
df_clean = df.drop_duplicates()

# Per-flavor mean rating as a two-column table (flavor, average_rating);
# mean() leaves NaN ratings out of each flavor's average.
avg_ratings = (
    df_clean.groupby("flavor", dropna=False)["rating"]
    .mean()
    .rename("average_rating")
    .reset_index()
)

# Attach each flavor's average to its rows; a left join keeps every cleaned
# row in its original order.
df_with_avg = df_clean.merge(avg_ratings, how="left", on="flavor")

# Positive values mean the customer rated the flavor above its average;
# rows with a missing rating stay NaN.
df_with_avg["rating_difference"] = df_with_avg["rating"].sub(
    df_with_avg["average_rating"]
)

print(df_with_avg)

Made with ❤️ by [Interview Master](https://www.interviewmaster.ai)