In [1]:
import numpy as np

# Reproducibility settings for the whole notebook: `seed_value` seeds the
# random generator (and is embedded in the ticket IDs), `n` is the number of
# synthetic tickets to generate.
seed_value, n = 308, 320
rng = np.random.default_rng(seed=seed_value)

In [2]:
# Lookup tables for the synthetic ticket generator. `route_adj` is parallel to
# `routes` and `class_adj` is parallel to `classes`: the value at the same
# index is the amount added to the ticket price.
routes = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"]
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
classes = ["economy", "premium", "business"]
route_adj = [140, 220, 60, 180, 80]
class_adj = [0, 80, 220]


def _make_ticket(seq):
    """Build the synthetic ticket record for the 1-based sequence number `seq`.

    Route, weekday, days-to-departure and class are all derived
    deterministically from `seq` and `seed_value`; only the price noise comes
    from `rng` (exactly one normal draw per ticket).

    Three kinds of anomalies are injected on purpose-looking schedules:
    every 28th ticket gets an empty-string price, every 45th ticket gets its
    price multiplied by -1, and every 37th ticket gets an upper-cased class.
    """
    shifted = seq + seed_value
    route_idx = shifted % 5
    class_idx = (seq * 2 + seed_value) % 3
    lead_days = 1 + ((seq * 3 + seed_value) % 60)

    # Price model: 120 base, minus 1.5 per day until departure, plus the route
    # and class adjustments, plus Gaussian noise with standard deviation 25.
    fare = round(
        120 + (lead_days * -1.5)
        + route_adj[route_idx]
        + class_adj[class_idx]
        + rng.normal(0, 25),
        2,
    )

    if seq % 28 == 0:
        fare = ""
    if seq % 45 == 0:
        # If the price was already blanked above, "" * -1 is still "".
        fare = fare * -1

    cabin = classes[class_idx]
    if seq % 37 == 0:
        cabin = cabin.upper()

    return {
        "ticket_id": f"T{seed_value}-{seq:04d}",
        "route": routes[route_idx],
        "day": days[shifted % 7],
        "days_to_departure": lead_days,
        "class": cabin,
        "price_usd": fare,
    }


# Tickets are generated in ascending sequence order so the noise draws are
# consumed from `rng` in a fixed, reproducible order.
tickets = [_make_ticket(seq) for seq in range(1, n + 1)]

print(len(tickets))
print(tickets[:5])

320
[{'ticket_id': 'T308-0001', 'route': 'MAD-ROM', 'day': 'Tue', 'days_to_departure': 12, 'class': 'premium', 'price_usd': 291.58}, {'ticket_id': 'T308-0002', 'route': 'NYC-LAX', 'day': 'Wed', 'days_to_departure': 15, 'class': 'economy', 'price_usd': 244.06}, {'ticket_id': 'T308-0003', 'route': 'LHR-JFK', 'day': 'Thu', 'days_to_departure': 18, 'class': 'business', 'price_usd': 495.87}, {'ticket_id': 'T308-0004', 'route': 'SFO-SEA', 'day': 'Fri', 'days_to_departure': 21, 'class': 'premium', 'price_usd': 229.51}, {'ticket_id': 'T308-0005', 'route': 'DXB-SIN', 'day': 'Sat', 'days_to_departure': 24, 'class': 'economy', 'price_usd': 263.97}]


In [3]:
def _has_valid_price(record):
    """Return True when the record's `price_usd` is a strictly positive number.

    Blank-string prices fail the type check and negative (or zero) prices fail
    the sign check, so both kinds of bad records are rejected.
    """
    price = record["price_usd"]
    return isinstance(price, (int, float)) and price > 0


# Build NEW dicts for the cleaned records instead of lower-casing "class" in
# place: the raw `tickets` list stays exactly as generated, so anything already
# displayed from it remains accurate and this cell can be re-run safely.
cleaned_tickets = [
    {**ticket, "class": ticket["class"].lower()}
    for ticket in tickets
    if _has_valid_price(ticket)
]

print("Original records:", len(tickets))
print("Cleaned records:", len(cleaned_tickets))

# Sanity check using the SAME predicate as the filter (the earlier check used
# `< 0`, which would have let a zero price through) and failing loudly instead
# of printing a message that is easy to miss.
assert all(_has_valid_price(ticket) for ticket in cleaned_tickets), "Invalid found!"
assert all(ticket["class"] == ticket["class"].lower() for ticket in cleaned_tickets)

print(cleaned_tickets[:2])

Original records: 320
Cleaned records: 302
[{'ticket_id': 'T308-0001', 'route': 'MAD-ROM', 'day': 'Tue', 'days_to_departure': 12, 'class': 'premium', 'price_usd': 291.58}, {'ticket_id': 'T308-0002', 'route': 'NYC-LAX', 'day': 'Wed', 'days_to_departure': 15, 'class': 'economy', 'price_usd': 244.06}]


In [4]:
# Collect the prices of the cleaned records into a NumPy array so the
# statistics in the following cells can use vectorised operations.
price_values = [record["price_usd"] for record in cleaned_tickets]
prices = np.array(price_values)
print(prices.shape, prices.dtype, sep="\n")

(302,)
float64


In [5]:
# NOTE: this rebinds `days` from the weekday-name list used by the generator
# cell to a per-ticket array of weekday labels aligned index-for-index with
# `prices`. Later cells rely on the array form.
days = np.array([record["day"] for record in cleaned_tickets])

# ndarray.std() uses ddof=0 by default, i.e. the population standard deviation.
mean_price = prices.mean()
std_price = prices.std()

print("Mean price:", round(mean_price, 2))
print("Std deviation:", round(std_price, 2))

Mean price: 306.2
Std deviation: 113.46


In [6]:
# Guard against stale notebook state: the generator cell also defines a plain
# list called `days`. If that list is what is currently bound, `days == day`
# evaluates to a single False and every daily total would silently be zero.
if not isinstance(days, np.ndarray) or days.shape != prices.shape:
    raise ValueError(
        "`days` must be a NumPy array aligned with `prices`; "
        "re-run the cell that builds it from `cleaned_tickets`."
    )

unique_days = np.unique(days)

# Per-weekday aggregates. The boolean mask for each weekday is computed once
# and reused for both the revenue sum and the ticket count.
revenue_per_day = {}
tickets_per_day = {}
for day in unique_days:
    on_day = days == day
    revenue_per_day[day] = prices[on_day].sum()
    tickets_per_day[day] = np.sum(on_day)


In [7]:
# Cross-check the per-day breakdown: the daily revenues must add up to the
# overall revenue (compared with a floating-point tolerance).
daily_total_sum = sum(revenue_per_day.values())
total_revenue = prices.sum()
revenues_agree = np.isclose(total_revenue, daily_total_sum)

print("Overall revenue:", round(total_revenue, 2))
print("Sum of daily revenues:", round(daily_total_sum, 2))
print("Validation:", revenues_agree)

Overall revenue: 92471.79
Sum of daily revenues: 92471.79
Validation: True


In [8]:
# A ticket counts as "high price" when its price is at or above the 90th
# percentile of all cleaned prices; the mask is aligned with `prices`.
threshold = np.percentile(prices, q=90)
high_price_mask = prices >= threshold

In [9]:
# Apply the high-price mask twice: once to the price array and once to the
# list of ticket records, which is walked in step with the mask.
high_prices = prices[high_price_mask]

high_price_tickets = []
for record, is_high in zip(cleaned_tickets, high_price_mask):
    if is_high:
        high_price_tickets.append(record)

In [10]:
# Report the cut-off value and how many tickets reached it.
for label, value in (
    ("90th percentile threshold:", round(threshold, 2)),
    ("Number of high-price tickets:", len(high_prices)),
):
    print(label, value)

90th percentile threshold: 470.12
Number of high-price tickets: 31


In [11]:
# Every selected price must be at or above the threshold.
all_above_threshold = (high_prices >= threshold).all()
print("Validation:", all_above_threshold)

Validation: True


In [12]:
# Summary of the whole analysis. NumPy scalars are converted to plain Python
# floats/strings so the dict contains only built-in types (for example, the
# standard-library JSON encoder does not accept NumPy scalars).
report = {
    "total_tickets": len(tickets),
    "cleaned_tickets": len(cleaned_tickets),
    "mean_price": round(float(mean_price), 2),
    "std_price": round(float(std_price), 2),
    "daily_totals": {
        str(day): round(float(total), 2) for day, total in revenue_per_day.items()
    },
    "high_price_count": len(high_prices),
}

# Cleaning can only drop records, so the cleaned count can never exceed the
# original count.
validation_check = report["cleaned_tickets"] <= report["total_tickets"]

# Money is formatted with exactly two decimals (`:.2f`); printing the rounded
# number directly drops trailing zeros (e.g. "$306.2" instead of "$306.20").
report_lines = [
    "",
    "FINAL REPORT",
    "-------------------------",
    f"Total tickets: {report['total_tickets']}",
    f"Cleaned tickets: {report['cleaned_tickets']}",
    f"Mean price: ${report['mean_price']:.2f}",
    f"Std deviation: ${report['std_price']:.2f}",
    f"High-price ticket count: {report['high_price_count']}",
    "",
    "Daily revenue totals:",
]
report_lines.extend(
    f"  {day}: ${total:.2f}" for day, total in report["daily_totals"].items()
)
report_lines.append("")
report_lines.append(f"Validation (cleaned ≤ total): {validation_check}")

report_str = "\n".join(report_lines)
print(report_str)


FINAL REPORT
-------------------------
Total tickets: 320
Cleaned tickets: 302
Mean price: $306.2
Std deviation: $113.46
High-price ticket count: 31

Daily revenue totals:
  Fri: $13925.09
  Mon: $10118.8
  Sat: $13801.38
  Sun: $13351.04
  Thu: $13877.36
  Tue: $13778.71
  Wed: $13619.41

Validation (cleaned ≤ total): True
