In [None]:
import numpy as np

# Synthetic airline-ticket generator.
# Builds `n` ticket records whose route, weekday, fare class and lead time are
# derived deterministically from the ticket index and `seed_value`; only the
# price noise is random, drawn from the seeded generator `rng`.
seed_value = 1401
rng = np.random.default_rng(seed=seed_value)

routes = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"]
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
tclasses = ["economy", "premium", "business"]
n = 320

# Price adjustments, index-aligned with `routes` and `tclasses`. They are the
# same for every ticket, so they are built once instead of on every iteration.
route_adj = [140, 220, 60, 180, 80]
class_adj = [0, 80, 220]

tickets = []

for i in range(1, n + 1):
    # Indices cycle through the lookup lists; using len() keeps the modulus in
    # step with the lists above.
    route_id = (i + seed_value) % len(routes)
    day_id = (i + seed_value) % len(days)
    tclass = (i * 2 + seed_value) % len(tclasses)
    days_to_dep = 1 + ((i * 3 + seed_value) % 60)  # always in 1..60

    # Base fare decreases by 1.5 for each day of lead time.
    base = 120 + (days_to_dep * -1.5)
    noise = rng.normal(0, 25)  # exactly one draw per ticket, in index order
    total_price = round(base + route_adj[route_id] + class_adj[tclass] + noise, 2)

    ticket = {
        "ticket_id": f"T{seed_value}-{i:04d}",
        "route": routes[route_id],
        "day": days[day_id],
        "days_to_departure": days_to_dep,
        "class": tclasses[tclass],
        "price_usd": total_price,
    }

    # Irregular rows: every 28th ticket gets an empty-string price, every 45th
    # a negated price, and every 37th an upper-cased class label. The checks
    # are independent, so a later one overrides an earlier one if both match.
    if i % 28 == 0:
        ticket["price_usd"] = ""

    if i % 45 == 0:
        ticket["price_usd"] = total_price * -1

    if i % 37 == 0:
        ticket["class"] = ticket["class"].upper()

    tickets.append(ticket)

print("Total records count:", len(tickets), "\n")
print(*tickets[:5], sep="\n")


In [None]:
# Clean the raw tickets: drop rows whose price is not numeric or is negative,
# and normalise the fare-class label to lower case.
cleaned_tickets = []

for ticket in tickets:
    price = ticket["price_usd"]

    # Non-numeric prices (e.g. the empty string) cannot be used in statistics.
    if not isinstance(price, (int, float)):
        continue

    if price < 0:
        continue

    # Build a new record instead of editing `ticket` in place, so the raw
    # `tickets` list is left untouched and this cell can be re-run safely.
    cleaned_tickets.append({**ticket, "class": ticket["class"].lower()})



print("Cleaned count:", len(cleaned_tickets))
print(*cleaned_tickets[:2], sep="\n")

In [4]:
# Summary statistics over the cleaned tickets.
prices = np.array([t["price_usd"] for t in cleaned_tickets])
# Stored under its own name so the weekday-name list `days` defined with the
# ticket generator is not overwritten by an array with a different meaning.
days_to_departure = np.array([t["days_to_departure"] for t in cleaned_tickets])

price_mean = prices.mean()
price_std = prices.std()  # NumPy default (ddof=0), i.e. population std

# Group revenue by distinct days-to-departure value: `inverse_idx` maps every
# ticket to the position of its value inside `unique_days`.
unique_days, inverse_idx = np.unique(days_to_departure, return_inverse=True)
revenue_per_day = np.bincount(inverse_idx, weights=prices)
counts_per_day = np.bincount(inverse_idx)

# Sanity check: the grouped totals must add up to the overall revenue.
total_revenue = prices.sum()
daily_total = revenue_per_day.sum()
check = np.isclose(total_revenue, daily_total)
assert check, "Grouped revenue does not add up to total revenue"

In [5]:

# Select the tickets priced at or above the 0.9 quantile of all cleaned prices.
threshold = np.quantile(prices, 0.9)
is_high = prices >= threshold
high_price_tickets = prices[is_high]
count_high = len(high_price_tickets)

# Every selected price must satisfy the cut-off it was selected by.
assert (high_price_tickets >= threshold).all()

In [8]:
# Gather the figures computed in the earlier cells into one report mapping.
daily_totals = dict(zip(unique_days, revenue_per_day))
report = dict(
    total_tickets=len(tickets),
    cleaned_tickets=len(cleaned_tickets),
    mean_price=price_mean,
    std_price=price_std,
    daily_totals=daily_totals,
    high_price_count=count_high,
)

# Header and scalar figures, printed one line at a time.
summary_lines = [
    "FINAL REPORT",
    "-" * 40,
    f"Total tickets: {report['total_tickets']}",
    f"Cleaned tickets: {report['cleaned_tickets']}",
    f"Mean price: {report['mean_price']:.2f}",
    f"Std price: {report['std_price']:.2f}",
    f"High price tickets (>=90th percentile): {report['high_price_count']}",
    "\nDaily revenue totals:",
]
for line in summary_lines:
    print(line)

# One line per distinct days-to-departure value with its summed revenue.
for day, total in report["daily_totals"].items():
    print(f"  Days to departure {day}: ${total:.2f}")


# Final consistency checks on the report.
assert report["total_tickets"] == 320, "Total tickets should be 320"

grouped_sum = sum(report["daily_totals"].values())
assert np.isclose(grouped_sum, prices.sum()), "Daily totals do not match overall revenue"

FINAL REPORT
----------------------------------------
Total tickets: 320
Cleaned tickets: 302
Mean price: 310.13
Std price: 113.50
High price tickets (>=90th percentile): 31

Daily revenue totals:
  Days to departure 1: $4763.52
  Days to departure 4: $5630.05
  Days to departure 7: $6129.88
  Days to departure 10: $3832.34
  Days to departure 13: $6027.86
  Days to departure 16: $4456.00
  Days to departure 19: $5503.31
  Days to departure 22: $5259.83
  Days to departure 25: $4007.08
  Days to departure 28: $5715.34
  Days to departure 31: $3769.14
  Days to departure 34: $4485.85
  Days to departure 37: $5364.30
  Days to departure 40: $3339.87
  Days to departure 43: $5399.38
  Days to departure 46: $2783.32
  Days to departure 49: $4517.04
  Days to departure 52: $5202.59
  Days to departure 55: $3280.77
  Days to departure 58: $4190.74
