In [2]:
import numpy as np

In [7]:
# Notebook-wide settings: the seed that drives both the deterministic ticket
# fields and the random noise, and the number of tickets to generate.
seed_value, n = 2411, 320

# Single NumPy Generator seeded from seed_value; used for the price noise.
rng = np.random.default_rng(seed=seed_value)



In [10]:
# Lookup tables for the synthetic fares. route_adj_list and class_adj_list are
# positional: entry k is the surcharge added for routes[k] / classes[k].
routes = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"]
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
classes = ["economy", "premium", "business"]
route_adj_list = [140, 220, 60, 180, 80]
class_adj_list = [0, 80, 220]

# Re-seed inside the cell so that re-running it regenerates exactly the same
# noise. Drawing from a generator created in an earlier cell makes every
# re-run produce different prices. On a fresh Restart & Run All this is
# identical to using the generator from the configuration cell.
rng = np.random.default_rng(seed_value)

tickets = []
for i in range(1, n + 1):
    # All categorical fields cycle deterministically with the ticket number.
    # The moduli follow the lookup-table lengths so the lists can be extended
    # without touching the loop.
    offset = i + seed_value
    route_idx = offset % len(routes)
    day_idx = offset % len(days)
    class_idx = offset % len(classes)

    ticket_id = f"T{seed_value}-{i:04d}"
    days_to_departure = 1 + ((i * 3 + seed_value) % 60)
    ticket_class = classes[class_idx]

    # Equivalent to the original `120 - (days_to_departure * -1.5)`.
    # NOTE(review): as written the base fare *rises* with lead time; confirm
    # whether `120 - 1.5 * days_to_departure` was intended.
    base = 120 + 1.5 * days_to_departure
    noise = float(rng.normal(0, 25))
    price_usd = round(
        base + noise + route_adj_list[route_idx] + class_adj_list[class_idx], 2
    )

    # Deliberately injected data-quality problems for the cleaning step:
    if i % 28 == 0:
        # every 28th ticket has a missing price (empty string)
        price_usd = ""
    if i % 45 == 0 and isinstance(price_usd, (int, float)):
        # every 45th ticket has its (numeric) price sign-flipped
        price_usd = price_usd * -1
    if i % 37 == 0:
        # every 37th ticket has its class label upper-cased
        ticket_class = ticket_class.upper()

    tickets.append({
        "ticket_id": ticket_id,
        "route": routes[route_idx],
        "day": days[day_idx],
        "days_to_departure": days_to_departure,
        "class": ticket_class,
        "price_usd": price_usd,
    })

print(f"total numbers: {len(tickets)}\n")
print("first 5 entries: ")
for ticket in tickets[:5]:
    print(ticket)
    




total numbers: 320

first 5 entries: 
{'ticket_id': 'T2411-0001', 'route': 'SFO-SEA', 'day': 'Fri', 'days_to_departure': 15, 'class': 'economy', 'price_usd': 165.73}
{'ticket_id': 'T2411-0002', 'route': 'DXB-SIN', 'day': 'Sat', 'days_to_departure': 18, 'class': 'premium', 'price_usd': 411.84}
{'ticket_id': 'T2411-0003', 'route': 'MAD-ROM', 'day': 'Sun', 'days_to_departure': 21, 'class': 'business', 'price_usd': 460.31}
{'ticket_id': 'T2411-0004', 'route': 'NYC-LAX', 'day': 'Mon', 'days_to_departure': 24, 'class': 'economy', 'price_usd': 264.72}
{'ticket_id': 'T2411-0005', 'route': 'LHR-JFK', 'day': 'Tue', 'days_to_departure': 27, 'class': 'premium', 'price_usd': 508.53}


In [None]:
# Build the cleaned data set: keep only tickets whose price is a non-negative
# number and normalise the class label to lower case.
cleaned_tickets = []

for t in tickets:
    price = t["price_usd"]
    # A missing price is stored as "", which the isinstance test already
    # rejects, so no separate empty-string comparison is needed.
    if not isinstance(price, (int, float)) or price < 0:
        continue
    # Append a copy rather than editing the record in place: the dicts are
    # shared with `tickets`, and mutating them would silently alter the raw
    # data set that earlier cells displayed.
    cleaned_tickets.append({**t, "class": t["class"].lower()})

# Sanity check using the same validity rule as the filter above; written so
# that an unexpected non-numeric value is counted instead of raising.
invalid_prices_left = sum(
    1
    for t in cleaned_tickets
    if not isinstance(t["price_usd"], (int, float)) or t["price_usd"] < 0
)

print(f"Cleaned tickets count: {len(cleaned_tickets)}")
print(f"Invalid prices remaining: {invalid_prices_left}")
print("Sample cleaned records:")
print(cleaned_tickets[:2])
print("\n")



Cleaned tickets count: 302
Invalid prices remaining: 0
Sample cleaned records:
[{'ticket_id': 'T2411-0001', 'route': 'SFO-SEA', 'day': 'Fri', 'days_to_departure': 15, 'class': 'economy', 'price_usd': 165.73}, {'ticket_id': 'T2411-0002', 'route': 'DXB-SIN', 'day': 'Sat', 'days_to_departure': 18, 'class': 'premium', 'price_usd': 411.84}]




In [None]:
# Vectorised summary statistics over the cleaned tickets.
prices = np.array([t["price_usd"] for t in cleaned_tickets], dtype=float)
# Named `ticket_days` on purpose: the generation cell owns a list called
# `days` (the weekday labels), and rebinding that name here would make a
# re-run of the generation cell index into this array instead.
ticket_days = np.array([t["day"] for t in cleaned_tickets])

mean_price = np.mean(prices)
std_price = np.std(prices)  # population standard deviation (ddof=0)

# Group by weekday: np.unique returns the sorted distinct labels plus, for
# each ticket, the index of its label; bincount then counts / sums per label.
unique_days, day_indices = np.unique(ticket_days, return_inverse=True)
daily_counts = np.bincount(day_indices)
daily_revenue = np.bincount(day_indices, weights=prices)

# Convert NumPy scalars to built-in str/int/float so the report prints
# `17780.92` rather than `np.float64(17780.92)`.
daily_totals = {
    str(day): {"count": int(cnt), "revenue": round(float(rev), 2)}
    for day, cnt, rev in zip(unique_days, daily_counts, daily_revenue)
}

# Cross-check: the per-day revenues must add up to the overall revenue.
total_revenue_overall = np.sum(prices)
total_revenue_daily = np.sum(daily_revenue)
revenue_match = bool(np.isclose(total_revenue_overall, total_revenue_daily))

print(f"Revenue validation match (Overall == Daily Sums): {revenue_match}\n")

Revenue validation match (Overall == Daily Sums): True



In [24]:
# Flag the most expensive tickets: everything at or above the 90th percentile.
threshold = np.quantile(prices, 0.9)

is_high = prices >= threshold
high_prices = prices[is_high]
high_price_count = len(high_prices)

# Checking only `high_prices >= threshold` is true by construction, so also
# verify the other half of the partition: every price that was *not* selected
# must lie strictly below the threshold (this also catches NaN prices).
all_valid = bool(
    np.all(high_prices >= threshold) and np.all(prices[~is_high] < threshold)
)
print(f"90th Percentile Threshold: ${threshold:.2f}")
print(f"Verification (All high prices >= threshold): {all_valid}\n")

90th Percentile Threshold: $551.70
Verification (All high prices >= threshold): True



In [25]:
# Assemble the final summary from the values computed in the earlier cells.
# Scalars are converted to built-in floats so the dict holds plain Python
# values rather than NumPy scalar objects.
report = {
    "total_tickets": len(tickets),
    "cleaned_tickets": len(cleaned_tickets),
    "mean_price": round(float(mean_price), 2),
    "std_price": round(float(std_price), 2),
    "daily_totals": daily_totals,
    "high_price_count": high_price_count
}

print("--- FINAL REPORT ---")
for key, value in report.items():
    if key == "daily_totals":
        # Nested per-day stats get one indented line each.
        print(f"{key}:")
        for day, stats in value.items():
            print(f"  {day}: {stats}")
    else:
        print(f"{key}: {value}")

# Only claim that the totals match when they actually do; previously the
# success message was printed unconditionally.
if np.isclose(total_revenue_daily, total_revenue_overall):
    print(f"\nExplicit Validation: The sum of the daily revenue arrays (${total_revenue_daily:,.2f}) perfectly matches the overall total revenue sum (${total_revenue_overall:,.2f}), confirming no data was dropped during the vectorized aggregation.")
else:
    print(f"\nExplicit Validation FAILED: The sum of the daily revenue arrays (${total_revenue_daily:,.2f}) does not match the overall total revenue sum (${total_revenue_overall:,.2f}).")

--- FINAL REPORT ---
total_tickets: 320
cleaned_tickets: 302
mean_price: 398.22
std_price: 110.62
daily_totals:
  Fri: {'count': 45, 'revenue': np.float64(17780.92)}
  Mon: {'count': 45, 'revenue': np.float64(17844.48)}
  Sat: {'count': 45, 'revenue': np.float64(17836.3)}
  Sun: {'count': 45, 'revenue': np.float64(17953.36)}
  Thu: {'count': 33, 'revenue': np.float64(13446.23)}
  Tue: {'count': 45, 'revenue': np.float64(17850.73)}
  Wed: {'count': 44, 'revenue': np.float64(17548.94)}
high_price_count: 31

Explicit Validation: The sum of the daily revenue arrays ($120,260.96) perfectly matches the overall total revenue sum ($120,260.96), confirming no data was dropped during the vectorized aggregation.
