In [2]:
# 1 
import numpy as np
# Build a synthetic list of ticket records. Every field except the price noise
# is a pure function of the record index `i` and `seed_value`; the noise comes
# from a generator seeded with `seed_value`, drawn once per record in index order.
seed_value = 212
n = 320
rng = np.random.default_rng(seed_value)

# Lookup tables never change between iterations, so they are built once here
# instead of on every pass through the loop.
routes = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"]
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
classes = ["economy", "premium", "business"]
route_adj = [140, 220, 60, 180, 80]  # price offset per route, indexed like `routes`
class_adj = [0, 80, 220]  # price offset per class, indexed like `classes`

tickets = []
for i in range(1, n + 1):
    # Moduli follow the table lengths so the indices cannot drift out of range
    # if a table is ever edited.
    route_index = (i + seed_value) % len(routes)
    day_index = (i + seed_value) % len(days)
    class_index = (i * 2 + seed_value) % len(classes)
    days_to_departure = 1 + ((i * 3 + seed_value) % 60)
    class_alias = classes[class_index]

    # Price = base that falls by 1.5 per day to departure + route and class
    # offsets + Gaussian noise (mean 0, standard deviation 25).
    base = 120 + (days_to_departure * -1.5)
    noise = rng.normal(0, 25)
    price_usd = round(base + route_adj[route_index] + class_adj[class_index] + noise, 2)

    # Inject irregular records: every 28th price is blanked, every 45th price
    # is negated (only if it was not blanked), every 37th class is upper-cased.
    if i % 28 == 0:
        price_usd = ""
    elif i % 45 == 0:
        price_usd *= -1
    if i % 37 == 0:
        class_alias = class_alias.upper()

    # Key order is kept as ticket_id, route, day, days_to_departure, class, price_usd.
    ticket = {
        "ticket_id": f"T{seed_value}-{i:04d}",
        "route": routes[route_index],
        "day": days[day_index],
        "days_to_departure": days_to_departure,
        "class": class_alias,
        "price_usd": price_usd,
    }
    tickets.append(ticket)
print(len(tickets))
print(tickets[:5])

320
[{'ticket_id': 'T212-0001', 'route': 'DXB-SIN', 'day': 'Thu', 'days_to_departure': 36, 'class': 'premium', 'price_usd': 316.02}, {'ticket_id': 'T212-0002', 'route': 'MAD-ROM', 'day': 'Fri', 'days_to_departure': 39, 'class': 'economy', 'price_usd': 106.31}, {'ticket_id': 'T212-0003', 'route': 'NYC-LAX', 'day': 'Sat', 'days_to_departure': 42, 'class': 'business', 'price_usd': 447.35}, {'ticket_id': 'T212-0004', 'route': 'LHR-JFK', 'day': 'Sun', 'days_to_departure': 45, 'class': 'premium', 'price_usd': 360.26}, {'ticket_id': 'T212-0005', 'route': 'SFO-SEA', 'day': 'Mon', 'days_to_departure': 48, 'class': 'economy', 'price_usd': 136.38}]


In [3]:
# 2 
def clean_tickets(tickets):
    """Return cleaned copies of the tickets that carry a usable price.

    A ticket is kept only when its ``"price_usd"`` value is an ``int`` or
    ``float`` (booleans excluded) that is greater than or equal to zero.
    Non-numeric values such as the empty string, NaN, and negative numbers
    are dropped. Each kept ticket is shallow-copied and its ``"class"``
    value is lower-cased, so the input dicts are never modified.

    Args:
        tickets: Iterable of dicts, each providing ``"price_usd"`` and a
            string ``"class"`` entry.

    Returns:
        A new list with the cleaned ticket dicts, in input order.
    """
    cleaned_tickets = []
    for ticket in tickets:
        price = ticket["price_usd"]
        # bool is a subclass of int, so True/False would otherwise pass as prices.
        if isinstance(price, bool) or not isinstance(price, (int, float)):
            continue
        # Written as "not >= 0" rather than "< 0": NaN fails every comparison,
        # so this form drops NaN together with negative prices.
        if not price >= 0:
            continue
        new_ticket = ticket.copy()
        new_ticket["class"] = new_ticket["class"].lower()
        cleaned_tickets.append(new_ticket)
    return cleaned_tickets
# Run the cleaning pass, then preview it: record count first, then the first two records.
cleaned = clean_tickets(tickets)
print(len(cleaned), cleaned[:2], sep="\n")

302
[{'ticket_id': 'T212-0001', 'route': 'DXB-SIN', 'day': 'Thu', 'days_to_departure': 36, 'class': 'premium', 'price_usd': 316.02}, {'ticket_id': 'T212-0002', 'route': 'MAD-ROM', 'day': 'Fri', 'days_to_departure': 39, 'class': 'economy', 'price_usd': 106.31}]


In [4]:
# 3
# Pull the price and day columns out of the cleaned records as NumPy arrays.
prices = np.array([ticket["price_usd"] for ticket in cleaned], dtype=float)
days = np.array([ticket["day"] for ticket in cleaned], dtype=str)
mean_price = prices.mean()
std_price = prices.std()  # NumPy default ddof=0
# The grouping below reads the labels as `days_names`; reuse the array built
# above instead of constructing an identical one a second time.
days_names = days
# `indices[k]` is the position in `unique_days` of record k's day. For 1-D
# input the inverse is already a 1-D integer array, so it feeds bincount directly.
unique_days, indices = np.unique(days_names, return_inverse=True)
ticket_counts_per_day = np.bincount(indices)
revenue_per_day = np.bincount(indices, weights=prices)
# Sanity check: the per-day sums must add back up to the overall total.
total_revenue = prices.sum()
daily_sum_check = revenue_per_day.sum()
is_valid = np.isclose(total_revenue, daily_sum_check)
print(f"Validation: {is_valid}")

Validation: True


In [5]:
# 4
# Cut-off at the 90th percentile of the cleaned prices; only prices strictly
# greater than the cut-off are selected.
threshold = np.percentile(prices, 90)
high_price_tickets = prices[np.greater(prices, threshold)]
print(high_price_tickets)
# Map each day label to its revenue total rounded to two decimals.
daily_totals = {
    day_label: day_total
    for day_label, day_total in zip(unique_days, np.round(revenue_per_day, 2))
}

[515.88 493.17 535.48 533.17 470.58 486.17 493.54 478.07 499.37 478.47
 508.69 580.8  514.35 504.19 569.62 530.42 482.05 473.5  481.15 489.76
 528.77 490.81 526.61 469.84 529.44 484.94 495.73 504.89 510.48 559.05
 473.89]


In [6]:
# 5
import json

# Summary of the run. The mean and standard deviation are converted to
# built-in floats before rounding.
report = {
    "total_tickets": len(tickets),
    "cleaned_tickets": len(cleaned),
    "mean_price": round(float(mean_price), 2),
    "std_price": round(float(std_price), 2),
    "daily_totals": daily_totals,
    "high_price_count": len(high_price_tickets),
}
readable_report = json.dumps(report, indent=4)
print("--REPORT--")
print(readable_report)
print("\n--VALIDATION CHECKS--")
print(f"1. All daily totals match overall revenue: {is_valid}")
print(f"2. No negative prices in cleaned data: {np.all(prices >= 0)}")
# Test the selected array itself, with the same strict ">" that selected it.
# Re-filtering `prices` and comparing the result to the same bound is true by
# construction and would verify nothing.
print(f"3. All high-price tickets are above threshold: {np.all(high_price_tickets > threshold)}")

--REPORT--
{
    "total_tickets": 320,
    "cleaned_tickets": 302,
    "mean_price": 308.15,
    "std_price": 113.04,
    "daily_totals": {
        "Fri": 13351.49,
        "Mon": 13831.13,
        "Sat": 14375.35,
        "Sun": 13854.0,
        "Thu": 13681.53,
        "Tue": 13566.97,
        "Wed": 10399.85
    },
    "high_price_count": 31
}

--VALIDATION CHECKS--
1. All daily totals match overall revenue: True
2. No negative prices in cleaned data: True
3. All high-price tickets are above threshold: True
