In [3]:
import numpy as np

# Synthetic flight-ticket generator.
# Builds `n` ticket records whose categorical fields cycle deterministically
# with the record number and whose price carries seeded Gaussian noise, then
# plants a few known data-quality problems for the cleaning step to find.
seed_value = 1905
n = 320

rng = np.random.default_rng(seed_value)

routes = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"]
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
classes = ["economy", "premium", "business"]

# Additive price adjustments, index-aligned with `routes` and `classes`.
route_adj = [140, 220, 60, 180, 80]
class_adj = [0, 80, 220]

# The adjustment tables are looked up with the same index as their category
# list, so a length mismatch would mean a wrong or missing adjustment.
assert len(route_adj) == len(routes), "route_adj must align with routes"
assert len(class_adj) == len(classes), "class_adj must align with classes"

tickets = []

for i in range(1, n + 1):
    # Moduli come from the list lengths rather than literal 5 / 7 / 3, so
    # editing a category list cannot leave entries unreachable or index
    # past the end.
    route_idx = (i + seed_value) % len(routes)
    day_idx = (i + seed_value) % len(days)
    class_idx = (i * 2 + seed_value) % len(classes)

    # Always in the range 1..60.
    days_to_departure = 1 + ((i * 3 + seed_value) % 60)

    # Base price drops by 1.5 for every additional day before departure.
    base = 120 + (days_to_departure * -1.5)
    noise = rng.normal(0, 25)

    price_usd = round(
        base + route_adj[route_idx] + class_adj[class_idx] + noise,
        2
    )

    ticket_class = classes[class_idx]

    # Inject deterministic data issues
    if i % 28 == 0:
        # Missing price, represented as an empty string.
        price_usd = ""
    if i % 45 == 0 and price_usd != "":
        # Sign-flipped price (skipped when the price was already blanked).
        price_usd *= -1
    if i % 37 == 0:
        # Inconsistent casing of the class label.
        ticket_class = ticket_class.upper()

    tickets.append({
        "ticket_id": f"T{seed_value}-{i:04d}",
        "route": routes[route_idx],
        "day": days[day_idx],
        "days_to_departure": days_to_departure,
        "class": ticket_class,
        "price_usd": price_usd
    })

# Output
print("Total records:", len(tickets))
print("First five records:")
for t in tickets[:5]:
    print(t)

Total records: 320
First five records:
{'ticket_id': 'T1905-0001', 'route': 'LHR-JFK', 'day': 'Wed', 'days_to_departure': 49, 'class': 'business', 'price_usd': 505.65}
{'ticket_id': 'T1905-0002', 'route': 'SFO-SEA', 'day': 'Thu', 'days_to_departure': 52, 'class': 'premium', 'price_usd': 160.92}
{'ticket_id': 'T1905-0003', 'route': 'DXB-SIN', 'day': 'Fri', 'days_to_departure': 55, 'class': 'economy', 'price_usd': 193.35}
{'ticket_id': 'T1905-0004', 'route': 'MAD-ROM', 'day': 'Sat', 'days_to_departure': 58, 'class': 'business', 'price_usd': 348.44}
{'ticket_id': 'T1905-0005', 'route': 'NYC-LAX', 'day': 'Sun', 'days_to_departure': 1, 'class': 'premium', 'price_usd': 351.18}


In [4]:
# Cleaning pass over the raw tickets: rows with an unusable price are
# counted and dropped; surviving rows are copied with a lowercase class.
cleaned_tickets = []
invalid_count = 0

for record in tickets:
    amount = record["price_usd"]

    # A price is usable when it is not the blank marker, is an int/float,
    # and is not below zero (checked in that order, short-circuiting).
    usable = (
        amount != ""
        and isinstance(amount, (int, float))
        and not amount < 0
    )
    if not usable:
        invalid_count += 1
        continue

    # Build a fresh dict so the raw record keeps its original casing.
    cleaned_tickets.append({**record, "class": record["class"].lower()})

# Validation checks
print("Cleaned record count:", len(cleaned_tickets))

# Re-scan the kept rows to confirm no blank or negative price slipped through
kept_prices = [row["price_usd"] for row in cleaned_tickets]
has_invalid = any(p == "" or p < 0 for p in kept_prices)
print("Invalid prices remain:", has_invalid)

# Show two cleaned records
print("Sample cleaned records:")
for sample in cleaned_tickets[:2]:
    print(sample)

Cleaned record count: 302
Invalid prices remain: False
Sample cleaned records:
{'ticket_id': 'T1905-0001', 'route': 'LHR-JFK', 'day': 'Wed', 'days_to_departure': 49, 'class': 'business', 'price_usd': 505.65}
{'ticket_id': 'T1905-0002', 'route': 'SFO-SEA', 'day': 'Thu', 'days_to_departure': 52, 'class': 'premium', 'price_usd': 160.92}


In [5]:
import numpy as np

# Pull the columns needed for aggregation out of the cleaned records.
price_column = [rec["price_usd"] for rec in cleaned_tickets]
lead_time_column = [rec["days_to_departure"] for rec in cleaned_tickets]
day_column = [rec["day"] for rec in cleaned_tickets]

prices = np.array(price_column, dtype=float)
days_to_departure = np.array(lead_time_column, dtype=int)
days = np.array(day_column)

# Summary statistics over all cleaned prices.
price_mean = prices.mean()
price_std = prices.std()
total_revenue = prices.sum()

print("Mean price:", round(price_mean, 2))
print("Std dev price:", round(price_std, 2))

# Group by day label without a Python loop: `inv_idx` maps every ticket to
# the position of its day in `unique_days`, and bincount accumulates per
# position (weighted by price for revenue, unweighted for ticket counts).
unique_days, inv_idx = np.unique(days, return_inverse=True)

count_per_day = np.bincount(inv_idx)
revenue_per_day = np.bincount(inv_idx, weights=prices)

# Sanity check: the per-day revenues must add back up to the grand total.
daily_revenue_sum = revenue_per_day.sum()

print("Total revenue:", round(total_revenue, 2))
print("Daily revenue sums match total:", np.isclose(total_revenue, daily_revenue_sum))

# One line per day, in the order of `unique_days`.
for pos, label in enumerate(unique_days):
    print(label, "-> revenue:", round(revenue_per_day[pos], 2), "| tickets:", count_per_day[pos])

Mean price: 314.11
Std dev price: 114.87
Total revenue: 94859.99
Daily revenue sums match total: True
Fri -> revenue: 13936.18 | tickets: 45
Mon -> revenue: 14099.0 | tickets: 44
Sat -> revenue: 14345.8 | tickets: 45
Sun -> revenue: 14241.07 | tickets: 45
Thu -> revenue: 13652.83 | tickets: 45
Tue -> revenue: 10290.7 | tickets: 33
Wed -> revenue: 14294.41 | tickets: 45


In [6]:
import numpy as np

# Cut-off price: the 90th percentile of all cleaned prices.
threshold = np.percentile(prices, 90)

# Boolean selector for tickets priced at or above the cut-off, applied to
# an array view of the ticket dicts so the matching records come out together.
high_price_mask = prices >= threshold
ticket_array = np.array(cleaned_tickets)
high_price_tickets = ticket_array[high_price_mask]

# How many tickets were selected.
high_price_count = high_price_tickets.shape[0]

# Confirm every selected price really meets the cut-off.
selected_prices = prices[high_price_mask]
all_valid = (selected_prices >= threshold).all()

print("90th percentile threshold:", round(threshold, 2))
print("Number of high-price tickets:", high_price_count)
print("All selected prices >= threshold:", all_valid)

90th percentile threshold: 481.69
Number of high-price tickets: 31
All selected prices >= threshold: True


In [7]:
import numpy as np

# Final report cell. Rebuilds its working arrays from `cleaned_tickets` so it
# relies only on `tickets` and `cleaned_tickets` from earlier cells, then
# collects the headline numbers into `report` and prints them.
prices = np.array([t["price_usd"] for t in cleaned_tickets], dtype=float)
days = np.array([t["day"] for t in cleaned_tickets])

# Price statistics
price_mean = prices.mean()
price_std = prices.std()

# Revenue per day: `inv_idx` maps each ticket to its day's slot in
# `unique_days`, and bincount sums the prices per slot.
unique_days, inv_idx = np.unique(days, return_inverse=True)
revenue_per_day = np.bincount(inv_idx, weights=prices)

# High-price tickets (90th percentile)
threshold = np.percentile(prices, 90)
high_price_mask = prices >= threshold
# Summing a boolean mask yields a NumPy integer; convert to a plain int so
# the report holds native values (the standard json encoder does not accept
# NumPy integer scalars) and matches the int produced by len() elsewhere.
high_price_count = int(high_price_mask.sum())

# Build daily totals dictionary with native str keys and float values.
# Rounding happens before the float() conversion so the stored numbers are
# exactly the ones NumPy's rounding produced.
daily_totals = {
    str(day): float(round(rev, 2))
    for day, rev in zip(unique_days, revenue_per_day)
}

# Final report dictionary (native Python types only)
report = {
    "total_tickets": len(tickets),
    "cleaned_tickets": len(cleaned_tickets),
    "mean_price": float(round(price_mean, 2)),
    "std_price": float(round(price_std, 2)),
    "daily_totals": daily_totals,
    "high_price_count": high_price_count
}

# Print readable report
print("=== Flight Ticket Report ===")
print(f"Total tickets generated: {report['total_tickets']}")
print(f"Tickets after cleaning: {report['cleaned_tickets']}")
print(f"Mean ticket price: ${report['mean_price']}")
print(f"Price standard deviation: ${report['std_price']}")
print("Daily revenue totals:")
for day, total in report["daily_totals"].items():
    print(f"  {day}: ${total}")
print(f"Number of high-price tickets (90th percentile): {report['high_price_count']}")

# Explicit validation: the rounded per-day totals should still add up to the
# unrounded grand total within floating-point tolerance.
print("\nValidation: sum of daily totals equals total revenue:",
      np.isclose(sum(daily_totals.values()), prices.sum()))

=== Flight Ticket Report ===
Total tickets generated: 320
Tickets after cleaning: 302
Mean ticket price: $314.11
Price standard deviation: $114.87
Daily revenue totals:
  Fri: $13936.18
  Mon: $14099.0
  Sat: $14345.8
  Sun: $14241.07
  Thu: $13652.83
  Tue: $10290.7
  Wed: $14294.41
Number of high-price tickets (90th percentile): 31

Validation: sum of daily totals equals total revenue: True
