Task 1: Generate the raw dataset using fixed rules

In [1]:
import numpy as np
# --- Configuration ---------------------------------------------------------
seed_value = 1107
n = 320
rng = np.random.default_rng(seed_value)

# Records whose 1-based index is a multiple of these values get an
# irregular field (see the loop below).
MISSING_PRICE_EVERY = 28    # price becomes an empty string
NEGATIVE_PRICE_EVERY = 45   # price sign is flipped
UPPERCASE_CLASS_EVERY = 37  # class label is upper-cased

# --- Lookup tables ---------------------------------------------------------
routes = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"]
days_list = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
classes = ["economy", "premium", "business"]

# Price adjustments, index-aligned with `routes` and `classes` respectively.
route_adj_list = [140, 220, 60, 180, 80]
class_adj_list = [0, 80, 220]

# --- Record generation -----------------------------------------------------
tickets = []

for i in range(1, n + 1):
    # All categorical fields are derived deterministically from the index.
    offset = i + seed_value
    route_index = offset % len(routes)
    day_index = offset % len(days_list)
    class_index = (2 * i + seed_value) % len(classes)
    days_to_departure = (3 * i + seed_value) % 60 + 1

    # Draw the noise on every iteration, including the ones whose price is
    # blanked below, so the random stream stays aligned with the index.
    noise = rng.normal(0, 25)
    base = 120 - 1.5 * days_to_departure

    if i % MISSING_PRICE_EVERY == 0:
        price_usd = ""
    else:
        price_usd = round(
            base + route_adj_list[route_index] + class_adj_list[class_index] + noise,
            2,
        )
        if i % NEGATIVE_PRICE_EVERY == 0:
            price_usd = -price_usd

    label = classes[class_index]
    ticket_class = label.upper() if i % UPPERCASE_CLASS_EVERY == 0 else label

    tickets.append(
        {
            "ticket_id": f"T{seed_value}-{i:04d}",
            "route": routes[route_index],
            "day": days_list[day_index],
            "days_to_departure": days_to_departure,
            "class": ticket_class,
            "price_usd": price_usd,
        }
    )

# --- Quick look ------------------------------------------------------------
print("Total records:", len(tickets))
print("First 5 records:")
for record in tickets[:5]:
    print(record)

Total records: 320
First 5 records:
{'ticket_id': 'T1107-0001', 'route': 'DXB-SIN', 'day': 'Wed', 'days_to_departure': 31, 'class': 'business', 'price_usd': 504.63}
{'ticket_id': 'T1107-0002', 'route': 'MAD-ROM', 'day': 'Thu', 'days_to_departure': 34, 'class': 'premium', 'price_usd': 226.83}
{'ticket_id': 'T1107-0003', 'route': 'NYC-LAX', 'day': 'Fri', 'days_to_departure': 37, 'class': 'economy', 'price_usd': 185.64}
{'ticket_id': 'T1107-0004', 'route': 'LHR-JFK', 'day': 'Sat', 'days_to_departure': 40, 'class': 'business', 'price_usd': 484.33}
{'ticket_id': 'T1107-0005', 'route': 'SFO-SEA', 'day': 'Sun', 'days_to_departure': 43, 'class': 'premium', 'price_usd': 170.05}


Task 2: Validate and clean records with core Python

In [6]:
def is_valid_price(price):
    """Return True when ``price`` is usable as a ticket price.

    A usable price is an ``int`` or ``float`` that is finite and not
    negative. Empty strings (and any other non-numeric value) are rejected,
    as are ``bool`` values (``bool`` is a subclass of ``int``) and NaN/inf,
    which would otherwise slip through a plain ``price < 0`` test.
    """
    if isinstance(price, bool) or not isinstance(price, (int, float)):
        return False
    # The chained comparison is False for NaN, +/-inf and negative numbers.
    return 0 <= price < float("inf")


cleaned_tickets = []

for t in tickets:
    if not is_valid_price(t["price_usd"]):
        continue

    # Work on a copy so the raw record in `tickets` is left untouched.
    clean_entry = t.copy()
    clean_entry["class"] = clean_entry["class"].lower()

    cleaned_tickets.append(clean_entry)

print(f"Total Records After Cleaning: {len(cleaned_tickets)}")
invalid_check = [t for t in cleaned_tickets if not is_valid_price(t["price_usd"])]
print(f"Remaining Invalid Prices: {len(invalid_check)}")
print("Two Cleaned Records:", cleaned_tickets[:2])

# Fail loudly if the cleaning invariants do not hold.
assert not invalid_check, "cleaning left invalid prices behind"
assert all(t["class"] == t["class"].lower() for t in cleaned_tickets)

Total Records After Cleaning: 302
Remaining Invalid Prices: 0
Two Cleaned Records: [{'ticket_id': 'T1107-0001', 'route': 'DXB-SIN', 'day': 'Wed', 'days_to_departure': 31, 'class': 'business', 'price_usd': 504.63}, {'ticket_id': 'T1107-0002', 'route': 'MAD-ROM', 'day': 'Thu', 'days_to_departure': 34, 'class': 'premium', 'price_usd': 226.83}]


Task 3: Convert to NumPy for analysis


In [8]:
# Parallel arrays: prices[k] and days[k] describe the same cleaned ticket.
# dtype=float makes a stray non-numeric price raise here instead of silently
# turning the whole array into strings.
prices = np.array([t["price_usd"] for t in cleaned_tickets], dtype=float)
days = np.array([t["day"] for t in cleaned_tickets])

mean_prices = prices.mean()
std_prices = prices.std()
print("Mean price:", mean_prices)
print("Std price:", std_prices)

# np.unique returns the day labels in sorted (alphabetical) order;
# day_indices[k] is the position of days[k] within unique_days.
unique_days, day_indices = np.unique(days, return_inverse=True)

# Per-day aggregates, aligned with unique_days.
ticket_counts_per_day = np.bincount(day_indices)
total_revenue_per_day = np.bincount(day_indices, weights=prices)

# Cross-check the grouped figures against the ungrouped totals.
total_revenue = prices.sum()
daily_sum = total_revenue_per_day.sum()
assert np.isclose(daily_sum, total_revenue), "per-day revenue does not add up to total revenue"
assert ticket_counts_per_day.sum() == prices.size, "per-day counts do not add up to ticket count"

print("Days:", unique_days)
print("Counts:", ticket_counts_per_day)
print("Revenue:", total_revenue_per_day)

Mean price: 314.04738410596025
Std price: 114.36429809278958
Days: ['Fri' 'Mon' 'Sat' 'Sun' 'Thu' 'Tue' 'Wed']
Counts: [45 44 45 45 45 33 45]
Revenue: [13957.62 13971.04 14595.35 14187.54 13899.34 10014.96 14216.46]


In [4]:
# High-price tickets are those priced at or above the 90th percentile.
threshold = np.percentile(prices, 90)
high_price_mask = np.greater_equal(prices, threshold)
high_price_count = high_price_mask.sum()

print("90th percentile threshold:", threshold)
print("Number of high-price tickets:", high_price_count)

# Sanity check: every selected price meets the threshold.
assert (prices[high_price_mask] >= threshold).all()

90th percentile threshold: 482.04100000000005
Number of high-price tickets: 31


Task 5: Produce a final report

In [5]:
# Verify the aggregates before reporting so an inconsistent report is never printed.
assert np.isclose(total_revenue_per_day.sum(), prices.sum())

# Convert NumPy scalars to built-in types so the report prints plain values
# (e.g. 'Fri': 13957.62) rather than reprs such as np.float64(13957.619999999995).
# Prices are rounded to cents upstream, so rounding the sums to 2 decimals
# only strips floating-point accumulation noise.
daily_totals = {
    str(day): round(float(total), 2)
    for day, total in zip(unique_days, total_revenue_per_day)
}

report = {
    "total_tickets": len(tickets),
    "cleaned_tickets": len(cleaned_tickets),
    "mean_price": float(mean_prices),
    "std_price": float(std_prices),
    "daily_totals": daily_totals,
    "high_price_count": int(high_price_count),
}

print("Flight Tickets Report:")
print("=====================")
for key, value in report.items():
    print(f"{key}: {value}")

Flight Tickets Report:
total_tickets: 320
cleaned_tickets: 302
mean_price: 314.04738410596025
std_price: 114.36429809278958
daily_totals: {np.str_('Fri'): np.float64(13957.619999999995), np.str_('Mon'): np.float64(13971.039999999999), np.str_('Sat'): np.float64(14595.35), np.str_('Sun'): np.float64(14187.539999999997), np.str_('Thu'): np.float64(13899.339999999997), np.str_('Tue'): np.float64(10014.960000000001), np.str_('Wed'): np.float64(14216.460000000003)}
high_price_count: 31
