In [1]:
#TASK1
import numpy as np
seed_value = 402 

n = 320
rng = np.random.default_rng(seed_value)

routes = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"]
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
classes = ["economy", "premium", "business"]

tickets = []

for i in range(1, n + 1):
    route_index = (i + seed_value) % 5
    day_index = (i + seed_value) % 7
    class_index = (i * 2 + seed_value) % 3
    
    route = routes[route_index]
    day = days[day_index]
    travel_class = classes[class_index]
    
    days_to_departure = 1 + ((i * 3 + seed_value) % 60)

    base = 120 + (days_to_departure * -1.5)
    route_adj = [140, 220, 60, 180, 80][route_index]
    class_adj = [0, 80, 220][class_index]
    noise = rng.normal(0, 25)

    price_usd = round(base + route_adj + class_adj + noise, 2)


    if i % 28 == 0:
        price_usd = ""
    if i % 45 == 0:
        price_usd = price_usd * -1 if price_usd != "" else ""
    if i % 37 == 0:
        travel_class = travel_class.upper()

    ticket = {
        "ticket_id": f"T{seed_value}-{i:04d}",
        "route": route,
        "day": day,
        "days_to_departure": days_to_departure,
        "class": travel_class,
        "price_usd": price_usd
    }

    tickets.append(ticket)

print("Total records:", len(tickets))
print("First 5 records:")
for t in tickets[:5]:
    print(t)

Total records: 320
First 5 records:
{'ticket_id': 'T402-0001', 'route': 'DXB-SIN', 'day': 'Fri', 'days_to_departure': 46, 'class': 'business', 'price_usd': 456.42}
{'ticket_id': 'T402-0002', 'route': 'MAD-ROM', 'day': 'Sat', 'days_to_departure': 49, 'class': 'premium', 'price_usd': 178.55}
{'ticket_id': 'T402-0003', 'route': 'NYC-LAX', 'day': 'Sun', 'days_to_departure': 52, 'class': 'economy', 'price_usd': 128.38}
{'ticket_id': 'T402-0004', 'route': 'LHR-JFK', 'day': 'Mon', 'days_to_departure': 55, 'class': 'business', 'price_usd': 491.13}
{'ticket_id': 'T402-0005', 'route': 'SFO-SEA', 'day': 'Tue', 'days_to_departure': 58, 'class': 'premium', 'price_usd': 117.89}


In [2]:
#TASK2

cleaned_tickets = []

for t in tickets:
    price = t["price_usd"]

    # checking price valid
    if isinstance(price, (int, float)) and price >= 0:
        cleaned_ticket = t.copy()
        cleaned_ticket["class"] = cleaned_ticket["class"].lower()
        cleaned_tickets.append(cleaned_ticket)

print("Cleaned records:", len(cleaned_tickets))

# validation
invalid_found = any(
    not isinstance(t["price_usd"], (int, float)) or t["price_usd"] < 0
    for t in cleaned_tickets
)

print("Invalid prices remaining:", invalid_found)

print("Two cleaned records:")
print(cleaned_tickets[:2])

Cleaned records: 302
Invalid prices remaining: False
Two cleaned records:
[{'ticket_id': 'T402-0001', 'route': 'DXB-SIN', 'day': 'Fri', 'days_to_departure': 46, 'class': 'business', 'price_usd': 456.42}, {'ticket_id': 'T402-0002', 'route': 'MAD-ROM', 'day': 'Sat', 'days_to_departure': 49, 'class': 'premium', 'price_usd': 178.55}]


In [3]:
#TASK 3
prices = np.array([t["price_usd"] for t in cleaned_tickets])
days_array = np.array([t["day"] for t in cleaned_tickets])

mean_price = prices.mean()
std_price = prices.std()

print("Mean price:", mean_price)
print("Std price:", std_price)

unique_days, indices = np.unique(days_array, return_inverse=True)

daily_totals = np.zeros(len(unique_days))
ticket_counts = np.zeros(len(unique_days))

np.add.at(daily_totals, indices, prices)
np.add.at(ticket_counts, indices, 1)

daily_totals_dict = dict(zip(unique_days, daily_totals))
ticket_counts_dict = dict(zip(unique_days, ticket_counts))

print("Daily totals:", daily_totals_dict)
print("Ticket counts per day:", ticket_counts_dict)

# Validation
print(
    "Validation:",
    np.isclose(daily_totals.sum(), prices.sum())
)

Mean price: 317.0479801324503
Std price: 113.45116516681622
Daily totals: {np.str_('Fri'): np.float64(14444.010000000002), np.str_('Mon'): np.float64(14336.5), np.str_('Sat'): np.float64(14179.800000000001), np.str_('Sun'): np.float64(14191.46), np.str_('Thu'): np.float64(10594.520000000002), np.str_('Tue'): np.float64(13971.24), np.str_('Wed'): np.float64(14030.959999999997)}
Ticket counts per day: {np.str_('Fri'): np.float64(45.0), np.str_('Mon'): np.float64(45.0), np.str_('Sat'): np.float64(45.0), np.str_('Sun'): np.float64(45.0), np.str_('Thu'): np.float64(33.0), np.str_('Tue'): np.float64(45.0), np.str_('Wed'): np.float64(44.0)}
Validation: True


In [4]:
# TASK 4:

threshold = np.percentile(prices, 90)

high_price_mask = prices >= threshold
high_price_count = high_price_mask.sum()

print("90th percentile threshold:", threshold)
print("High price ticket count:", high_price_count)

# validation
print("All selected >= threshold:", np.all(prices[high_price_mask] >= threshold))

90th percentile threshold: 478.49000000000007
High price ticket count: 31
All selected >= threshold: True


In [5]:
#TASK 5

report = {
    "total_tickets": len(tickets),
    "cleaned_tickets": len(cleaned_tickets),
    "mean_price": float(mean_price),
    "std_price": float(std_price),
    "daily_totals": daily_totals_dict,
    "high_price_count": int(high_price_count)
}

print("\nFINAL REPORT")
for k, v in report.items():
    print(f"{k}: {v}")

print("\nValidation check: Daily totals equal overall revenue ->",
      np.isclose(sum(daily_totals_dict.values()), prices.sum()))


FINAL REPORT
total_tickets: 320
cleaned_tickets: 302
mean_price: 317.0479801324503
std_price: 113.45116516681622
daily_totals: {np.str_('Fri'): np.float64(14444.010000000002), np.str_('Mon'): np.float64(14336.5), np.str_('Sat'): np.float64(14179.800000000001), np.str_('Sun'): np.float64(14191.46), np.str_('Thu'): np.float64(10594.520000000002), np.str_('Tue'): np.float64(13971.24), np.str_('Wed'): np.float64(14030.959999999997)}
high_price_count: 31

Validation check: Daily totals equal overall revenue -> True
