In [21]:
import numpy as np

In [23]:
# Seed for the NumPy random generator; the data-generation cell also embeds it
# in ticket IDs and uses it as an offset when deriving route/day/class indices.
seed_value = 1912

In [25]:
# Seeded Generator so the price noise drawn later is reproducible.
rng = np.random.default_rng(seed_value)

In [27]:
# Synthetic ticket generator: builds 320 records whose categorical fields are
# derived deterministically from the ticket number and seed_value, with a
# normally distributed noise term added to each price.
records = []
route = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"]
day = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
class_1 = ["economy", "premium", "business"]
# Price adjustments indexed in parallel with `route` and `class_1`.
# They never change inside the loop, so they are built once here.
route_adj_list = [140, 220, 60, 180, 80]
class_adj_list = [0, 80, 220]

for i in range(1, 321):
    ticket_id = f"T{seed_value}-{i:04d}"

    # Moduli follow the lookup-table lengths so an index can never run past
    # the end of its list.
    route_index = (i + seed_value) % len(route)
    day_ind = (i + seed_value) % len(day)
    class_ind = (i * 2 + seed_value) % len(class_1)
    days_to_departure = 1 + ((i * 3 + seed_value) % 60)

    # Base fare drops by 1.5 for every day of lead time.
    base = 120 + (days_to_departure * -1.5)
    route_adj = route_adj_list[route_index]
    class_adj = class_adj_list[class_ind]
    # Exactly one draw per ticket, in ticket order, keeps the sequence reproducible.
    noise = rng.normal(0, 25)
    price_usd = round(base + route_adj + class_adj + noise, 2)

    # Irregular values that the later cleaning cell filters out or normalises.
    if i % 28 == 0:
        price_usd = ""  # every 28th ticket has a blank price
    if i % 45 == 0 and isinstance(price_usd, (int, float)):
        price_usd *= -1  # every 45th ticket has a negative price (unless already blank)
    ticket_class = class_1[class_ind]
    if i % 37 == 0:
        ticket_class = ticket_class.upper()  # every 37th ticket has upper-case class

    records.append({
        "ticket_id": ticket_id,
        "route": route[route_index],
        "day": day[day_ind],
        "days_to_departure": days_to_departure,
        "class": ticket_class,
        "price_usd": price_usd,
    })

        

In [29]:
# Preview the first five generated records.
records[:5]

[{'ticket_id': 'T1912-0001',
  'route': 'DXB-SIN',
  'day': 'Wed',
  'days_to_departure': 56,
  'class': 'economy',
  'price_usd': 213.47},
 {'ticket_id': 'T1912-0002',
  'route': 'MAD-ROM',
  'day': 'Thu',
  'days_to_departure': 59,
  'class': 'business',
  'price_usd': 365.29},
 {'ticket_id': 'T1912-0003',
  'route': 'NYC-LAX',
  'day': 'Fri',
  'days_to_departure': 2,
  'class': 'premium',
  'price_usd': 364.11},
 {'ticket_id': 'T1912-0004',
  'route': 'LHR-JFK',
  'day': 'Sat',
  'days_to_departure': 5,
  'class': 'economy',
  'price_usd': 302.08},
 {'ticket_id': 'T1912-0005',
  'route': 'SFO-SEA',
  'day': 'Sun',
  'days_to_departure': 8,
  'class': 'business',
  'price_usd': 363.48}]

# Task 2

In [36]:
# Keep only tickets whose price is a positive number, and normalise the class
# label to lower case. Each kept ticket is a new dict, so `records` is untouched.
cleaned_tickets = [
    {**ticket, 'class': ticket['class'].lower()}
    for ticket in records
    if isinstance(ticket['price_usd'], (int, float)) and ticket['price_usd'] > 0
]

        
        

In [38]:
# Preview the first two cleaned tickets.
cleaned_tickets[:2]

[{'ticket_id': 'T1912-0001',
  'route': 'DXB-SIN',
  'day': 'Wed',
  'days_to_departure': 56,
  'class': 'economy',
  'price_usd': 213.47},
 {'ticket_id': 'T1912-0002',
  'route': 'MAD-ROM',
  'day': 'Thu',
  'days_to_departure': 59,
  'class': 'business',
  'price_usd': 365.29}]

In [40]:
# Sanity check: every cleaned ticket carries a numeric, strictly positive price.
all(
    isinstance(ticket['price_usd'], (int, float)) and ticket['price_usd'] > 0
    for ticket in cleaned_tickets
)

True

# Task 3

Create NumPy arrays for prices and days. Compute mean and standard deviation of prices. Compute total revenue per day and ticket counts per day using vectorized operations (no loops). Validate daily totals sum to overall total revenue.

In [61]:
# Parallel arrays over the cleaned tickets: element k of each array belongs to
# the same ticket, which is what the boolean-mask selections below rely on.
prices = np.array([ticket['price_usd'] for ticket in cleaned_tickets])
days = np.array([ticket['day'] for ticket in cleaned_tickets])

In [53]:
# Mean price over the cleaned tickets.
prices.mean()

np.float64(313.88894039735095)

In [55]:
# Standard deviation of the cleaned prices. ndarray.std defaults to ddof=0,
# i.e. the population formula rather than the sample formula.
prices.std()

np.float64(114.30684184068981)

In [71]:
# NOTE(review): this displays the entire array; a slice such as days[:10]
# would show the same pattern with far less output.
days

array(['Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu',
       'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat',
       'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon',
       'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu',
       'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Sat', 'Sun',
       'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon', 'Wed',
       'Thu', 'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri',
       'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun',
       'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon', 'Wed',
       'Thu', 'Fri', 'Sat', 'Sun', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat',
       'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon',
       'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon', 'Wed', 'Thu',
       'Fri', 'Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat',
       'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon',
       'Tue', 'Wed',

In [97]:
# Monday revenue: the boolean mask on `days` selects the matching prices.
# NOTE(review): the same expression is repeated in the following cells, once per
# day; a single grouped computation would remove the duplication.
mon_revenue = prices[days == 'Mon'].sum()

In [99]:
# Tuesday revenue via boolean mask on `days`.
tue_rev = prices[days == 'Tue'].sum()

In [101]:
# Wednesday revenue via boolean mask on `days`.
wed_rev = prices[days == 'Wed'].sum()

In [103]:
# Thursday revenue via boolean mask on `days`.
thu_rev = prices[days == 'Thu'].sum()

In [105]:
# Friday revenue via boolean mask on `days`.
fri_rev = prices[days == 'Fri'].sum()

In [107]:
# Saturday revenue via boolean mask on `days`.
sat_rev = prices[days == 'Sat'].sum()

In [109]:
# Sunday revenue via boolean mask on `days`.
sun_rev = prices[days == 'Sun'].sum()

In [111]:
# Overall revenue as the sum of the seven per-day totals.
total_rev = mon_revenue + tue_rev + wed_rev + thu_rev + fri_rev + sat_rev + sun_rev
# Validate against a direct sum over all prices. isclose is used because the
# two sums add the same numbers in a different order, which can differ in the
# last floating-point digits.
assert np.isclose(total_rev, prices.sum()), "daily totals do not add up to total revenue"

In [115]:
# Monday ticket count: number of True entries in the day mask.
mon_tickets = int(np.count_nonzero(days == 'Mon'))

In [119]:
# Ticket counts for the remaining days, each taken as the number of True
# entries in that day's mask.
tue_t = int(np.count_nonzero(days == 'Tue'))
wed_t = int(np.count_nonzero(days == 'Wed'))
thu_t = int(np.count_nonzero(days == 'Thu'))
fri_t = int(np.count_nonzero(days == 'Fri'))
sat_t = int(np.count_nonzero(days == 'Sat'))
sun_t = int(np.count_nonzero(days == 'Sun'))

In [121]:
# Per-day report. The rows are listed once and rendered through a single
# template instead of seven hand-copied f-strings; the printed text is the same
# as before (entries separated by a blank line, none after the last).
daily_rows = [
    ("Monday", mon_tickets, mon_revenue),
    ("Tuesday", tue_t, tue_rev),
    ("Wednesday", wed_t, wed_rev),
    ("Thursday", thu_t, thu_rev),
    ("Friday", fri_t, fri_rev),
    ("Saturday", sat_t, sat_rev),
    ("Sunday", sun_t, sun_rev),
]
print(
    "\n\n".join(
        f"{label} report: Ticket count: {count}\nRevenue: {revenue:.2f}"
        for label, count, revenue in daily_rows
    )
)

Monday report: Ticket count: 44
Revenue: 13923.86

Tuesday report: Ticket count: 33
Revenue: 10287.89

Wednesday report: Ticket count: 45
Revenue: 13888.13

Thursday report: Ticket count: 45
Revenue: 14133.56

Friday report: Ticket count: 45
Revenue: 14040.23

Saturday report: Ticket count: 45
Revenue: 14488.73

Sunday report: Ticket count: 45
Revenue: 14032.06


# Task 4
Define high-price tickets as above the 90th percentile of prices. Compute threshold and count. Verify all selected prices are >= threshold.

In [133]:
# 90th-percentile price. np.percentile interpolates linearly between the two
# nearest order statistics by default, so the threshold need not be an actual price.
threshold = np.percentile(prices, 90)

In [153]:
# Show the threshold rounded to two decimals.
f'{threshold:.2f}'

'472.18'

In [155]:
# Prices at or above the 90th-percentile threshold.
# NOTE(review): the task text says "above"; `>=` is kept because the
# verification step and the report label both use it. Confirm the intent.
high_90 = prices[prices >= threshold]

In [162]:
# Number of high-price tickets selected above.
len(high_90)

31

In [159]:
# Verify every selected price meets the threshold. np.all reduces the boolean
# array in one vectorised call; the builtin all() would iterate it element by
# element in Python. bool() keeps the displayed result a plain True/False.
bool(np.all(high_90 >= threshold))

True

# Task 5

In [168]:
# Per-day revenue totals keyed by day label.
# Iterating a set of strings has no stable order between kernel sessions, so
# the totals are built in calendar order from an explicit list; the keys are
# then plain `str` rather than NumPy string scalars.
weekdays = set(days)
calendar_order = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
ordered_days = [d for d in calendar_order if d in weekdays]
# Any label outside the calendar list is kept and placed after the known days.
ordered_days += sorted(str(d) for d in weekdays if d not in calendar_order)
daily_totals = {d: prices[days == d].sum() for d in ordered_days}

In [170]:
# Gather the headline figures into one dictionary for the final printout.
report = {
    "total_tickets": len(records),
    "cleaned_tickets": len(cleaned_tickets),
    "mean_price": np.mean(prices),
    "std_price": np.std(prices),
    "daily_totals": daily_totals,
    "high_price_count": high_90.size,
}

In [176]:
# Render the summary stored in `report`.
print("FINAL REPORT\n" + "="*50)
print(f"Total tickets: {report['total_tickets']}")
print(f"Cleaned tickets: {report['cleaned_tickets']}")
print(f"Mean price: {report['mean_price']:.2f}")
print(f"Standard deviation: {report['std_price']:.2f}")
print("\nDaily totals:")
# A loop variable at notebook top level is a global. Naming it `day` would
# overwrite the `day` list created in the data-generation cell, so distinct
# names are used here.
for day_name, day_total in report['daily_totals'].items():
    print(f"  {day_name}: ${day_total:.2f}")
print(f"\nNumber of high-price tickets (>= 90th percentile): {report['high_price_count']}")

FINAL REPORT
Total tickets: 320
Cleaned tickets: 302
Mean price: 313.89
Standard deviation: 114.31

Daily totals:
  Sun: $14032.06
  Thu: $14133.56
  Tue: $10287.89
  Mon: $13923.86
  Wed: $13888.13
  Sat: $14488.73
  Fri: $14040.23

Number of high-price tickets (>= 90th percentile): 31
