In [22]:
import numpy as np
from pprint import pprint

n = 320

seed_value = 2111

rng = np.random.default_rng(seed = seed_value)

route = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"]
day = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
class_ = ["economy", "premium", "business"]

tickets = []

for i in range(1, n + 1):
    route_index = (i + seed_value) % 5
    class_index = (i * 2 + seed_value) % 3
    days_to_departure = 1 + ((i * 3 + seed_value) % 60)

    route_adj_list = [140, 220, 60, 180, 80]
    class_adj_list = [0, 80, 220]
    
    base = 120 + (days_to_departure * -1.5)
    route_adj = route_adj_list[route_index]
    class_adj = class_adj_list[class_index]
    noise = rng.normal(0, 25)

    price_usd = round(base + route_adj + class_adj + noise, 2)
    ticket_class = class_[class_index]

    if i % 28 == 0:
        price_usd = ""
    if i % 45 == 0 and price_usd != "":
        price_usd *= -1
    if i % 37 == 0:
        ticket_class = class_[class_index].upper()

    ticket = {
        "ticket_id": f"T{seed_value}-{i:04d}",
        "route": route[route_index],
        "day": day[(i + seed_value) % 7],
        "days_to_departure": days_to_departure,
        "class": class_[class_index],
        "price_usd": price_usd,
    }

    tickets.append(ticket)

print(len(tickets))
pprint(tickets[:5])
    

320
[{'class': 'premium',
  'day': 'Sat',
  'days_to_departure': 15,
  'price_usd': 250.2,
  'route': 'SFO-SEA',
  'ticket_id': 'T2111-0001'},
 {'class': 'economy',
  'day': 'Sun',
  'days_to_departure': 18,
  'price_usd': 261.69,
  'route': 'DXB-SIN',
  'ticket_id': 'T2111-0002'},
 {'class': 'business',
  'day': 'Mon',
  'days_to_departure': 21,
  'price_usd': 437.65,
  'route': 'MAD-ROM',
  'ticket_id': 'T2111-0003'},
 {'class': 'premium',
  'day': 'Tue',
  'days_to_departure': 24,
  'price_usd': 290.32,
  'route': 'NYC-LAX',
  'ticket_id': 'T2111-0004'},
 {'class': 'economy',
  'day': 'Wed',
  'days_to_departure': 27,
  'price_usd': 273.13,
  'route': 'LHR-JFK',
  'ticket_id': 'T2111-0005'}]


In [26]:
cleaned_tickets = []

for t in tickets:
    price = t["price_usd"]
    
    if price == "" or price < 0:
        continue
    
    t["class"] = t["class"].lower()
    
    cleaned_tickets.append(t)

print(len(tickets))
print(len(cleaned_tickets))
pprint(cleaned_tickets[:2])

320
302
[{'class': 'premium',
  'day': 'Sat',
  'days_to_departure': 15,
  'price_usd': 250.2,
  'route': 'SFO-SEA',
  'ticket_id': 'T2111-0001'},
 {'class': 'economy',
  'day': 'Sun',
  'days_to_departure': 18,
  'price_usd': 261.69,
  'route': 'DXB-SIN',
  'ticket_id': 'T2111-0002'}]


In [31]:
prices = []
days = []
for ticket in cleaned_tickets:
    prices.append(ticket["price_usd"])
    days.append(ticket["day"])

prices_array = np.array(prices)
mean_price = prices_array.mean()
std_price = prices_array.std()

print(mean_price)
print(std_price)

total_revenue_per_day = []
ticket_count_per_day = []

for d in day:
    total = 0
    count = 0
    for i in range(len(days)):
        if days[i] == d:
            total += prices[i]
            count += 1
    total_revenue_per_day.append(total)
    ticket_count_per_day.append(count)

print(total_revenue_per_day)
print(ticket_count_per_day)

302.68387417218537
112.05498016018474
[13925.02, 13613.869999999997, 13296.73, 13332.08, 10136.510000000002, 13427.85, 13678.470000000003]
[45, 45, 45, 44, 33, 45, 45]


In [35]:
threshold = np.percentile(prices, 0.9)
high_price_tickets = []
for t in cleaned_tickets:
    if t["price_usd"] >= threshold:
        high_price_tickets.append(t)

count_high_price = len(high_price_tickets)

pprint(high_price_tickets)
print(count_high_price)

[{'class': 'premium',
  'day': 'Sat',
  'days_to_departure': 15,
  'price_usd': 250.2,
  'route': 'SFO-SEA',
  'ticket_id': 'T2111-0001'},
 {'class': 'economy',
  'day': 'Sun',
  'days_to_departure': 18,
  'price_usd': 261.69,
  'route': 'DXB-SIN',
  'ticket_id': 'T2111-0002'},
 {'class': 'business',
  'day': 'Mon',
  'days_to_departure': 21,
  'price_usd': 437.65,
  'route': 'MAD-ROM',
  'ticket_id': 'T2111-0003'},
 {'class': 'premium',
  'day': 'Tue',
  'days_to_departure': 24,
  'price_usd': 290.32,
  'route': 'NYC-LAX',
  'ticket_id': 'T2111-0004'},
 {'class': 'economy',
  'day': 'Wed',
  'days_to_departure': 27,
  'price_usd': 273.13,
  'route': 'LHR-JFK',
  'ticket_id': 'T2111-0005'},
 {'class': 'business',
  'day': 'Thu',
  'days_to_departure': 30,
  'price_usd': 350.43,
  'route': 'SFO-SEA',
  'ticket_id': 'T2111-0006'},
 {'class': 'premium',
  'day': 'Fri',
  'days_to_departure': 33,
  'price_usd': 325.49,
  'route': 'DXB-SIN',
  'ticket_id': 'T2111-0007'},
 {'class': 'economy

In [40]:
report_dict = {
    "total_tickets" : (len(tickets)),
    "cleaned_tickets" : cleaned_tickets,
    "mean_price" : mean_price,
    "std_price" : std_price,
    "daily_totals" : ticket_count_per_day,
    "high_price_count" : (len(high_price_tickets))
}

pprint(report_dict)

{'cleaned_tickets': [{'class': 'premium',
                      'day': 'Sat',
                      'days_to_departure': 15,
                      'price_usd': 250.2,
                      'route': 'SFO-SEA',
                      'ticket_id': 'T2111-0001'},
                     {'class': 'economy',
                      'day': 'Sun',
                      'days_to_departure': 18,
                      'price_usd': 261.69,
                      'route': 'DXB-SIN',
                      'ticket_id': 'T2111-0002'},
                     {'class': 'business',
                      'day': 'Mon',
                      'days_to_departure': 21,
                      'price_usd': 437.65,
                      'route': 'MAD-ROM',
                      'ticket_id': 'T2111-0003'},
                     {'class': 'premium',
                      'day': 'Tue',
                      'days_to_departure': 24,
                      'price_usd': 290.32,
                      'route': 'NYC-LAX',
          