task 1

In [1]:
import numpy as np

# One seed drives both the deterministic index offsets and the random generator.
seed_value = 111
n = 320
rng = np.random.default_rng(seed_value)

# Lookup tables. route_adjs and class_adjs are read with the same index as
# routes and classes respectively.
routes = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"]
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
classes = ["economy", "premium", "business"]
route_adjs = [140, 220, 60, 180, 80]
class_adjs = [0, 80, 220]

tickets = []

for ticket_no in range(1, n + 1):
    # Every categorical field is a modular function of the ticket number and seed.
    offset = ticket_no + seed_value
    route_pos = offset % 5
    day_pos = offset % 7
    class_pos = (2 * ticket_no + seed_value) % 3
    lead_days = (3 * ticket_no + seed_value) % 60 + 1

    # Price = base fare that falls by 1.5 per day of lead time, plus the route
    # and class adjustments, plus normal noise (mean 0, sd 25). Exactly one
    # draw is taken from rng per ticket.
    base_fare = 120 + (lead_days * -1.5)
    jitter = rng.normal(0, 25)
    fare = round(base_fare + route_adjs[route_pos] + class_adjs[class_pos] + jitter, 2)

    cabin = classes[class_pos]

    # Periodic irregularities: every 28th ticket gets an empty-string price,
    # every 45th has its price multiplied by -1, every 37th has its class
    # upper-cased.
    if ticket_no % 28 == 0:
        fare = ""
    if ticket_no % 45 == 0:
        fare *= -1
    if ticket_no % 37 == 0:
        cabin = cabin.upper()

    record = {
        "ticket_id": f"T{seed_value}-{ticket_no:04d}",
        "route": routes[route_pos],
        "day": days[day_pos],
        "days_to_departure": lead_days,
        "class": cabin,
        "price_usd": fare,
    }
    tickets.append(record)

print("Total record count: ", len(tickets))
print("First five records: ", tickets[:5])

Total record count:  320
First five records:  [{'ticket_id': 'T111-0001', 'route': 'SFO-SEA', 'day': 'Mon', 'days_to_departure': 55, 'class': 'business', 'price_usd': 309.81}, {'ticket_id': 'T111-0002', 'route': 'DXB-SIN', 'day': 'Tue', 'days_to_departure': 58, 'class': 'premium', 'price_usd': 272.03}, {'ticket_id': 'T111-0003', 'route': 'MAD-ROM', 'day': 'Wed', 'days_to_departure': 1, 'class': 'economy', 'price_usd': 201.64}, {'ticket_id': 'T111-0004', 'route': 'NYC-LAX', 'day': 'Thu', 'days_to_departure': 4, 'class': 'business', 'price_usd': 457.48}, {'ticket_id': 'T111-0005', 'route': 'LHR-JFK', 'day': 'Fri', 'days_to_departure': 7, 'class': 'premium', 'price_usd': 413.46}]


task 2

In [2]:
# Keep only tickets whose price is an int/float that is >= 0. Each kept ticket
# is a new dict with its class lower-cased, so the dicts in `tickets` are
# not modified.
cleaned_tickets = [
    {**ticket, "class": ticket["class"].lower()}
    for ticket in tickets
    if isinstance(ticket["price_usd"], (int, float)) and ticket["price_usd"] >= 0
]

print("Original records: :", len(tickets))
print("Cleaned records count: : ",len(cleaned_tickets))

# Re-scan the cleaned list for any non-numeric or negative price.
invalid_remaining = [
    ticket
    for ticket in cleaned_tickets
    if not isinstance(ticket["price_usd"], (int, float)) or ticket["price_usd"] < 0
]

print("Invalid prices remaining: ",len(invalid_remaining))
print("Two cleaned records:",cleaned_tickets[:2])



Original records: : 320
Cleaned records count: :  302
Invalid prices remaining:  0
Two cleaned records: [{'ticket_id': 'T111-0001', 'route': 'SFO-SEA', 'day': 'Mon', 'days_to_departure': 55, 'class': 'business', 'price_usd': 309.81}, {'ticket_id': 'T111-0002', 'route': 'DXB-SIN', 'day': 'Tue', 'days_to_departure': 58, 'class': 'premium', 'price_usd': 272.03}]


task 3

In [3]:
# Column-style views of the cleaned records: one array of prices and one of
# day labels, in the same record order.
prices = np.array([rec["price_usd"] for rec in cleaned_tickets])
days_arr = np.array([rec["day"] for rec in cleaned_tickets])

# NumPy's default standard deviation (ddof=0).
mean_price = prices.mean()
std_price = prices.std()

print("Mean Price: ",mean_price)
print("Standard Deviation: ", std_price)

unique_days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

# Per-day revenue and ticket count, computed from a boolean mask per day label.
daily_totals = {}
daily_counts = {}
for day_name in unique_days:
    on_day = days_arr == day_name
    daily_totals[day_name] = np.sum(prices[on_day])
    daily_counts[day_name] = np.sum(on_day)

# Cross-check: the per-day totals should add up to the overall revenue.
total_revenue = np.sum(prices)
sum_of_daily = sum(daily_totals.values())

print("Total Revenue: ",total_revenue)
print("Sum of Daily Totals: ",sum_of_daily)
print("Validation: ",np.isclose(total_revenue, sum_of_daily))

print("Ticket counts per day: ",daily_counts)

Mean Price:  313.2776490066225
Standard Deviation:  114.9605664442728
Total Revenue:  94609.84999999999
Sum of Daily Totals:  94609.85
Validation:  True
Ticket counts per day:  {'Mon': np.int64(45), 'Tue': np.int64(45), 'Wed': np.int64(45), 'Thu': np.int64(45), 'Fri': np.int64(45), 'Sat': np.int64(44), 'Sun': np.int64(33)}


task 4

In [4]:
# Cut-off at the 0.9 quantile of the cleaned prices.
threshold = np.quantile(prices, 0.9)

# Select every price at or above the cut-off, then re-check the selection.
at_or_above = prices >= threshold
high_value_tickets = prices[at_or_above]
count = len(high_value_tickets)
verification = np.all(high_value_tickets >= threshold)

print("90th percentile: ",threshold)
print("high value tickets: ",count)
print("verification of all selected prices are higher than threshold: ",verification)

90th percentile:  471.238
high value tickets:  31
verification of all selected prices are higher than threshold:  True


task 5

In [5]:
# Collect the headline figures computed in the earlier cells into one dict.
report = dict(
    total_tickets=len(tickets),
    cleaned_tickets=len(cleaned_tickets),
    mean_price=mean_price,
    std_price=std_price,
    daily_totals=daily_totals,
    high_price_count=count,
)

print(f"""
Total Records Processed:     {report['total_tickets']}
Cleaned Records:             {report['cleaned_tickets']}
Average Ticket Price:        {report['mean_price']}
Price Standard Deviation:    {report['std_price']}
High-Value Tickets:          {report['high_price_count']}
Daily Revenue Totals:        {report['daily_totals']}
""")

# Final consistency check: overall revenue versus the sum of the per-day
# totals stored in the report.
total_rev = np.sum(prices)
daily_sum = sum(report['daily_totals'].values())

print("Total Revenue: ", total_rev)
print("Sum of Daily: ", daily_sum)

outcome = (
    "validation succesfull"
    if np.isclose(total_rev, daily_sum)
    else "validation is not successfull"
)
print(outcome)



Total Records Processed:     320
Cleaned Records:             302
Average Ticket Price:        313.2776490066225
Price Standard Deviation:    114.9605664442728
High-Value Tickets:          31
Daily Revenue Totals:        {'Mon': np.float64(13991.650000000003), 'Tue': np.float64(14202.15), 'Wed': np.float64(13862.350000000002), 'Thu': np.float64(14447.59), 'Fri': np.float64(14165.060000000001), 'Sat': np.float64(13601.5), 'Sun': np.float64(10339.550000000001)}

Total Revenue:  94609.84999999999
Sum of Daily:  94609.85
validation succesfull
