In [41]:
import numpy as np


# Task 1

In [42]:
# Number of synthetic tickets to generate, and the seed that pins the data set.
n, seed_value = 320, 2702

# NumPy random generator seeded from `seed_value` so draws are repeatable.
rng = np.random.default_rng(seed=seed_value)

In [43]:
def generateRawDataSet(n, seed=None, rng=None):
    """Build ``n`` synthetic flight-ticket records, some deliberately dirty.

    Every field except the price noise is a deterministic function of the
    1-based ticket index ``i`` and the seed. Three kinds of dirty rows are
    injected on purpose:

    * every 28th ticket has an empty-string price (missing value),
    * every 45th ticket has its price negated,
    * every 37th ticket has its class label upper-cased.

    Parameters
    ----------
    n : int
        Number of records to generate (indices run from 1 to ``n``).
    seed : int, optional
        Seed used in the ticket id, the index arithmetic and, when ``rng`` is
        not given, the noise generator. Defaults to the notebook-level
        ``seed_value``.
    rng : numpy.random.Generator, optional
        Source of the price noise. When omitted, a fresh generator is created
        from ``seed`` on every call, so repeated calls return identical data
        instead of consuming a shared global random stream.

    Returns
    -------
    list of dict
        One dict per ticket with keys ``ticket_id``, ``route``, ``day``,
        ``days_to_departure``, ``class`` and ``price_usd``.
    """
    if seed is None:
        seed = seed_value  # notebook-level default from the configuration cell
    if rng is None:
        # Fresh generator per call: re-running the calling cell reproduces the
        # exact same data set rather than continuing a shared random stream.
        rng = np.random.default_rng(seed)

    route_categories = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"]
    days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    class_categories = ["economy", "premium", "business"]
    # Price adjustments, aligned index-for-index with the category lists above.
    route_adj = [140, 220, 60, 180, 80]
    class_adj = [0, 80, 220]

    raw_data = []
    for i in range(1, n + 1):
        route_idx = (i + seed) % len(route_categories)
        class_idx = (i * 2 + seed) % len(class_categories)

        days_to_departure = 1 + ((i * 3 + seed) % 60)
        class_d = class_categories[class_idx]

        # Base fare falls by 1.5 USD for each day remaining before departure.
        base = 120 + (days_to_departure * -1.5)
        noise = rng.normal(0, 25)
        price_usd = round(base + route_adj[route_idx] + class_adj[class_idx] + noise, 2)

        if i % 28 == 0:
            price_usd = ""  # missing price
        elif i % 45 == 0:
            # Only negate a numeric price; a missing price stays missing.
            price_usd = -price_usd
        if i % 37 == 0:
            class_d = class_d.upper()  # inconsistent casing

        raw_data.append({
            "ticket_id": f"T{seed}-{i:04d}",
            "route": route_categories[route_idx],
            "day": days[(i + seed) % len(days)],
            "days_to_departure": days_to_departure,
            "class": class_d,
            "price_usd": price_usd,
        })
    return raw_data

In [44]:
# Generate the synthetic ticket records; later cells read `raw_data`.
raw_data = generateRawDataSet(n)
# Display the record count and the first five records rather than every row.
len(raw_data), raw_data[:5]

(320,
 [{'ticket_id': 'T2702-0001',
   'route': 'DXB-SIN',
   'day': 'Tue',
   'days_to_departure': 6,
   'class': 'premium',
   'price_usd': 381.28},
  {'ticket_id': 'T2702-0002',
   'route': 'MAD-ROM',
   'day': 'Wed',
   'days_to_departure': 9,
   'class': 'economy',
   'price_usd': 195.71},
  {'ticket_id': 'T2702-0003',
   'route': 'NYC-LAX',
   'day': 'Thu',
   'days_to_departure': 12,
   'class': 'business',
   'price_usd': 412.59},
  {'ticket_id': 'T2702-0004',
   'route': 'LHR-JFK',
   'day': 'Fri',
   'days_to_departure': 15,
   'class': 'premium',
   'price_usd': 453.83},
  {'ticket_id': 'T2702-0005',
   'route': 'SFO-SEA',
   'day': 'Sat',
   'days_to_departure': 18,
   'class': 'economy',
   'price_usd': 209.47}])

# Task 2

In [45]:
# Keep only tickets with a usable price and lower-case the class label.
cleaned_tickets = []
for record in raw_data:
    amount = record["price_usd"]
    is_number = isinstance(amount, (float, int))

    # Skip rows whose price is blank, non-numeric, or negative.
    if amount == "" or not is_number or amount < 0:
        continue

    # Copy the record so the raw data stays untouched.
    cleaned_tickets.append(
        {**record, "class": record["class"].lower(), "price_usd": amount}
    )

len(cleaned_tickets), cleaned_tickets[:2]

(302,
 [{'ticket_id': 'T2702-0001',
   'route': 'DXB-SIN',
   'day': 'Tue',
   'days_to_departure': 6,
   'class': 'premium',
   'price_usd': 381.28},
  {'ticket_id': 'T2702-0002',
   'route': 'MAD-ROM',
   'day': 'Wed',
   'days_to_departure': 9,
   'class': 'economy',
   'price_usd': 195.71}])

# Task 3

In [46]:
# Column arrays pulled out of the cleaned ticket records.
prices = np.array([t["price_usd"] for t in cleaned_tickets], dtype=np.float64)
days = np.array([t["day"] for t in cleaned_tickets], dtype=str)

# Fixed weekday order used for every per-day aggregate below.
day_sequence = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

# Overall price statistics (NumPy's default std, i.e. ddof=0).
prices_mean = prices.mean()
prices_std = prices.std()

# One boolean mask per weekday, shared by the count and revenue aggregates.
day_masks = [days == d for d in day_sequence]
total_count = np.array([m.sum() for m in day_masks])
daily_revenue = np.array([prices[m].sum() for m in day_masks])


In [47]:
# Sanity check: the per-day revenues must add back up to the overall revenue.
overall_revenue = prices.sum()
sum_of_daily_revenues = daily_revenue.sum()

# Tolerance-based comparison, since the two sums add the floats in different orders.
is_valid = np.isclose(overall_revenue, sum_of_daily_revenues)
is_valid

np.True_

# Task 4

In [48]:
# Select the tickets priced at or above the 90th percentile of cleaned prices.
threshold = np.percentile(prices, q=90)
mask = threshold <= prices
price_outliers = prices[mask]
price_outliers.size, price_outliers[:2]

(31, array([505.86, 489.24]))

# Task 5

In [49]:
# Structured summary of the analysis. It has its own name so the dict is still
# available after the printable text is built; previously both the dict and
# the text were bound to `report`, which discarded the dict.
report_data = {
    "total_tickets": len(raw_data),
    "cleaned_tickets": len(cleaned_tickets),
    # Reuse the statistics computed in the Task 3 cell instead of recomputing
    # them; float() stores plain Python numbers rather than NumPy scalars.
    "mean_price": float(prices_mean),
    "std_price": float(prices_std),
    "daily_totals": {day: float(rev) for day, rev in zip(day_sequence, daily_revenue)},
    "high_price_counts": len(price_outliers),
}

# Per-day revenue rendered as "Mon: $1,234.56, Tue: ..." in weekday order.
daily_breakdown = ", ".join(
    f"{day}: ${rev:,.2f}" for day, rev in report_data["daily_totals"].items()
)

# `report` remains the printable text, as it was after this cell ran before.
report = f"""
   "Validation": {is_valid} ,
   "Total Tickets": {report_data["total_tickets"]} ,
   "Cleaned Tickets": {report_data["cleaned_tickets"]} ,
   "Mean_Price": {report_data["mean_price"]} ,
   "Std_Price": {report_data["std_price"]} ,
   "Daily Total Revenue": {{ {daily_breakdown} }},
   "High_price_count" : {report_data["high_price_counts"]}
"""
print(report)


   "Validation": True ,
   "Total Tickets": 320 ,
   "Cleaned Tickets": 302 ,
   "Mean_Price": 305.13129139072845 ,
   "Std_Price": 114.46846424500704 ,
   "Daily Total Revenue": { Mon: $10,100.96, Tue: $14,083.72, Wed: $13,301.32, Thu: $13,579.83, Fri: $13,765.79, Sat: $13,673.11, Sun: $13,644.92 },
   "High_price_count" : 31

