Task 1: Generate the raw dataset using fixed rules

In [53]:
import numpy as np

In [54]:
# --- Configuration: seed, sample size, and lookup tables for the generator ---
seed_value = 2106   # seeds the RNG and is embedded in every ticket id
n = 320             # number of raw ticket records to generate
rng = np.random.default_rng(seed_value)

# Categorical values; the generation loop picks one of each by modular index.
routes = "NYC-LAX LHR-JFK SFO-SEA DXB-SIN MAD-ROM".split()
days = "Mon Tue Wed Thu Fri Sat Sun".split()
classes = "economy premium business".split()

# Additive price adjustments: position k pairs with routes[k] / classes[k].
route_adjs = [140, 220, 60, 180, 80]
class_adjs = [0, 80, 220]

In [55]:
# Accumulator for the raw ticket records (one dict per ticket).
tickets = list()

In [56]:
# Rebuild the generator and the output list inside this cell so that re-running
# it reproduces the same records instead of appending duplicates drawn from an
# already-advanced random stream. On a fresh run this is identical to using the
# untouched generator created in the configuration cell.
rng = np.random.default_rng(seed_value)
tickets = []

for i in range(1, n + 1):
    ticket_id = f'T{seed_value}-{i:04d}'
    days_to_departure = 1 + ((i * 3 + seed_value) % 60)
    route_idx = (i + seed_value) % 5
    day_idx = (i + seed_value) % 7
    class_idx = (i * 2 + seed_value) % 3

    # Price: the base drops by 1.5 per day to departure, then the fixed route
    # and class adjustments and Gaussian noise (mean 0, std 25) are added.
    base = 120 + (days_to_departure * -1.5)
    noise = rng.normal(0, 25)
    route_adj = route_adjs[route_idx]
    class_adj = class_adjs[class_idx]
    price_usd = round(base + class_adj + route_adj + noise, 2)

    current_class = classes[class_idx]

    # Inject deterministic data issues. The branches are mutually exclusive and
    # checked in this order, so a record receives at most one defect.
    if i % 28 == 0:
        price_usd = ' '                         # blank (non-numeric) price
    elif i % 45 == 0:
        price_usd = price_usd * -1              # negative price
    elif i % 37 == 0:
        current_class = current_class.upper()   # inconsistent label casing

    # Generating ticket
    tickets.append({
        'ticket_id': ticket_id,
        'route': routes[route_idx],
        'day': days[day_idx],
        'days_to_departure': days_to_departure,
        'class': current_class,
        'price_usd': price_usd
    })

# Summary of the generated raw data
print(f'Total records generated: {len(tickets)}')
print('First five rows: ')
for t in tickets[:5]:
    print(t)



Total records generated: 320
First five rows: 
{'ticket_id': 'T2106-0001', 'route': 'SFO-SEA', 'day': 'Mon', 'days_to_departure': 10, 'class': 'business', 'price_usd': 423.03}
{'ticket_id': 'T2106-0002', 'route': 'DXB-SIN', 'day': 'Tue', 'days_to_departure': 13, 'class': 'premium', 'price_usd': 332.6}
{'ticket_id': 'T2106-0003', 'route': 'MAD-ROM', 'day': 'Wed', 'days_to_departure': 16, 'class': 'economy', 'price_usd': 187.68}
{'ticket_id': 'T2106-0004', 'route': 'NYC-LAX', 'day': 'Thu', 'days_to_departure': 19, 'class': 'business', 'price_usd': 481.31}
{'ticket_id': 'T2106-0005', 'route': 'LHR-JFK', 'day': 'Fri', 'days_to_departure': 22, 'class': 'premium', 'price_usd': 394.23}


Task 2: Clean the dataset (drop records with invalid prices, normalize class labels to lowercase)

In [57]:
def is_valid_price(price):
    """Return True when ``price`` is a finite, non-negative real number.

    Strings (including the blank placeholder), booleans, NaN, infinities and
    negative values are rejected. The same predicate is used both to filter
    the records and to verify the filtered result, so the two cannot disagree.
    """
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(price, bool) or not isinstance(price, (int, float)):
        return False
    # NaN is the only value that is not equal to itself.
    if price != price or price in (float('inf'), float('-inf')):
        return False
    return bool(price >= 0)


cleaned_tickets = []
for t in tickets:
    # Validation: drop the whole record when its price is unusable.
    if not is_valid_price(t['price_usd']):
        continue
    # Copy so the raw record keeps its original (possibly upper-case) label.
    clean_entry = t.copy()
    clean_entry['class'] = t['class'].lower()
    cleaned_tickets.append(clean_entry)

print(f'Cleaned records: {len(cleaned_tickets)}')

invalid_found = any(not is_valid_price(t['price_usd']) for t in cleaned_tickets)
print(f"Any invalid prices remaining?  {invalid_found}")
print('Sample cleaned records: ', cleaned_tickets[:2])

Cleaned records: 302
Any invalid prices remaining?  False
Sample cleaned records:  [{'ticket_id': 'T2106-0001', 'route': 'SFO-SEA', 'day': 'Mon', 'days_to_departure': 10, 'class': 'business', 'price_usd': 423.03}, {'ticket_id': 'T2106-0002', 'route': 'DXB-SIN', 'day': 'Tue', 'days_to_departure': 13, 'class': 'premium', 'price_usd': 332.6}]


Task 3: Compute price statistics and per-day revenue totals with NumPy

In [58]:
import numpy as np
# Column arrays over the cleaned records.
prices = np.array([t['price_usd'] for t in cleaned_tickets], dtype=np.float64)
# Deliberately not named `days`: that name holds the weekday lookup list that
# the generation cell indexes, and rebinding it here would corrupt any re-run
# of that cell.
ticket_days = np.array([t['day'] for t in cleaned_tickets], dtype=str)

mean_price = np.mean(prices)
std_price = np.std(prices)  # NumPy default ddof=0, i.e. population std

unique_days = ["Mon", 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Revenue per weekday, selected with a boolean mask over the day column.
daily_totals = {
    day: prices[ticket_days == day].sum()
    for day in unique_days
}

# Number of cleaned tickets per weekday.
daily_counts = {
    day: (ticket_days == day).sum()
    for day in unique_days
}

print("Daily totals:", daily_totals)

# Validation: the per-day totals must add back up to the overall revenue.
total_revenue = prices.sum()
sum_daily = sum(daily_totals.values())

print("Validation (totals match):", np.isclose(total_revenue, sum_daily))

Daily totals: {'Mon': np.float64(14023.299999999997), 'Tue': np.float64(13882.910000000002), 'Wed': np.float64(14016.230000000001), 'Thu': np.float64(14413.56), 'Fri': np.float64(14096.65), 'Sat': np.float64(13724.85), 'Sun': np.float64(10268.189999999999)}
Validation (totals match): True


Task 4: Identify high-price tickets (at or above the 90th percentile)

In [59]:
# Flag tickets priced at or above the 90th-percentile value of all prices.
threshold = np.percentile(prices, 90)
high_price_mask = np.greater_equal(prices, threshold)
high_price_count = high_price_mask.sum()

# Verification: every flagged price must sit at or above the threshold.
flagged_prices = prices[high_price_mask]
all_above = (flagged_prices >= threshold).all()

print(f'90th percentile threshold: {threshold:.2f}')
print(f'High-price count:  {high_price_count}')
print(f'Validation (All>=threshold): {all_above}')


90th percentile threshold: 480.95
High-price count:  31
Validation (All>=threshold): True


Task 5: Assemble and print the final pricing report

In [60]:
# Gather the headline figures computed by the earlier cells into one mapping.
final_report = dict(
    total_tickets=len(tickets),
    cleaned_tickets=len(cleaned_tickets),
    mean_price=round(mean_price, 2),
    std_price=round(std_price, 2),
    daily_totals=daily_totals,
    high_price_count=int(high_price_count),
)

print("--- AIRLINE PRICING ANALYSIS REPORT ---")

for field, entry in final_report.items():
    # Scalar fields print on one line with a title-cased label.
    if field != "daily_totals":
        print(f"{field.replace('_', ' ').title()}: {entry}")
        continue
    # The per-day mapping gets its own indented, currency-formatted section.
    print("\nDaily Totals:")
    for weekday, amount in entry.items():
        print(f"  {weekday}: ${amount:,.2f}")

pruned_count = len(tickets) - len(cleaned_tickets)
print("\n--- VALIDATION STATEMENT ---")
print(f"Data integrity confirmed: {pruned_count} records were pruned due to invalid prices.")

--- AIRLINE PRICING ANALYSIS REPORT ---
Total Tickets: 320
Cleaned Tickets: 302
Mean Price: 312.67
Std Price: 115.82

Daily Totals:
  Mon: $14,023.30
  Tue: $13,882.91
  Wed: $14,016.23
  Thu: $14,413.56
  Fri: $14,096.65
  Sat: $13,724.85
  Sun: $10,268.19
High Price Count: 31

--- VALIDATION STATEMENT ---
Data integrity confirmed: 18 records were pruned due to invalid prices.
