# Task 1: Generate the raw dataset using fixed rules

In [1]:
import numpy as np

In [2]:
# Run-wide settings: seed_value feeds both the deterministic index formulas
# and the noise generator; n is the number of raw tickets to generate.
seed_value, n = 2712, 320
rng = np.random.default_rng(seed=seed_value)

In [3]:
# Each route / fare class is declared next to the amount it adds to the
# ticket price, so a label and its adjustment cannot drift apart.
route_adjustments = {
    "NYC-LAX": 140,
    "LHR-JFK": 220,
    "SFO-SEA": 60,
    "DXB-SIN": 180,
    "MAD-ROM": 80,
}
class_adjustments = {
    "economy": 0,
    "premium": 80,
    "business": 220,
}

# Parallel lists used for index-based lookups; dicts preserve insertion
# order, so position k in a label list matches position k in its *_adjs list.
routes = list(route_adjustments)
days = "Mon Tue Wed Thu Fri Sat Sun".split()
classes = list(class_adjustments)
route_adjs = list(route_adjustments.values())
class_adjs = list(class_adjustments.values())

In [4]:
# Build the raw, intentionally dirty ticket records.
#
# Route, day, class and days_to_departure are deterministic functions of the
# record index and seed_value; only the price noise is random.
#
# The generator is re-created here so the cell is idempotent: re-running it
# on its own reproduces exactly the prices of a fresh Restart & Run All
# instead of continuing from an already-advanced random stream.
rng = np.random.default_rng(seed_value)

tickets = []

for i in range(1, n + 1):
    ticket_id = f"T{seed_value}-{i:04d}"

    # Cycle through the lookup lists; the moduli follow the list lengths so
    # the indices stay in range if a list is ever extended.
    route_idx = (i + seed_value) % len(routes)
    route = routes[route_idx]

    day_idx = (i + seed_value) % len(days)
    day = days[day_idx]

    days_to_departure = 1 + ((i * 3 + seed_value) % 60)  # always in 1..60

    class_idx = (i * 2 + seed_value) % len(classes)
    ticket_class = classes[class_idx]

    # Price model: base fare falls by 1.5 per day until departure, plus the
    # route and class adjustments, plus Gaussian noise (mean 0, std 25).
    base = 120 + (days_to_departure * -1.5)
    route_adj = route_adjs[route_idx]
    class_adj = class_adjs[class_idx]
    noise = rng.normal(0, 25)
    price_usd = round(base + route_adj + class_adj + noise, 2)

    # Injected data-quality problems for the cleaning step to handle.
    if i % 28 == 0:
        price_usd = ""          # every 28th record: missing price
    elif i % 45 == 0:
        price_usd *= -1         # every 45th record (with a price): sign flipped
    if i % 37 == 0:
        ticket_class = ticket_class.upper()  # every 37th record: inconsistent casing

    tickets.append({
        'ticket_id': ticket_id,
        'route': route,
        'day': day,
        'days_to_departure': days_to_departure,
        'class': ticket_class,
        'price_usd': price_usd,
    })

print(f"Total record count: {len(tickets)}")
print("First 5 records:\n")
for record in tickets[:5]:
    print(record)

Total record count: 320
First 5 records:

{'ticket_id': 'T2712-0001', 'route': 'DXB-SIN', 'day': 'Fri', 'days_to_departure': 16, 'class': 'business', 'price_usd': 476.49}
{'ticket_id': 'T2712-0002', 'route': 'MAD-ROM', 'day': 'Sat', 'days_to_departure': 19, 'class': 'premium', 'price_usd': 279.79}
{'ticket_id': 'T2712-0003', 'route': 'NYC-LAX', 'day': 'Sun', 'days_to_departure': 22, 'class': 'economy', 'price_usd': 243.33}
{'ticket_id': 'T2712-0004', 'route': 'LHR-JFK', 'day': 'Mon', 'days_to_departure': 25, 'class': 'business', 'price_usd': 538.71}
{'ticket_id': 'T2712-0005', 'route': 'SFO-SEA', 'day': 'Tue', 'days_to_departure': 28, 'class': 'premium', 'price_usd': 251.56}


# Task 2: Validate and clean records with core Python

In [5]:
def _has_valid_price(price):
    """Return True when price is an int/float that is not negative."""
    return isinstance(price, (int, float)) and price >= 0


# Keep only records with a usable price. Each kept record is a fresh dict
# (the raw ticket is left untouched) with its class label lower-cased.
cleaned_tickets = [
    {**raw_ticket, 'class': raw_ticket['class'].lower()}
    for raw_ticket in tickets
    if _has_valid_price(raw_ticket['price_usd'])
]

# Sanity check: re-scan the cleaned list for anything that should have been dropped.
invalid_remaining = [
    record for record in cleaned_tickets
    if not _has_valid_price(record['price_usd'])
]

print(f"Cleaned record count: {len(cleaned_tickets)}")
print(f"Verification - Invalid prices remaining: {len(invalid_remaining)}")
print("Two cleaned records:\n")
for record in cleaned_tickets[:2]:
    print(record)

Cleaned record count: 302
Verification - Invalid prices remaining: 0
Two cleaned records:

{'ticket_id': 'T2712-0001', 'route': 'DXB-SIN', 'day': 'Fri', 'days_to_departure': 16, 'class': 'business', 'price_usd': 476.49}
{'ticket_id': 'T2712-0002', 'route': 'MAD-ROM', 'day': 'Sat', 'days_to_departure': 19, 'class': 'premium', 'price_usd': 279.79}


# Task 3: Convert to NumPy for analysis

In [6]:
# Column-style NumPy arrays built from the cleaned records.
# The per-ticket day labels get their own name (ticket_days) so the `days`
# weekday list defined in the setup cell is not overwritten; the generation
# cell indexes into that list and would misbehave if re-run after a rebind.
prices = np.array([ticket['price_usd'] for ticket in cleaned_tickets])
ticket_days = np.array([ticket['day'] for ticket in cleaned_tickets])

mean_price = np.mean(prices)
std_price = np.std(prices)  # population standard deviation (NumPy default, ddof=0)

unique_days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

# One boolean mask per weekday selects that day's tickets.
daily_revenue = np.array([np.sum(prices[ticket_days == d]) for d in unique_days])
daily_counts = np.array([np.sum(ticket_days == d) for d in unique_days])

# Cross-check: the per-day totals must add up to the overall total; a day
# label missing from unique_days would make this fail.
total_revenue = np.sum(prices)
sum_of_daily_totals = np.sum(daily_revenue)
is_valid = np.isclose(total_revenue, sum_of_daily_totals)

print(f"Price Mean = {mean_price:.2f}, Std Dev = {std_price:.2f}")
for day, revenue, count in zip(unique_days, daily_revenue, daily_counts):
    print(f"{day}: ${revenue:.2f} ({count} tickets)")
print(f"Total Revenue: ${total_revenue:.2f}")
print(f"Validation: Sum of daily totals matches total? {is_valid}")

Price Mean = 316.26, Std Dev = 115.03
Mon: $14537.36 (45 tickets)
Tue: $13965.53 (45 tickets)
Wed: $13623.18 (44 tickets)
Thu: $10381.56 (33 tickets)
Fri: $14590.49 (45 tickets)
Sat: $14222.87 (45 tickets)
Sun: $14189.64 (45 tickets)
Total Revenue: $95510.63
Validation: Sum of daily totals matches total? True


# Task 4: Identify high-price tickets

In [7]:
# Cut-off for "high-price": the 90th percentile of the cleaned prices.
threshold = np.percentile(prices, q=90)

# Select every price at or above the cut-off. Note that high_price_tickets
# holds the price values themselves, not the ticket records.
high_price_mask = prices >= threshold
high_price_tickets = prices[high_price_mask]
high_price_count = len(high_price_tickets)

# Sanity check on the selection (true by construction of the mask).
verification = (high_price_tickets >= threshold).all()

print(
    f"90th percentile threshold: ${threshold:.2f}",
    f"Number of high-price tickets: {high_price_count}",
    f"Verification: {verification}",
    sep="\n",
)

90th percentile threshold: $481.09
Number of high-price tickets: 31
Verification: True


# Task 5: Produce a final report

In [9]:
# Summary of the whole pipeline, built from results of the earlier cells.
# Numeric values are converted to plain Python floats so the dict does not
# carry NumPy scalar types.
report = {
    "total_tickets": len(tickets),
    "cleaned_tickets": len(cleaned_tickets),
    "mean_price": round(float(mean_price), 2),
    "std_price": round(float(std_price), 2),
    "daily_totals": {
        day: round(float(rev), 2) for day, rev in zip(unique_days, daily_revenue)
    },
    "high_price_count": high_price_count,
}

print(f"Total Records Processed: {report['total_tickets']}")
print(f"Cleaned Records:          {report['cleaned_tickets']}")
# Fixed two-decimal formatting so values such as 316.2 print as 316.20.
print(f"Average Ticket Price:    ${report['mean_price']:.2f}")
print(f"Price Standard Dev:      ${report['std_price']:.2f}")
print(f"High-Price Tickets (90th percentile): {report['high_price_count']}")
print("Daily Revenue Breakdown:")
for day, total in report['daily_totals'].items():
    print(f"  - {day}: ${total:,.2f}")

# Cleaning check: the raw data is generated with invalid rows, so removing
# nothing is reported as a warning, with a message that matches the status.
removed_count = report['total_tickets'] - report['cleaned_tickets']
validation_msg = "SUCCESS" if removed_count > 0 else "WARNING"
if removed_count > 0:
    print(f"VALIDATION: {validation_msg} - Data cleaning successfully filtered {removed_count} invalid records.")
else:
    print(f"VALIDATION: {validation_msg} - Data cleaning did not remove any records.")

# Revenue check: actually compare the (unrounded) daily totals with the
# overall total instead of printing SUCCESS unconditionally.
daily_sum_matches = bool(np.isclose(np.sum(daily_revenue), total_revenue))
if daily_sum_matches:
    print("VALIDATION: SUCCESS - Daily revenue sum matches total revenue within floating-point tolerance.")
else:
    print("VALIDATION: WARNING - Daily revenue sum does not match total revenue.")

Total Records Processed: 320
Cleaned Records:          302
Average Ticket Price:    $316.26
Price Standard Dev:      $115.03
High-Price Tickets (90th percentile): 31
Daily Revenue Breakdown:
  - Mon: $14,537.36
  - Tue: $13,965.53
  - Wed: $13,623.18
  - Thu: $10,381.56
  - Fri: $14,590.49
  - Sat: $14,222.87
  - Sun: $14,189.64
VALIDATION: SUCCESS - Data cleaning successfully filtered 18 invalid records.
VALIDATION: SUCCESS - Daily revenue sum matches total revenue within floating-point tolerance.
