<h1>TASK-1</h1>

Task 1: Generate the raw dataset using fixed rules

In [1]:
import numpy as np

# --- Generation parameters ---------------------------------------------------
seed_value = int("0704")  # int() drops the leading zero, so the seed is 704
n = 320
rng = np.random.default_rng(seed_value)

# Lookup tables: labels are picked by modular index, and each price
# adjustment is keyed by the label it belongs to.
ROUTES = ["NYC-LAX", "LHR-JFK", "SFO-SEA", "DXB-SIN", "MAD-ROM"]
WEEKDAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
CABIN_CLASSES = ["economy", "premium", "business"]
ROUTE_ADJ = dict(zip(ROUTES, [140, 220, 60, 180, 80]))
CLASS_ADJ = dict(zip(CABIN_CLASSES, [0, 80, 220]))

tickets = []
for i in range(1, n + 1):
    offset = i + seed_value
    route = ROUTES[offset % 5]
    cabin = CABIN_CLASSES[(i * 2 + seed_value) % 3]
    lead_days = 1 + ((i * 3 + seed_value) % 60)

    # Exactly one normal draw per ticket keeps the random stream aligned
    # with the ticket index.
    noise = rng.normal(0, 25)
    price = round(
        120 + (lead_days * -1.5) + ROUTE_ADJ[route] + CLASS_ADJ[cabin] + noise, 2
    )

    # Deliberately injected data-quality problems for the cleaning task:
    # every 28th price is blanked, every 45th (non-blank) price is negated,
    # and every 37th class label is upper-cased.
    if i % 28 == 0:
        price = ""
    elif i % 45 == 0:
        price = price * -1
    if i % 37 == 0:
        cabin = cabin.upper()

    tickets.append(
        {
            "ticket_id": f"T{seed_value}-{i:04d}",
            "route": route,
            "day": WEEKDAYS[offset % 7],
            "days_to_departure": lead_days,
            "class": cabin,
            "price_usd": price,
        }
    )

print(len(tickets))

print(tickets[:5])

320
[{'ticket_id': 'T704-0001', 'route': 'NYC-LAX', 'day': 'Sat', 'days_to_departure': 48, 'class': 'premium', 'price_usd': 265.12}, {'ticket_id': 'T704-0002', 'route': 'LHR-JFK', 'day': 'Sun', 'days_to_departure': 51, 'class': 'economy', 'price_usd': 239.99}, {'ticket_id': 'T704-0003', 'route': 'SFO-SEA', 'day': 'Mon', 'days_to_departure': 54, 'class': 'business', 'price_usd': 328.17}, {'ticket_id': 'T704-0004', 'route': 'DXB-SIN', 'day': 'Tue', 'days_to_departure': 57, 'class': 'premium', 'price_usd': 350.05}, {'ticket_id': 'T704-0005', 'route': 'MAD-ROM', 'day': 'Wed', 'days_to_departure': 60, 'class': 'economy', 'price_usd': 98.1}]


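<h1>TASK-2</h1>

Task 2: Clean the raw tickets (lower-case the class labels and drop tickets whose price is missing, non-numeric, or negative)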

In [2]:
def _has_valid_price(ticket):
    """Return True when the ticket carries a real, non-negative price.

    Rejects the empty-string placeholder used for missing prices, ``None``,
    any other non-numeric value, booleans, and negative amounts. NaN is
    rejected too, because ``nan >= 0`` evaluates to False.
    """
    price = ticket["price_usd"]
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(price, bool) or not isinstance(price, (int, float)):
        return False
    return price >= 0


# Build the cleaned list out of *new* dicts in a single pass. The raw
# `tickets` list (and the dicts inside it) is left untouched, so this cell can
# be re-run safely and the raw data stays available for comparison.
cleaned_tickets = [
    {**ticket, "class": ticket["class"].lower()}
    for ticket in tickets
    if _has_valid_price(ticket)
]

# Invariants of the cleaning step: rows are only ever removed, every surviving
# price is non-negative, and every class label is lower-case.
assert len(cleaned_tickets) <= len(tickets)
assert all(ticket["price_usd"] >= 0 for ticket in cleaned_tickets)
assert all(ticket["class"] == ticket["class"].lower() for ticket in cleaned_tickets)

cleaned_tickets[:2]

[{'ticket_id': 'T704-0001',
  'route': 'NYC-LAX',
  'day': 'Sat',
  'days_to_departure': 48,
  'class': 'premium',
  'price_usd': 265.12},
 {'ticket_id': 'T704-0002',
  'route': 'LHR-JFK',
  'day': 'Sun',
  'days_to_departure': 51,
  'class': 'economy',
  'price_usd': 239.99}]

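<h1>TASK-3</h1>

Task 3: Compute summary statistics with NumPy (mean and standard deviation of price, total price per day, and ticket count per day)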

In [3]:
# Pull the two columns needed for the statistics into NumPy arrays.
prices = np.array([row["price_usd"] for row in cleaned_tickets])
days = np.array([row["day"] for row in cleaned_tickets])

print(np.mean(prices))
print(np.std(prices))

# `inverse` maps every ticket to the position of its day in `unique_days`,
# which lets the prices be accumulated per day in one unbuffered pass.
unique_days, inverse = np.unique(days, return_inverse=True)

totals = np.zeros(len(unique_days))
np.add.at(totals, inverse, prices)

print(unique_days)
print(totals)

# From here on `totals` is a {day: summed price} mapping.
totals = {label: amount for label, amount in zip(unique_days, totals)}


def day_count(day):
    """Return the number of cleaned tickets whose day equals ``day``."""
    return int((days == day).sum())


days_dict = {
    label: day_count(label)
    for label in ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
}
days_dict

308.26695364238407
114.5970244201458
['Fri' 'Mon' 'Sat' 'Sun' 'Thu' 'Tue' 'Wed']
[10185.4  14027.33 14115.08 13725.69 13712.9  13803.9  13526.32]


{'Mon': 45, 'Tue': 45, 'Wed': 45, 'Thu': 44, 'Fri': 33, 'Sat': 45, 'Sun': 45}

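<h1>TASK-4</h1>

Task 4: Find the high-price tickets (prices at or above the 90th percentile) and verify the selection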

In [4]:

# Price at the 90th percentile of all cleaned ticket prices.
threshold = np.percentile(prices, 90)
print("90th percentile threshold:", threshold)

# Keep only the prices that sit at or above that cut-off.
is_high = prices >= threshold
high_prices = prices[is_high]

count = high_prices.size
print("Count of high-price tickets:", count)

# Re-check the selected prices against the threshold.
verification = (high_prices >= threshold).all()
print("Verification:", verification)

90th percentile threshold: 470.82000000000005
Count of high-price tickets: 31
Verification: True


<h1>TASK-5</h1>

Task 5: Assemble the summary report, validate it, and print it

In [5]:
# Gather every headline figure in one mapping so that the validation checks
# and the printed summary read from the same source.
report = dict(
    total_tickets=len(tickets),
    cleaned_tickets=len(cleaned_tickets),
    mean_price=prices.mean(),
    std_price=prices.std(),
    daily_totals=totals,
    high_price_count=count,
)

# Cleaning can only remove rows, never add them.
assert report["cleaned_tickets"] <= report["total_tickets"], (
    "Validation failed: cleaned tickets cannot exceed total tickets"
)

# The per-day loop below relies on `.items()`.
assert isinstance(report["daily_totals"], dict), (
    "Validation failed: daily_totals must be a dictionary"
)

# Build the whole summary first, then emit it with a single print call.
summary_lines = [
    "=== TICKET SALES REPORT ===",
    f"Total Tickets: {report['total_tickets']}",
    f"Cleaned Tickets: {report['cleaned_tickets']}",
    f"Mean Price: {report['mean_price']:.2f}",
    f"Price Standard Deviation: {report['std_price']:.2f}",
    f"High Price Ticket Count: {report['high_price_count']}",
    "\nDaily Totals:",
]
summary_lines.extend(
    f"  {label}: {amount:.2f}" for label, amount in report["daily_totals"].items()
)
summary_lines.append("\nValidation passed: cleaned_tickets ≤ total_tickets")

print("\n".join(summary_lines))

=== TICKET SALES REPORT ===
Total Tickets: 320
Cleaned Tickets: 302
Mean Price: 308.27
Price Standard Deviation: 114.60
High Price Ticket Count: 31

Daily Totals:
  Fri: 10185.40
  Mon: 14027.33
  Sat: 14115.08
  Sun: 13725.69
  Thu: 13712.90
  Tue: 13803.90
  Wed: 13526.32

Validation passed: cleaned_tickets ≤ total_tickets
