In [1]:
import pandas as pd

# Load just the first five data rows (nrows=5) for a quick look at the
# columns and value formats.
df = pd.read_csv(filepath_or_buffer="itineraries.csv", nrows=5)
df.head(n=5)

Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,fareBasisCode,travelDuration,elapsedDays,isBasicEconomy,isRefundable,...,segmentsArrivalTimeEpochSeconds,segmentsArrivalTimeRaw,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineName,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode
0,9ca0e81111c683bec1012473feefd28f,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H29M,0,False,False,...,1650223560,2022-04-17T15:26:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,8940,947,coach
1,98685953630e772a098941b71906592b,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H30M,0,False,False,...,1650200400,2022-04-17T09:00:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,9000,947,coach
2,98d90cbc32bfbb05c2fc32897c7c1087,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H30M,0,False,False,...,1650218700,2022-04-17T14:05:00.000-04:00,BOS,ATL,Delta,DL,Boeing 757-200,9000,947,coach
3,969a269d38eae583f455486fa90877b4,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H32M,0,False,False,...,1650227460,2022-04-17T16:31:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,9120,947,coach
4,980370cf27c89b40d2833a1d5afc9751,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H34M,0,False,False,...,1650213180,2022-04-17T12:33:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,9240,947,coach


In [2]:
from collections import Counter

# Tally how many itineraries exist for each origin-destination pair.
# The file is streamed in 500,000-row chunks and only the two airport columns
# are parsed, so the full dataset is never held in memory at once.
route_counts = Counter()

for chunk in pd.read_csv("itineraries.csv", usecols=["startingAirport", "destinationAirport"], chunksize=500_000):
    # A route needs both endpoints; rows missing either one are skipped.
    chunk = chunk.dropna(subset=["startingAirport", "destinationAirport"])

    # Route label in the form "ORIGIN-DESTINATION".
    routes = chunk["startingAirport"] + "-" + chunk["destinationAirport"]

    # Count inside pandas first, then merge the per-chunk totals: when
    # Counter.update is given a mapping it adds the mapped counts to the
    # running totals, so no per-row Python iteration is needed.
    route_counts.update(routes.value_counts().to_dict())

# Keep the 20 most frequent routes, highest count first.
route_df = pd.DataFrame(route_counts.most_common(20), columns=["Route", "Count"])
print(route_df)


      Route   Count
0   ATL-LAX  709809
1   LAX-BOS  679169
2   LGA-LAX  677713
3   LAX-ATL  669609
4   LAX-LGA  663659
5   BOS-LAX  644390
6   LAX-JFK  625496
7   LAX-ORD  620576
8   DFW-LAX  612390
9   LAX-DFW  610669
10  JFK-LAX  605017
11  LAX-DTW  601537
12  ORD-LAX  597847
13  LAX-EWR  587270
14  DTW-LAX  582022
15  CLT-LAX  572097
16  JFK-ORD  557152
17  LAX-CLT  554474
18  LGA-ORD  550319
19  LAX-PHL  549880


In [3]:
# Route under study.
origin = "LAX"
destination = "JFK"

# Running tally of the raw `segmentsAirlineName` values seen on that route.
# Values are counted verbatim, so "||"-joined multi-segment strings are
# tallied as their own entries.
airline_counts = Counter()

route_airline_cols = ["startingAirport", "destinationAirport", "segmentsAirlineName"]
for chunk in pd.read_csv("itineraries.csv", usecols=route_airline_cols, chunksize=500_000):
    # Discard rows lacking any of the three fields before filtering.
    chunk = chunk.dropna(subset=route_airline_cols)

    leaves_origin = chunk["startingAirport"].eq(origin)
    reaches_destination = chunk["destinationAirport"].eq(destination)
    airline_counts.update(chunk.loc[leaves_origin & reaches_destination, "segmentsAirlineName"])

# most_common() without an argument returns every entry, highest count first.
airline_df = pd.DataFrame(airline_counts.most_common(), columns=["Airline", "FlightCount"])
print(airline_df.head(10))


                                Airline  FlightCount
0  American Airlines||American Airlines       151007
1                       JetBlue Airways       113339
2                     American Airlines        93532
3      Alaska Airlines||Alaska Airlines        84522
4                                 Delta        71229
5                          Delta||Delta        41616
6      JetBlue Airways||JetBlue Airways        23101
7                                United        16705
8                        United||United        14394
9               United||Alaska Airlines         7268


In [4]:
import pandas as pd

def _primary_airline(raw_name):
    """Return the first "||"-separated entry of ``raw_name``, whitespace-stripped."""
    return raw_name.split("||")[0].strip()


# Reduce every "||"-joined airline string to its first entry.
airline_df["Airline"] = airline_df["Airline"].map(_primary_airline)

# Rows that now share a name are merged by summing their counts, then the
# result is ranked from highest to lowest count with a fresh 0..n-1 index.
airline_df = (
    airline_df
    .groupby("Airline", as_index=False)["FlightCount"]
    .sum()
    .sort_values(by="FlightCount", ascending=False)
    .reset_index(drop=True)
)

print(airline_df.head(10))


             Airline  FlightCount
0  American Airlines       246932
1    JetBlue Airways       136443
2              Delta       116042
3    Alaska Airlines        85958
4             United        40121


In [6]:
# Extract every itinerary on the chosen route whose airline field mentions the
# chosen airline, keeping all columns, and save the result to its own CSV.
origin = "LAX"
destination = "JFK"
airline = "American Airlines"

# Matching rows from each chunk; concatenated once after the loop.
chunks = []

for chunk in pd.read_csv("itineraries.csv", chunksize=500_000):
    mask = (
        (chunk["startingAirport"] == origin)
        & (chunk["destinationAirport"] == destination)
        # Substring match on the raw airline field, so a "||"-joined
        # multi-segment value matches if the name appears anywhere in it.
        # regex=False: treat the name as literal text rather than a pattern.
        # case=False: ignore letter case.  na=False: missing values never match.
        & (chunk["segmentsAirlineName"].str.contains(airline, case=False, na=False, regex=False))
    )
    filtered = chunk.loc[mask]
    if not filtered.empty:
        chunks.append(filtered)

# pd.concat raises an uninformative "No objects to concatenate" on an empty
# list; fail with a message that names the filter that matched nothing.
if not chunks:
    raise ValueError(f"No itineraries found for {airline} on {origin}-{destination}")

df = pd.concat(chunks)
df.to_csv("aa_lax_jfk_full.csv", index=False)
print("✅ Saved full filtered dataset with shape:", df.shape)

✅ Saved full filtered dataset with shape: (246932, 27)


In [7]:

# Reload the filtered extract from disk and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="aa_lax_jfk_full.csv")
print("Dataset shape:", df.shape)

Dataset shape: (246932, 27)


In [8]:
# Null count per column, largest first.
missing = df.isna().sum().sort_values(ascending=False)
# The same counts as a percentage of all rows (left unrounded here).
missing_percent = missing.div(len(df)).mul(100)

# Two-column report; percentages are rounded to two decimals for display.
missing_summary = missing.to_frame(name="Missing Values").assign(
    **{"Percent Missing": missing_percent.round(2)}
)

print(missing_summary.head(15))

                     Missing Values  Percent Missing
totalTravelDistance            1432             0.58
segmentsDistance                358             0.14
legId                             0             0.00
startingAirport                   0             0.00
destinationAirport                0             0.00
fareBasisCode                     0             0.00
travelDuration                    0             0.00
elapsedDays                       0             0.00
isBasicEconomy                    0             0.00
searchDate                        0             0.00
flightDate                        0             0.00
isNonStop                         0             0.00
isRefundable                      0             0.00
totalFare                         0             0.00
baseFare                          0             0.00


In [9]:
# Discard any row in which either distance field is missing.
df = df.dropna(axis="index", how="any", subset=["totalTravelDistance", "segmentsDistance"])

In [10]:
# Remove rows that are exact duplicates across all columns.
df = df.drop_duplicates()

# Keep only itineraries whose flight date is not earlier than the search date.
# NOTE(review): the columns are compared as read from the CSV; if they are
# plain "YYYY-MM-DD" strings (as the previews suggest) this is a lexical
# comparison, which orders such dates correctly — confirm the format holds.
not_before_search = df["flightDate"].ge(df["searchDate"])
df = df[not_before_search]

df.to_csv(path_or_buf="aa_lax_jfk_clean.csv", index=False)
print("✅ Cleaned dataset saved:", df.shape)

✅ Cleaned dataset saved: (245500, 27)


In [12]:
# Show the first five rows of the current frame.
df.head(n=5)

Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,fareBasisCode,travelDuration,elapsedDays,isBasicEconomy,isRefundable,...,segmentsArrivalTimeEpochSeconds,segmentsArrivalTimeRaw,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineName,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode
0,ef15f068409b3ac81252397357f64341,2022-04-16,2022-04-17,LAX,JFK,M0AHZNN1,PT5H29M,1,False,False,...,1650256140,2022-04-18T00:29:00.000-04:00,JFK,LAX,American Airlines,AA,AIRBUS INDUSTRIE A321 SHARKLETS,19740,2458,coach
1,37ba57ecb1ff9dda44cc8ecb7fa4c78f,2022-04-16,2022-04-17,LAX,JFK,M0AHZNN1,PT5H29M,1,False,False,...,1650284880,2022-04-18T08:28:00.000-04:00,JFK,LAX,American Airlines,AA,AIRBUS INDUSTRIE A321 SHARKLETS,19740,2458,coach
2,f6f6607cd706c732459d2853ab445aee,2022-04-16,2022-04-17,LAX,JFK,M0AHZNN1,PT5H30M,0,False,False,...,1650238200,2022-04-17T19:30:00.000-04:00,JFK,LAX,American Airlines,AA,AIRBUS INDUSTRIE A321 SHARKLETS,19800,2458,coach
3,c284f0239f46fd2afd6d72a33c2737f1,2022-04-16,2022-04-17,LAX,JFK,M0AHZNN1,PT5H30M,0,False,False,...,1650241800,2022-04-17T20:30:00.000-04:00,JFK,LAX,American Airlines,AA,AIRBUS INDUSTRIE A321 SHARKLETS,19800,2458,coach
4,7558638d8c2c18d062c73bc7abb06a00,2022-04-16,2022-04-17,LAX,JFK,M0AHZNN1,PT5H30M,0,False,False,...,1650225600,2022-04-17T16:00:00.000-04:00,JFK,LAX,American Airlines,AA,AIRBUS INDUSTRIE A321 SHARKLETS,19800,2458,coach


In [13]:
# nrows=0 reads no data rows, so only the header is used to list the columns.
cols = list(pd.read_csv("itineraries.csv", nrows=0).columns)
print("Number of columns:", len(cols))
print(cols)

Number of columns: 27
['legId', 'searchDate', 'flightDate', 'startingAirport', 'destinationAirport', 'fareBasisCode', 'travelDuration', 'elapsedDays', 'isBasicEconomy', 'isRefundable', 'isNonStop', 'baseFare', 'totalFare', 'seatsRemaining', 'totalTravelDistance', 'segmentsDepartureTimeEpochSeconds', 'segmentsDepartureTimeRaw', 'segmentsArrivalTimeEpochSeconds', 'segmentsArrivalTimeRaw', 'segmentsArrivalAirportCode', 'segmentsDepartureAirportCode', 'segmentsAirlineName', 'segmentsAirlineCode', 'segmentsEquipmentDescription', 'segmentsDurationInSeconds', 'segmentsDistance', 'segmentsCabinCode']


In [14]:
# Show the two fare columns side by side for the first five rows.
df.loc[:, ["baseFare", "totalFare"]].head()


Unnamed: 0,baseFare,totalFare
0,327.44,366.6
1,327.44,366.6
2,327.44,366.6
3,327.44,366.6
4,327.44,366.6
