# Function Codes

In [None]:
def dataCleaning(df, code=True, tips=False, orientation=True, formatIssues=True, missingValues=True, duplicateValues=True, outliers=True):
    """
    df: your dataframe

    code: A text template to note your observations as you go. Use the code snippets included in the output. copy-paste into vscode/notepad

    tips: Provides snippets of code to help you clean potential issues in your df. If you prefer this to code
    
    orientation: Provides information about the shape/objects of your data
    
    formatIssues: Provides detailed information on each column to help identify format issues
    
    missingValues: Provides information on missing values
    
    duplicateValues: Provides information on duplicate values
    
    outliers: Provides information on outliers
    """
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns

    if code==True:
        print("### CLEANING CODE:")
        print("df = dfX #Change to your df's name")
        print()
        print("#### Change column value:")
        print()
        print()
        print("#### Drop entire column:")
        print()
        print()
        print("#### Change column type:")
        print()
        print()
        print("#### Change column name:")
        print()
        print()
        print("#### Handle missing values:")
        print()
        print()
        print("#### Handle duplicate values:")
        print("# df.drop_duplicates(inplace=True) # drop ALL duplicate rows")
        print()
        print("#### Drop outliers:")
        print()
        print()
        print("#### Other observations / further investigations:")
        print("#")
        print("#")
        print("#")
        print()
        print("df.head() #Final Review")
        print("# dfX = df #Change to your df's name")
        print()
        print("=========================================")
    
    if orientation==True:
        print("ORIENTATION")
        print(df.info())
        print("=========================================")
        print()
        
    
    if formatIssues==True:
        print("FORMAT ISSUES")
        print()
        for col in df.columns:
            if df[col].dtype == 'object' or df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'datetime64':
            #if df[col].dtype == 'float64':

                print("df.rename(columns={'" + col + "': ''}, inplace=True)", "#rename column")
                print("df['" + col + "'] = df['" + col + "'].replace('old_value', 'new_value')")
                print("df['" + col + "'] = df['" + col + "'].astype('new_type') # new_type can be int64, float64, object, category, datetime64")
                print("df.drop('" + col + "', axis=1, inplace=True)")                
                pd.set_option('display.max_rows', None)
                print(df.groupby(col, sort=True).size())
                pd.reset_option('display.max_rows')
                #display the dtypes of the column
                print("Current Column DType: ", df[col].dtype, "     Do not compare with above. This one will always return int64 as it's the dtype of the count")                
                print("df['" + col + "'] = df['" + col + "'].astype('new_type') # new_type can be int64, float64, object, category, datetime64")
                print()
            #else:
            #    print(col)
            #    print(df[col].describe())
            #    print()

        if tips==True:
            print("TIPS")
            print("To make a correction to a column, use the following syntax:")
            print("df['A'] = df['A'].apply(lambda x: x.replace('old_value', 'new_value'))")
            print()
            print("To change a column data type, use the following syntax:")
            print("df['A'] = pd.to_datetime(df['A']) # for datetime")
            print("df['A'] = df['A'].astype('int64') # for integers")
            print("df['A'] = df['A'].astype('float64') # for floats")
            print("df['A'] = df['A'].astype('category') # for categorical")
            print("df['A'] = df['A'].astype('object') # for object")
            print()
        print("=========================================")
        print()

    if missingValues==True:
        print("MISSING VALUES")
        print()
        for col in df.columns:
            if df[col].isnull().sum() > 0:
                print(col, ":", df[col].isnull().sum(), " missing values")
                print("df.dropna(subset=['" + col + "'], inplace=True)")
                print("df['" + col + "'].fillna(df['" + col + "'].mean(), inplace=True) #fill NA entries with the mean")
                print("df['" + col + "'].fillna(0, inplace=True) # fill NA entries with a single value, such as zero")
                print()
                print(df.loc[df[col].isnull()].head())
                print()
            else:
                print(col, ": No missing values")
                print()
                                    
        if tips==True:
            print()
            print("TIPS")
            print("You can drop rows with missing values using one of the following code:")
            print("df.dropna(subset=['col'], inplace=True) #For a single column")
            print("df.dropna(inplace=True) #For all columns")
            print()
            print("You can fill rows with missing values using one of the following code:")
            print("df['col'].fillna(df['col'].mean(), inplace=True) #fill NA entries with the mean")
            print("df['col'].fillna(0, inplace=True) # fill NA entries with a single value, such as zero")
            print("df['col'].fillna(method='ffill') # forward-fill to propagate the previous value forward")
            print("df['col'].fillna(method='bfill' # back-fill to propagate the next values backward)")
            print()
            print("To view them:")
            print("df.loc[df[col].isnull()].head()")
            print()
        print("=========================================")
        print()

    if duplicateValues==True:
        print("DUPLICATE VALUES")
        print()
        print(df[df.duplicated()].head())
        print()

        if tips==True:
            print("TIPS")
            print("You can drop duplicate rows using the following code:")
            print("df.drop_duplicates(inplace=True)")
            print("df.drop_duplicates(subset=['col'], inplace=True) #For a single column")
            print()
            print("To view them:")
            print("df[df.duplicated()].head()")
            print()
    
        print("=========================================")
        print()

    if outliers==True:
        print("OUTLIERS")
        print()
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                print(col)
                print("-----")
                print("Outlier(s):")
                print("Below ", df[col].mean() - 3*df[col].std(), " -> ", df[df[col] < df[col].mean() - 3*df[col].std()].shape[0], " low outlier(s)")
                print("Above ", df[col].mean() + 3*df[col].std(), " -> ", df[df[col] > df[col].mean() + 3*df[col].std()].shape[0], " high outlier(s)")
                low = df[col].mean() - 3*df[col].std()
                high = df[col].mean() + 3*df[col].std()
                print("df = df[(df['" + col + "'] > " + str(low) + ") & (df['" + col + "'] < " + str(high) + ")]")
                print()
                print(df[col].describe())
                print()
                print("Boxplot")
                sns.boxplot(df[col])
                plt.show()
                print()
                print("Histogram")
                sns.histplot(df[col])
                plt.show()
                print("=========================================")
                print()

        if tips==True:
            print("TIPS")
            print("You can drop outliers using the following code:")
            print("df = df[(df['column'] > lower_bound) & (df['column'] < upper_bound)]")
            print()

# Import

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_columns", 120)

In [96]:
flights_sample = pd.read_csv("../../data/processed/flights_sample.csv")

#passengers = pd.read_csv("../../data/raw/passengers_w_departuresPerformed(29Nov).csv") #In the first step noticed this was not properly done so made the new one below
passengers = pd.read_csv("../../data/raw/passengers_w_departuresPerformed_groupedbyMonth(29Nov).csv")

passengers:

This CSV was produced using the following SQL Query:

SELECT CONCAT(year, '_', month, '-', unique_carrier, '-', origin, '-', dest) AS routeId
	, payload / departures_performed AS averagePayload_lbs
	, freight / departures_performed AS averageFreight_lbs
	, mail / departures_performed AS averageMail_lbs
	, seats / departures_performed AS availableSeats
	, passengers / departures_performed AS averagePassengers
	--, passengers / seats * 100 AS averageSeatsoccupied_perc
	, aircraft_group AS aircraftGroup
	, aircraft_type AS aircraftType
	, aircraft_config AS aircraftConfiguration
	, distance_group AS distanceInterval_x500mi
	, class AS serviceClass
FROM passengers
WHERE departures_performed > 0

# Keep only relevant rows (Take 1 - Archived)

The passengers table is huge, it likely has information on routes that we don't need. Let's slim it down

In [67]:
passengers.shape

(2334701, 11)

In [59]:
###Create the route_ID column for flights_sample
flights_sample['routeId_op'] = flights_sample['Flight Year'].astype(str) + '_' + flights_sample['Flight Month'].astype(str) + '-' + flights_sample['Operator - Unique Carrier Code'] + '-' + flights_sample['Origin Airport (IATA Code)'] + '-' + flights_sample['Destination Airport (IATA Code)']
flights_sample['routeId_mkt'] = flights_sample['Flight Year'].astype(str) + '_' + flights_sample['Flight Month'].astype(str) + '-' + flights_sample['Marketer - Unique Carrier Code'] + '-' + flights_sample['Origin Airport (IATA Code)'] + '-' + flights_sample['Destination Airport (IATA Code)']

Made 2x version, as not sure if the passengers' table is referring to the Operator or the Marketer.. we'll test both

In [60]:
# List the number of unique flights_sample['routeId_op'] values that are in the passengers['routeid'] column using the isin() method, ensuring the values are stored in a df, not a numpy arrai
#flight_op = pd.DataFrame()
#flight_mkt = pd.DataFrame()
#unique_passengers = pd.DataFrame() 

flight_op['routeId_op'] = flights_sample['routeId_op'].unique()
flight_mkt['routeId_mkt'] = flights_sample['routeId_mkt'].unique()
unique_passengers['routeid'] = passengers['routeid'].unique()

# Count the number of unique flights_sample['routeId_op'] values that are in the passengers['routeid'] column using the isin() method, ensuring the values are stored in a df, not a numpy array
print("Flight Operator")
print("Total:", len(flight_op))
print("In passenger:", flight_op['routeId_op'].isin(unique_passengers['routeid']).sum())
print()
print("Flight Marketer")
print("Total:", len(flight_mkt))
print("In passenger:", flight_mkt['routeId_mkt'].isin(unique_passengers['routeid']).sum())
print()
print("Passengers Table") #Double tab but just to be sure..
print("Total:", len(unique_passengers))
print("In Marketer:", unique_passengers['routeid'].isin(flight_mkt['routeId_mkt']).sum())
print("In Operator:", unique_passengers['routeid'].isin(flight_op['routeId_op']).sum())

Flight Operator
Total: 112012
In passenger: 111997

Flight Marketer
Total: 102429
In passenger: 72961

Passengers Table
Total: 1479024
In Marketer: 72961
In Operator: 111997


Ok so the Flight Operator has more results, let's see the route that are not in the passengers table

In [61]:
print(flights_sample[~flights_sample['routeId_op'].isin(unique_passengers['routeid'])].shape)
flights_sample[~flights_sample['routeId_op'].isin(unique_passengers['routeid'])].head(10)

(26, 39)


Unnamed: 0,Flight Year,Flight Month,Flight Day,Flight Weekday,Marketer - Unique Carrier Code,Operator - Unique Carrier Code,Different Marketer & Operator Carrier Code,Tail Number,Flight Number,Origin Airport (ID),Origin Airport (IATA Code),"Origin Airport (City, State)",Destination Airport (ID),Destination Airport (IATA Code),"Destination Airport (City, State)",Departure Delay (minutes),Arrival Delay (minutes),Cancelled,Cancellation Code,Diverted,Scheduled Departure Time (local time),Actual Departure Time (local time),Taxi Out (minutes),Wheels Off (local time),Wheels On (local time),Taxi In (minutes),Scheduled Arrival Time (local time),Actual Arrival Time (local time),Scheduled Elapsed Time,Actual Elapsed Time,Air Time,Distance (miles),Carrier Delay (minutes),Weather Delay (minutes),National Air System Delay (minutes),Security Delay (minutes),Late Aircraft Delay (minutes),routeId_op,routeId_mkt
1085,2019,11,5,1,HA,EM,1,N806HC,648,12173,HNL,"Honolulu, HI",12492,JHM,"Kapalua, HI",-3,-3,0,,0,20:21,20:18,00:12,20:30,20:53,00:03,20:59,20:56,38,38,23,84,0,0,0,0,0,2019_11-EM-HNL-JHM,2019_11-HA-HNL-JHM
10742,2019,11,3,6,HA,EM,1,N801HC,673,13347,MKK,"Hoolehua, HI",12173,HNL,"Honolulu, HI",82,82,0,,0,06:43,09:25,00:03,09:28,09:46,00:08,07:52,09:54,29,29,18,54,3,0,0,0,79,2019_11-EM-MKK-HNL,2019_11-HA-MKK-HNL
14230,2018,9,21,4,NK,NK,0,N627NK,1455,10529,BDL,"Hartford, CT",15304,TPA,"Tampa, FL",-11,-23,0,,0,18:40,18:29,00:10,18:39,22:31,00:05,23:39,22:36,179,167,152,1111,0,0,0,0,0,2018_9-NK-BDL-TPA,2018_9-NK-BDL-TPA
17621,2018,10,16,1,HA,EM,1,N804HC,610,12173,HNL,"Honolulu, HI",13347,MKK,"Hoolehua, HI",-7,-11,0,,0,10:55,10:48,,,,,12:05,11:54,30,26,0,54,0,0,0,0,0,2018_10-EM-HNL-MKK,2018_10-HA-HNL-MKK
18445,2018,10,28,6,HA,EM,1,N804HC,654,12173,HNL,"Honolulu, HI",12492,JHM,"Kapalua, HI",-3,-8,0,,0,03:18,03:15,,,,,04:36,04:28,38,33,0,84,0,0,0,0,0,2018_10-EM-HNL-JHM,2018_10-HA-HNL-JHM
37388,2019,3,3,6,UA,EV,1,N13975,4152,13930,ORD,"Chicago, IL",14100,PHL,"Philadelphia, PA",0,0,1,A,0,13:20,,,,,,17:38,,118,0,0,678,0,0,0,0,0,2019_3-EV-ORD-PHL,2019_3-UA-ORD-PHL
52425,2019,11,19,1,HA,EM,1,N805HC,653,13347,MKK,"Hoolehua, HI",12173,HNL,"Honolulu, HI",9,8,0,,0,01:38,01:47,00:05,01:52,02:52,00:03,02:47,02:55,29,28,20,54,0,0,0,0,0,2019_11-EM-MKK-HNL,2019_11-HA-MKK-HNL
58174,2018,10,16,1,HA,EM,1,N804HC,611,13347,MKK,"Hoolehua, HI",12173,HNL,"Honolulu, HI",-12,-8,0,,0,12:31,12:19,,,,,13:39,13:31,28,32,0,54,0,0,0,0,0,2018_10-EM-MKK-HNL,2018_10-HA-MKK-HNL
58898,2018,10,15,0,HA,EM,1,N804HC,613,13034,LNY,"Lanai, HI",12173,HNL,"Honolulu, HI",73,70,0,,0,15:49,18:22,,,,,17:01,18:51,32,29,0,72,0,0,0,0,70,2018_10-EM-LNY-HNL,2018_10-HA-LNY-HNL
68622,2019,11,12,1,HA,EM,1,N801HC,645,13034,LNY,"Lanai, HI",12173,HNL,"Honolulu, HI",-16,-19,0,,0,20:05,19:09,00:07,19:16,20:17,00:04,20:40,20:21,35,32,21,72,0,0,0,0,0,2019_11-EM-LNY-HNL,2019_11-HA-LNY-HNL


This just made me notice something.. we don't have the data for 2020.. so we need to consolidate this information on a monthly thing. We'll need to reformat our route_ID to MM-carrier-origin-dest and then group the info of every year to either an average or a min/max where appropriate..

# Tweaking the passengers table

In [84]:
# Let's remove the Year of the routeId column
passengers = pd.read_csv("../../data/raw/passengers_w_departuresPerformed(29Nov).csv")

passengers['routeid_2'] = passengers['routeid'].str[5:]
passengers.head()

Unnamed: 0,routeid,averagepayload_lbs,averagefreight_lbs,averagemail_lbs,availableseats,averagepassengers,aircraftgroup,aircrafttype,aircraftconfiguration,distanceinterval_x500mi,serviceclass,routeid_2
0,2019_6-LH-IAD-MUC,98368.6,6791.7,0.0,293.0,274.7,6,359,1,9,F,6-LH-IAD-MUC
1,2019_6-LH-IAH-FRA,138113.8,13347.633333,0.0,509.0,483.733333,8,882,1,11,F,6-LH-IAH-FRA
2,2019_6-LH-IAH-FRA,187000.0,113393.0,0.0,0.0,0.0,7,740,2,11,G,6-LH-IAH-FRA
3,2019_6-LH-JAX-ATL,187000.0,164936.0,0.0,0.0,0.0,7,740,2,1,G,6-LH-JAX-ATL
4,2019_6-LH-JFK-FRA,107934.0,8802.0,0.0,371.0,354.0,8,819,1,8,F,6-LH-JFK-FRA


In [85]:
#Lets see the duplicates
print("Total rows:", passengers['routeid_2'].count())
print("Unique values:", passengers['routeid_2'].nunique())
print("Duplicate values:", passengers['routeid_2'].duplicated().sum())

Total rows: 2334701
Unique values: 612051
Duplicate values: 1722650


In [None]:
So each rows have about 2-3x values.. 

In [86]:
# So now we need to group by routeid and avg the values for averagepayload_lbs, averagefreight_lbs, averagemail_lbs, availableseats, averagepassengers. 
# We'll also keep the most recent value (year 2019) for aircraftgroup, aircrafttype, aircraftconfiguration, distanceinterval_x500mi, serviceclass
# I'll be honest I don't know how to deal with the binary columns to do them in one shot So I'll filter the passengers table to only include 2019 values and then group each df by routeid and then merge them back together
print("Initial size passengers:", passengers.shape)

# Separating the binary columns and grouping them
passengers_2019 = passengers[passengers['routeid'].str.startswith('2019')]
passengers_2019 = passengers_2019.drop(['averagepayload_lbs', 'averagefreight_lbs', 'averagemail_lbs', 'availableseats', 'averagepassengers'], axis=1)
print("Initial size passengers_2019:", passengers_2019.shape)
passengers_2019 = passengers_2019.groupby('routeid_2').agg({'routeid' : 'last', 'aircraftgroup': 'last', 'aircrafttype': 'last', 'aircraftconfiguration': 'last', 'distanceinterval_x500mi': 'last', 'serviceclass': 'last'}).reset_index()
passengers_2019 = passengers_2019.drop_duplicates(subset=['routeid_2'], keep='last') #drop duplicate rows

print("Final size passengers_2019:", passengers_2019.shape) #just to se if we actually did anything

# Delete the binary columns from the passengers table
passengers = passengers.drop(['aircraftgroup', 'aircrafttype', 'aircraftconfiguration', 'distanceinterval_x500mi', 'serviceclass'], axis=1)

# Merge the route_Id and average the other values
passengers.groupby('routeid_2').agg({'routeid' : 'last', 'averagepayload_lbs': 'mean', 'averagefreight_lbs': 'mean', 'averagemail_lbs': 'mean', 'availableseats': 'mean', 'averagepassengers': 'mean'}).reset_index()
passengers = passengers.drop_duplicates(subset=['routeid_2'], keep='last') #drop duplicate rows

# Add the binary columns back to the passengers table
passengers = passengers.merge(passengers_2019, on='routeid_2', how='left')
print("Final size passengers:", passengers.shape)

Initial size passengers: (2334701, 12)
Initial size passengers_2019: (485684, 7)
Final size passengers_2019: (304430, 7)
Final size passengers: (612051, 13)


We're matching our unique values! DOUBLE BAM!

In [87]:
# let's drop our original route_Ids column and rename the routeid_2 column to routeid
passengers = passengers.drop(['routeid_x', 'routeid_y'], axis=1)
passengers = passengers.rename(columns={'routeid_2': 'routeid'})

In [89]:
#Reorder columns
passengers = passengers[['routeid', 'averagepayload_lbs', 'averagefreight_lbs', 'averagemail_lbs', 'availableseats', 'averagepassengers', 'aircraftgroup', 'aircrafttype', 'aircraftconfiguration', 'distanceinterval_x500mi', 'serviceclass']]

In [91]:
passengers.to_csv("../../data/raw/passengers_w_departuresPerformed_groupedbyMonth(29Nov).csv")

# Keep only relevant rows (Take 2 - The good one)

The passengers table is huge, it likely has information on routes that we don't need. Let's slim it down

In [97]:
passengers.shape

(612051, 12)

In [98]:
###Create the route_ID column for flights_sample
flights_sample['routeId_op'] = flights_sample['Flight Month'].astype(str) + '-' + flights_sample['Operator - Unique Carrier Code'] + '-' + flights_sample['Origin Airport (IATA Code)'] + '-' + flights_sample['Destination Airport (IATA Code)']
flights_sample['routeId_mkt'] = flights_sample['Flight Month'].astype(str) + '-' + flights_sample['Marketer - Unique Carrier Code'] + '-' + flights_sample['Origin Airport (IATA Code)'] + '-' + flights_sample['Destination Airport (IATA Code)']

Made 2x version, as not sure if the passengers' table is referring to the Operator or the Marketer.. we'll test both

In [100]:
# List the number of unique flights_sample['routeId_op'] values that are in the passengers['routeid'] column using the isin() method, ensuring the values are stored in a df, not a numpy arrai
flight_op = pd.DataFrame()
flight_mkt = pd.DataFrame()
unique_passengers = pd.DataFrame() 

flight_op['routeId_op'] = flights_sample['routeId_op'].unique()
flight_mkt['routeId_mkt'] = flights_sample['routeId_mkt'].unique()
unique_passengers['routeid'] = passengers['routeid'].unique()

# Count the number of unique flights_sample['routeId_op'] values that are in the passengers['routeid'] column using the isin() method, ensuring the values are stored in a df, not a numpy array
print("Flight Operator")
print("Total:", len(flight_op))
print("In passenger:", flight_op['routeId_op'].isin(unique_passengers['routeid']).sum())
print()
print("Flight Marketer")
print("Total:", len(flight_mkt))
print("In passenger:", flight_mkt['routeId_mkt'].isin(unique_passengers['routeid']).sum())
print()
print("Passengers Table") #Double tab but just to be sure..
print("Total:", len(unique_passengers))
print("In Marketer:", unique_passengers['routeid'].isin(flight_mkt['routeId_mkt']).sum())
print("In Operator:", unique_passengers['routeid'].isin(flight_op['routeId_op']).sum())

Flight Operator
Total: 81962
In passenger: 81961

Flight Marketer
Total: 69366
In passenger: 53140

Passengers Table
Total: 612051
In Marketer: 53140
In Operator: 81961


Excellent, we're 100% for Flight Operator. We'll use this one

In [101]:
flights_sample = flights_sample.drop(['routeId_mkt'], axis=1)
flights_sample = flights_sample.rename(columns={'routeId_op': 'routeId'})

NEXT STEP:

Add the passenger enrichment to the rows in flights_sample

# Clean

LEAVE FORMATISSUES TO FALSE. There is over 3 million lines, this will lead to problems...

In [12]:
#dataCleaning(passengers, formatIssues=False)

It's too much to handle.. Before it crashed noticed a significant amount of outliers in the payload/freight/mail columns.. will need to further investigate.

In [5]:
passengers.info()
#passengers.head(25)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2334701 entries, 0 to 2334700
Data columns (total 11 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   routeid                  object 
 1   averagepayload_lbs       float64
 2   averagefreight_lbs       float64
 3   averagemail_lbs          float64
 4   availableseats           float64
 5   averagepassengers        float64
 6   aircraftgroup            int64  
 7   aircrafttype             int64  
 8   aircraftconfiguration    int64  
 9   distanceinterval_x500mi  int64  
 10  serviceclass             object 
dtypes: float64(5), int64(4), object(2)
memory usage: 195.9+ MB


Data Types appear GTG

In [14]:
passengers.describe()

Unnamed: 0,averagepayload_lbs,averagefreight_lbs,averagemail_lbs,availableseats,averagepassengers,aircraftgroup,aircrafttype,aircraftconfiguration,distanceinterval_x500mi
count,2334701.0,2334701.0,2334701.0,2334701.0,2334701.0,2334701.0,2334701.0,2334701.0,2334701.0
mean,42557.5,7414.158,160.2968,101.6929,78.64157,5.508221,605.4888,1.213319,2.634128
std,51021.8,26907.91,1481.311,82.6326,69.90261,1.540163,166.6219,0.5267736,2.477123
min,0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.0,1.0
25%,13500.0,0.0,0.0,9.0,4.0,6.0,614.0,1.0,1.0
50%,34446.0,0.0,0.0,77.8,65.11111,6.0,631.0,1.0,2.0
75%,43400.0,203.6279,0.0,160.0,132.4348,6.0,691.0,1.0,3.0
max,550000.0,461447.8,242547.0,7369.0,529.0,8.0,890.0,4.0,21.0


In [6]:
passengers.shape

(2334701, 11)

# Export to CSV

In [7]:
#passengers.to_csv('../../data/processed/flights_enrichment_passengers.csv',index=False)