Requirements

1. Input the data
2. Split the Flight Details field to form: Date, Flight Number, From, To, Class, Price
3. Convert the following data fields to the correct data types: Date to a date format, Price to a decimal value
4. Change the Flow Card field to Yes / No values instead of 1 / 0
5. Create two tables, one for Flow Card holders and one for non-Flow Card holders
6. Output the data sets

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('Preppin Data Inputs/PD 2024 Wk 1 Input.csv')

In [4]:
df

Unnamed: 0,Flight Details,Flow Card?,Bags Checked,Meal Type
0,2024-07-22//PA010//Tokyo-New York//Economy//2380,1,0,Egg Free
1,2024-09-28//PA008//Perth-New York//Economy//1855,0,2,Vegetarian
2,2024-04-20//PA002//New York-London//Economy//3490,1,1,Vegan
3,2024-01-23//PA010//Tokyo-New York//Premium Eco...,1,1,Vegetarian
4,2024-10-01//PA008//Perth-New York//Business Cl...,0,0,Vegetarian
...,...,...,...,...
3773,2024-05-05//PA009//New York-Tokyo//Economy//1360,0,3,Nut Free
3774,2024-06-14//PA008//Perth-New York//First Class...,0,1,Dairy Free
3775,2024-01-16//PA010//Tokyo-New York//Economy//2410,0,2,Egg Free
3776,2024-08-16//PA005//London-Tokyo//Premium Econo...,0,0,Nut Free


In [5]:
# 2. Splitting the flight details field to form other fields

df['Date'] = df['Flight Details'].str.split('//').str[0]
df['Flight Number'] = df['Flight Details'].str.split('//').str[1]

df['To & From'] = df['Flight Details'].str.split('//').str[2]
df['To'] = df['To & From'].str.split('-').str[0]
df['From'] = df['To & From'].str.split('-').str[1]

df['Class'] = df['Flight Details'].str.split('//').str[3]
df['Price'] = df['Flight Details'].str.split('//').str[4]

df = df.drop('Flight Details', axis=1)
df = df.drop('To & From', axis=1)

In [6]:
df

Unnamed: 0,Flow Card?,Bags Checked,Meal Type,Date,Flight Number,To,From,Class,Price
0,1,0,Egg Free,2024-07-22,PA010,Tokyo,New York,Economy,2380
1,0,2,Vegetarian,2024-09-28,PA008,Perth,New York,Economy,1855
2,1,1,Vegan,2024-04-20,PA002,New York,London,Economy,3490
3,1,1,Vegetarian,2024-01-23,PA010,Tokyo,New York,Premium Economy,825
4,0,0,Vegetarian,2024-10-01,PA008,Perth,New York,Business Class,634.79999999999995
...,...,...,...,...,...,...,...,...,...
3773,0,3,Nut Free,2024-05-05,PA009,New York,Tokyo,Economy,1360
3774,0,1,Dairy Free,2024-06-14,PA008,Perth,New York,First Class,245
3775,0,2,Egg Free,2024-01-16,PA010,Tokyo,New York,Economy,2410
3776,0,0,Nut Free,2024-08-16,PA005,London,Tokyo,Premium Economy,960


In [7]:
# 3. Convert the following data fields to the correct data types: Date to a date format, Price to a decimal value

df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
df['Price'] = df['Price'].astype(float)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3778 entries, 0 to 3777
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Flow Card?     3778 non-null   int64         
 1   Bags Checked   3778 non-null   int64         
 2   Meal Type      3189 non-null   object        
 3   Date           3778 non-null   datetime64[ns]
 4   Flight Number  3778 non-null   object        
 5   To             3778 non-null   object        
 6   From           3778 non-null   object        
 7   Class          3778 non-null   object        
 8   Price          3778 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(5)
memory usage: 265.8+ KB


In [9]:
# 4. Change the Flow Card field to Yes / No values instead of 1 / 0

df['Flow Card?'] = df['Flow Card?'].apply(lambda x: 'Yes' if x == 1 else 'No')

In [10]:
df

Unnamed: 0,Flow Card?,Bags Checked,Meal Type,Date,Flight Number,To,From,Class,Price
0,Yes,0,Egg Free,2024-07-22,PA010,Tokyo,New York,Economy,2380.0
1,No,2,Vegetarian,2024-09-28,PA008,Perth,New York,Economy,1855.0
2,Yes,1,Vegan,2024-04-20,PA002,New York,London,Economy,3490.0
3,Yes,1,Vegetarian,2024-01-23,PA010,Tokyo,New York,Premium Economy,825.0
4,No,0,Vegetarian,2024-10-01,PA008,Perth,New York,Business Class,634.8
...,...,...,...,...,...,...,...,...,...
3773,No,3,Nut Free,2024-05-05,PA009,New York,Tokyo,Economy,1360.0
3774,No,1,Dairy Free,2024-06-14,PA008,Perth,New York,First Class,245.0
3775,No,2,Egg Free,2024-01-16,PA010,Tokyo,New York,Economy,2410.0
3776,No,0,Nut Free,2024-08-16,PA005,London,Tokyo,Premium Economy,960.0


In [11]:
# Create two tables, one for Flow Card holders and one for non-Flow Card holders

table1 = df[df['Flow Card?'] == 'Yes']
table2 = df[df['Flow Card?'] == 'No']

In [12]:
table1

Unnamed: 0,Flow Card?,Bags Checked,Meal Type,Date,Flight Number,To,From,Class,Price
0,Yes,0,Egg Free,2024-07-22,PA010,Tokyo,New York,Economy,2380.0
2,Yes,1,Vegan,2024-04-20,PA002,New York,London,Economy,3490.0
3,Yes,1,Vegetarian,2024-01-23,PA010,Tokyo,New York,Premium Economy,825.0
6,Yes,3,Vegan,2024-06-05,PA006,Tokyo,London,First Class,618.0
8,Yes,1,Nut Free,2024-03-30,PA004,Perth,London,First Class,446.0
...,...,...,...,...,...,...,...,...,...
3764,Yes,2,Egg Free,2024-11-23,PA005,London,Tokyo,Economy,2070.0
3766,Yes,3,Nut Free,2024-11-04,PA003,London,Perth,First Class,210.0
3770,Yes,0,Dairy Free,2024-04-29,PA012,Tokyo,Perth,Economy,3490.0
3772,Yes,2,Vegetarian,2024-09-26,PA001,London,New York,First Class,207.0


In [13]:
table2

Unnamed: 0,Flow Card?,Bags Checked,Meal Type,Date,Flight Number,To,From,Class,Price
1,No,2,Vegetarian,2024-09-28,PA008,Perth,New York,Economy,1855.0
4,No,0,Vegetarian,2024-10-01,PA008,Perth,New York,Business Class,634.8
5,No,3,Nut Free,2024-03-04,PA007,New York,Perth,Business Class,458.4
7,No,0,,2024-02-25,PA010,Tokyo,New York,Premium Economy,1435.0
13,No,2,Vegan,2024-03-29,PA004,Perth,London,Economy,2730.0
...,...,...,...,...,...,...,...,...,...
3771,No,2,Vegetarian,2024-03-06,PA006,Tokyo,London,Premium Economy,940.0
3773,No,3,Nut Free,2024-05-05,PA009,New York,Tokyo,Economy,1360.0
3774,No,1,Dairy Free,2024-06-14,PA008,Perth,New York,First Class,245.0
3775,No,2,Egg Free,2024-01-16,PA010,Tokyo,New York,Economy,2410.0


In [14]:
# 6. Output the datasets

table1.to_csv('Preppin Data Outputs/pd2024wk1_output1.csv', index=False)
table2.to_csv('Preppin Data Outputs/pd2024wk1_output2.csv', index=False)