# MSiA 423 - Cloud Engineering for Data Science - Final Project
## Group 7: Alejandra Lelo de Larrea Ibarra, Bannasorn Paspanthong, Ruben Nakano, Samuel Swain
# Generate Raw data

In [1]:
# Libraries 
import numpy as np
import pandas as pd
import zipfile
import time
import re
from datetime import datetime

In [2]:
# --- Read csv files directly from the zip ---
# The archive is opened in a with-block so its file handle is released as soon
# as the three CSVs are loaded (previously the ZipFile was never closed).
with zipfile.ZipFile('../02_Data/archive.zip', 'r') as archive:
    files = archive.namelist()

    # --- Load data sets ---
    # NOTE(review): members are selected by their position in namelist(); this
    # assumes the archive lists the clean file first, then business, then
    # economy -- confirm against the actual archive contents.

    # Clean
    with archive.open(files[0]) as csvfile:
        df = pd.read_csv(csvfile)
        print("df (clean) data shape: " + str(df.shape))

    # Business class (tagged with a constant "class" column)
    with archive.open(files[1]) as csvfile:
        df_business = pd.read_csv(csvfile)
        df_business["class"] = "Business"
        print("Buisness data shape: " + str(df_business.shape))

    # Economy class (tagged with a constant "class" column)
    with archive.open(files[2]) as csvfile:
        df_economy = pd.read_csv(csvfile)
        df_economy["class"] = "Economy"
        print("Economy data shape: " + str(df_economy.shape))

# --- Bind business and economy class ---
# Row-wise concatenation: business rows first, then economy rows
df_raw = pd.concat([df_business, df_economy], axis = 0)
print("Raw data shape: " + str(df_raw.shape))


df (clean) data shape: (300153, 12)
Buisness data shape: (93487, 12)
Economy data shape: (206774, 12)
Raw data shape: (300261, 12)


In [3]:
# --- Show the column dtypes of both frames ---
# df is the cleaned data set; df_raw is the business + economy concatenation
print(f"df columns: \n\n{df.dtypes!s}")
print(f"\n\n\ndf_raw columns: \n\n{df_raw.dtypes!s}")

df columns: 

Unnamed: 0            int64
airline              object
flight               object
source_city          object
departure_time       object
stops                object
arrival_time         object
destination_city     object
class                object
duration            float64
days_left             int64
price                 int64
dtype: object



df_raw columns: 

date          object
airline       object
ch_code       object
num_code       int64
dep_time      object
from          object
time_taken    object
stop          object
arr_time      object
to            object
price         object
class         object
dtype: object


In [4]:
# --- Clean df to be compatible with df_raw ---

# Keep only the columns needed to merge with df_raw. The explicit .copy() makes
# this an independent frame, so adding columns below does not trigger
# SettingWithCopyWarning.
df = df[["airline", "flight", "source_city", "destination_city", "class", "days_left"]].copy()

# Split the flight identifier (e.g. "SG-8709") into the carrier code and the
# numeric flight code, matching the ch_code / num_code columns of df_raw
df[["ch_code", "num_code"]] = df["flight"].str.split("-", expand = True)
df["num_code"] = df["num_code"].astype(int)

# Drop flight (now redundant) and remove duplicated rows
df = df.drop(columns = "flight").drop_duplicates()

# Rename the city columns by name (rather than by position) to match df_raw
df = df.rename(columns = {"source_city": "from", "destination_city": "to"})
df.head()

Unnamed: 0,airline,from,to,class,days_left,ch_code,num_code
0,SpiceJet,Delhi,Mumbai,Economy,1,SG,8709
1,SpiceJet,Delhi,Mumbai,Economy,1,SG,8157
2,AirAsia,Delhi,Mumbai,Economy,1,I5,764
3,Vistara,Delhi,Mumbai,Economy,1,UK,995
4,Vistara,Delhi,Mumbai,Economy,1,UK,963


In [5]:
# --- Bring the "days_left" column from df into df_raw ---

# Left-join on the flight identifiers shared by both frames; rows of df_raw
# without a counterpart in df get NaN in "days_left".
# NOTE(review): the join keys do not include "days_left", so one df_raw row can
# match several df rows; the recorded outputs show 300,261 rows before the
# merge and 8,558,597 after the filter below -- confirm this is intended.
df_raw = df_raw.merge(
    df,
    how = 'left',
    on = ["airline", "ch_code", "num_code", "from", "to", "class"],
)

# Keep only the observations that found a match
df_raw = df_raw[df_raw["days_left"].notna()]

# Preview the result
df_raw.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,class,days_left
5,11-02-2022,Vistara,UK,985,19:50,Delhi,02h 10m,non-stop,22:00,Mumbai,50264,Business,1.0
6,11-02-2022,Vistara,UK,985,19:50,Delhi,02h 10m,non-stop,22:00,Mumbai,50264,Business,2.0
7,11-02-2022,Vistara,UK,985,19:50,Delhi,02h 10m,non-stop,22:00,Mumbai,50264,Business,3.0
8,11-02-2022,Vistara,UK,985,19:50,Delhi,02h 10m,non-stop,22:00,Mumbai,50264,Business,4.0
9,11-02-2022,Vistara,UK,985,19:50,Delhi,02h 10m,non-stop,22:00,Mumbai,50264,Business,5.0


In [6]:
# --- Dimensions (rows, columns) of df_raw after the merge and filter ---
df_raw.shape

(8558597, 13)

In [7]:
# --- Get date of flight ---

# NOTE(review): this cell treats "date" as the booking date and adds
# "days_left" to it to obtain the flight date -- confirm against the data source.
df_raw = (
    df_raw
    .assign(
        # Parse the day-month-year strings into datetimes
        date = lambda d: pd.to_datetime(d["date"], format='%d-%m-%Y'),
        # 'flight_date' is the (already parsed) 'date' plus 'days_left' days
        flight_date = lambda d: d["date"] + pd.to_timedelta(d["days_left"], unit='d'),
    )
    # Rename by label. Writing into df_raw.columns.values mutates the Index's
    # backing array in place, which pandas does not support and which can leave
    # label lookups out of sync with the displayed column names.
    .rename(columns = {"date": "booking_date"})
)

# View raw dataset
df_raw.head()

Unnamed: 0,booking_date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,class,days_left,flight_date
5,2022-02-11,Vistara,UK,985,19:50,Delhi,02h 10m,non-stop,22:00,Mumbai,50264,Business,1.0,2022-02-12
6,2022-02-11,Vistara,UK,985,19:50,Delhi,02h 10m,non-stop,22:00,Mumbai,50264,Business,2.0,2022-02-13
7,2022-02-11,Vistara,UK,985,19:50,Delhi,02h 10m,non-stop,22:00,Mumbai,50264,Business,3.0,2022-02-14
8,2022-02-11,Vistara,UK,985,19:50,Delhi,02h 10m,non-stop,22:00,Mumbai,50264,Business,4.0,2022-02-15
9,2022-02-11,Vistara,UK,985,19:50,Delhi,02h 10m,non-stop,22:00,Mumbai,50264,Business,5.0,2022-02-16


In [8]:
# --- Save df_raw into a CSV file ---
# index = False leaves the DataFrame's row index out of the written file
df_raw.to_csv("../02_Data/raw_data.csv", index = False)