# Make sure you are using the latest version of this script. Check here if there have been any changes since your last download: github

# 1. In the file browser (left) navigate to the directory with the file you would like to convert
# 2. Right-click on the file and select "Copy Path"
# 3. Insert the path in the next cell (don't add the file extension .csv)
# 4. Click on "Run" and select "Run all cells". The transformed CSV will be saved in the same directory. If it crashes, message Daniel ;) 

In [29]:
# Path of the customer CSV to convert, WITHOUT the ".csv" extension.
# The extension is appended when the file is read, and the same path is
# reused as the prefix of the output file name.
filename = "C:/Users/Daniel.Busch/OneDrive - wwfgermany/Dokumente/data science/pilot/case_study/Sample Travel data"


# read in the libraries and file

In [30]:
# all of the libraries are pre-installed
# --> no need to pip-install anything
import pandas as pd
import numpy as np
import time
from datetime import datetime
import logging

# record the wall-clock start time so that the total run time of the
# script can be reported at the very end
start = time.time()

In [31]:
# read in the Plan A template for transport (semicolon-separated);
# further down, its column names define the layout of the output tables
df_template = pd.read_csv("C:/Users/Daniel.Busch/OneDrive - wwfgermany/Dokumente/data science/pilot/case_study/business_travel_template.csv", sep=";")

In [32]:
# read in the data from the customer; the ".csv" extension is appended to
# `filename` here and the two travel-date columns are parsed as datetimes
# NOTE(review): the default comma separator and default date parsing are
# used -- confirm both match the customer's export format
df_or = pd.read_csv(f"{filename}.csv",
                    parse_dates=['Start date', 'End date'])

# perform a mini exploratory data analysis

In [33]:

# Quick look at the customer data: table dimensions, column names and the
# number of rows per service type.
n_rows_and_cols = df_or.shape
column_names = list(df_or.columns)
rows_per_service = df_or.groupby(['Service'])['Service'].count()

print(f'csv size: {n_rows_and_cols}')
print(f"columns: {column_names}")
print(f"services: {rows_per_service}")

csv size: (55, 13)
columns: ['Booking description', 'Service', 'Departure airport code', 'Arrival airport code', 'Cabin class', 'Drop-off city', 'Pick-up city', 'End date', 'Start date', 'Distance', 'Itinerary Type', 'Departure Train Station', 'Arrival Train Station']
services: Service
FlexiPerk Trips Service     1
Flights                     6
Hotels                      9
Other Service              10
Premium Service            16
Refund for hotel            1
Trains                     12
Name: Service, dtype: int64


# convert return trips to two individual trips

In [34]:
# A return trip is stored as a single row. Each such row is duplicated as a
# mirrored copy (origin/destination and start/end swapped) so that the way
# back shows up as a trip of its own.
is_return = df_or['Itinerary Type'] == 'return'
returns = df_or.loc[is_return].copy()

# column pairs whose contents trade places on the way back
airport_codes = ['Departure airport code', 'Arrival airport code']
train_stations = ['Departure Train Station', 'Arrival Train Station']
travel_dates = ['End date', 'Start date']

# one rename mapping that swaps both members of every pair at once
swapped_names = {}
for first, second in (airport_codes, train_stations, travel_dates):
    swapped_names[first] = second
    swapped_names[second] = first
returns = returns.rename(columns=swapped_names)

# append the mirrored rows below the original ones; concat matches the
# columns by name, so the swapped labels land in the right columns
df_or = pd.concat([df_or, returns], ignore_index=True, axis=0)


# create a data frame with all flights

In [35]:
# FLIGHTS
# Build a table in the layout of the template that holds one row per flight.
# filter by flights
flights_or = df_or.loc[(df_or["Service"] == "Flights")]
# copy of the flight rows re-indexed 0..n-1 so that they line up with the
# rows of the new template-shaped frame
flights_src = flights_or.reset_index(drop=True)

# template column names that are filled below
flight_class_col = "Flight classes (Economy / Business / First / Unknown)"
origin_col = "Origin Location (Flight / Road / Rail)"
destination_col = "Destination Location (Flight / Road / Rail)"

# create a new data frame according to the template (all cells start as NaN)
flights = pd.DataFrame(
    np.nan, index=range(len(flights_src)), columns=df_template.columns)

# Whole columns are assigned with flights[col] = ... on purpose: the frame
# starts out as float NaN, and writing strings or dates into such a column
# through .loc[:, col] is an incompatible-dtype assignment that newer pandas
# versions warn about or reject.
# set the transport type
flights["Transport type"] = "Flight"
# add the departure date
flights["Departure date"] = flights_src["Start date"]

# add the cabin classes, renamed to the key words of the template and with
# missing values marked as "Unknown". The cleaned values are computed first
# and then assigned; calling replace/fillna with inplace=True on a
# flights.loc[...] selection acts on a temporary object and is not
# guaranteed to change `flights` itself.
# TODO! key words in client's data for Business and First unknown at this point
flights[flight_class_col] = (
    flights_src["Cabin class"]
    .replace({"economy": "Economy"})
    .fillna("Unknown")
)

# add airport code for departure
flights[origin_col] = flights_src["Departure airport code"]
# add airport code for destination
flights[destination_col] = flights_src["Arrival airport code"]

# test if all airport codes are available
if flights.loc[:, [origin_col, destination_col]].isna().any().any():
    logging.warning(' Airport codes incomplete')


# create a data frame with all train rides

In [36]:
# RAIL
# Build a table in the layout of the template that holds one row per train
# ride.
# filter by rail
rail_or = df_or.loc[(df_or["Service"] == "Trains")]
# copy of the train rows re-indexed 0..n-1 so that they line up with the
# rows of the new template-shaped frame
rail_src = rail_or.reset_index(drop=True)

# template column names that are filled below
rail_origin_col = "Origin Location (Flight / Road / Rail)"
rail_destination_col = "Destination Location (Flight / Road / Rail)"

# create a new data frame according to the template (all cells start as NaN)
rail = pd.DataFrame(
    np.nan, index=range(len(rail_src)), columns=df_template.columns)

# Whole columns are assigned with rail[col] = ... on purpose: the frame
# starts out as float NaN, and writing strings or dates into such a column
# through .loc[:, col] is an incompatible-dtype assignment that newer pandas
# versions warn about or reject.
# set the transport type
rail["Transport type"] = "Rail"
# add the departure date
rail["Departure date"] = rail_src["Start date"]
# add train station for departure
rail[rail_origin_col] = rail_src["Departure Train Station"]
# add train station for destination
rail[rail_destination_col] = rail_src["Arrival Train Station"]

# test if all train stations are available
if rail.loc[:, [rail_origin_col, rail_destination_col]].isna().any().any():
    logging.warning(' Train stations codes incomplete')


# concatenate the data frames for flights and rail

In [37]:
# %% stack the data frames for flights and rail
df = pd.concat([flights, rail], ignore_index=True, axis=0)
# set all passenger per trip to one
df['Number of employees'] = df['Number of employees'].fillna(1)
# adjust date format
# The .dt accessor only works on datetime-typed columns. When one of the two
# tables has no rows (or its dates were not parsed), the stacked column can
# end up with a different dtype, so it is normalised with to_datetime first;
# missing dates stay missing and are reported by the check below.
departure_dates = pd.to_datetime(df['Departure date'])
df['Departure date'] = departure_dates.dt.strftime('%Y-%m-%d')
# test if departure dates are complete
if df.loc[:, "Departure date"].isna().any():
    logging.warning(" Departure dates incomplete!")


# save the csv file in the same directory

In [38]:
# %% save csv
# create a unique name containing the current time and the original filename,
# so that the result is written next to the input file and repeated runs do
# not overwrite each other
now = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
filename_out = f"{filename}_converted_{now}.csv"#.replace(" ", "_")
# save the csv
# NOTE(review): to_csv is called with its defaults, so the file is
# comma-separated and contains the row index as an unnamed first column,
# whereas the template is read with sep=";" -- confirm that this is the
# format the upload expects
df.to_csv(filename_out)

In [39]:
# Report how long the script ran, measured from the `start` timestamp.
# The figure is only meaningful when all cells were run in one go.
end = time.time()
elapsed = end - start
print(f"This script took {elapsed} to run")

This script took 0.20262432098388672 to run
