Task 2

In [1]:
from pyspark import SparkContext, SparkConf
from functools import reduce
from collections import Counter
import pandas as pd

def load_dataset(filename, columnNames):
    """Read a CSV file into a pandas DataFrame.

    Args:
        filename: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        columnNames: Column labels to assign; forwarded as ``names``. With
            ``names`` given and no ``header`` argument, pandas treats the
            first line of the file as data rather than as a header row.

    Returns:
        The DataFrame produced by ``pd.read_csv``; the file is decoded as
        latin1.
    """
    read_options = {'encoding': 'latin1', 'names': columnNames}
    return pd.read_csv(filename, **read_options)

def get_instances(data):
    """Count the whitespace-separated tokens in a string.

    Args:
        data: A string; it is split on runs of whitespace.

    Returns:
        A ``Counter`` mapping each token to the number of times it occurs
        (an empty ``Counter`` when ``data`` holds no tokens).
    """
    return Counter(data.split())

def ReduceCounter(counter1, counter2):
    """Merge two Counters by adding the counts of ``counter2`` into ``counter1``.

    The merge is performed in place: ``counter1`` itself is updated and is the
    object that gets returned, while ``counter2`` is left untouched.

    Args:
        counter1: Counter that accumulates the totals (mutated).
        counter2: Counter whose counts are added on top.

    Returns:
        ``counter1`` after the update.
    """
    merged = counter1  # alias, not a copy: the accumulation is in place
    merged.update(counter2)
    return merged

# Spark configuration: application name 'MapReduce', master 'local'.
conf = SparkConf()
conf.setAppName('MapReduce')
conf.setMaster('local')
sparkContext = SparkContext.getOrCreate(conf)

# Column labels assigned to the CSV when it is loaded.
columns = [
    'passengerID',
    'flightID',
    'originAirport',
    'destinationAirport',
    'departureTime',
    'flightTime',
]

# NOTE: despite its name, `rdd` is the pandas DataFrame returned by load_dataset.
rdd = load_dataset(filename="AComp_Passenger_data_no_error.csv", columnNames=columns)

# Hand the origin-airport column to Spark, split into 10 partitions.
Origins = rdd['originAirport'].tolist()
distributed_data_origins = sparkContext.parallelize(Origins, numSlices=10)

# Map every value to a Counter, then merge all Counters into one total.
dist_data_flight_origins = distributed_data_origins.map(get_instances)
dist_data_flight_origins_count = dist_data_flight_origins.reduce(ReduceCounter)
print(dist_data_flight_origins_count)


24/12/03 16:33:23 WARN Utils: Your hostname, codespaces-8bb66a resolves to a loopback address: 127.0.0.1; using 10.0.11.17 instead (on interface eth0)
24/12/03 16:33:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/03 16:33:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable

Counter({'DEN': 46, 'CAN': 37, 'IAH': 37, 'ATL': 36, 'ORD': 33, 'KUL': 33, 'CGK': 27, 'JFK': 25, 'LHR': 25, 'CDG': 21, 'CLT': 21, 'PVG': 20, 'LAS': 17, 'BKK': 17, 'AMS': 15, 'FCO': 15, 'MUC': 14, 'MAD': 13, 'PEK': 13, 'HND': 13, 'DFW': 11, 'MIA': 11})


                                                                                

In [19]:
# Fields to report for each flight: flight ID, passenger count, departure airport code, departure time, arrival airport code, arrival time
from pytz import timezone
from datetime import datetime, timedelta

def GetFlightInfo(acc, record):
    """Reducer step that keeps the first record seen for every flight ID.

    Args:
        acc: Dict mapping flight ID -> record; updated in place.
        record: Mapping-like row that has a 'flightID' entry.

    Returns:
        ``acc`` itself. ``record`` is stored under its flight ID only when
        that ID is not already a key, so later records for the same flight
        are ignored.
    """
    # setdefault inserts only when the key is absent, so the first record wins.
    acc.setdefault(record['flightID'], record)
    return acc

# Lookup table: three-letter airport code -> time-zone name string.
# CalculateTimes passes these names to pytz's timezone() for both the origin
# and the destination airport of a flight, so an airport code that has no
# entry here makes that lookup fail with a KeyError.
airportTimezones = {
    "ATL": "America/New_York",
    "PEK": "Asia/Shanghai",
    "LHR": "Europe/London",
    "ORD": "America/Chicago",
    "HND": "Asia/Tokyo",
    "LAX": "America/Los_Angeles",
    "CDG": "Europe/Paris",
    "DFW": "America/Chicago",
    "FRA": "Europe/Berlin",
    "HKG": "Asia/Hong_Kong",
    "DEN": "America/Denver",
    "DXB": "Asia/Dubai",
    "CGK": "Asia/Jakarta",
    "AMS": "Europe/Amsterdam",
    "MAD": "Europe/Madrid",
    "BKK": "Asia/Bangkok",
    "JFK": "America/New_York",
    "SIN": "Asia/Singapore",
    "CAN": "Asia/Shanghai",
    "LAS": "America/Los_Angeles",
    "PVG": "Asia/Shanghai",
    "SFO": "America/Los_Angeles",
    "PHX": "America/Phoenix",
    "IAH": "America/Chicago",
    "CLT": "America/New_York",
    "MIA": "America/New_York",
    "MUC": "Europe/Berlin",
    "KUL": "Asia/Kuala_Lumpur",
    "FCO": "Europe/Rome",
    "IST": "Europe/Istanbul"
}

def CalculateTimes(acc, record):
    """Reducer step that appends one flight's local departure/arrival times.

    Args:
        acc: List collecting the results; appended to in place.
        record: Five-item sequence unpacked as (flightID, originAirport,
            departureTime, destinationAirport, flightTime). departureTime is
            handed to ``datetime.fromtimestamp`` and flightTime to
            ``timedelta(minutes=...)``.

    Returns:
        ``acc`` itself, with a new dict holding 'flightID', 'departureTime'
        and 'arrivalTime'. Both times are 'HH:MM' strings (no date part):
        the departure in the origin airport's zone, the arrival in the
        destination airport's zone.

    Raises:
        KeyError: if either airport code is missing from ``airportTimezones``.
    """
    flight, origin, departure_stamp, destination, duration_minutes = record

    # Departure as an aware datetime in the ORIGIN airport's time zone.
    departure_local = datetime.fromtimestamp(
        departure_stamp, timezone(airportTimezones[origin])
    )

    # Add the flight duration, then express that moment in the DESTINATION zone.
    landing = departure_local + timedelta(minutes=duration_minutes)
    arrival_local = landing.astimezone(timezone(airportTimezones[destination]))

    clock_format = '%H:%M'
    acc.append({
        "flightID": flight,
        "departureTime": departure_local.strftime(clock_format),
        "arrivalTime": arrival_local.strftime(clock_format)
    })
    return acc

# --- Rows per flight ID, counted with a Spark map/reduce --------------------
Flights = rdd['flightID'].tolist()
distributed_data_flights = sparkContext.parallelize(Flights, numSlices=10)

# map: one Counter per row; reduce: merge them into a single Counter of totals
dist_data_flight_id = distributed_data_flights.map(get_instances)
dist_data_flight_id_count = dist_data_flight_id.reduce(ReduceCounter)

dist_data_flight_id_list = list(dist_data_flight_id_count)

# --- Details of each flight, taken from the first row seen for that flight ---
FirstFlightInfo = reduce(GetFlightInfo, (row for _, row in rdd.iterrows()), {})
_first_rows = list(FirstFlightInfo.values())
flightIDs = [row['flightID'] for row in _first_rows]
originAirports = [row['originAirport'] for row in _first_rows]
departureTimes = [row['departureTime'] for row in _first_rows]
destinationAirports = [row['destinationAirport'] for row in _first_rows]
flightTimes = [row['flightTime'] for row in _first_rows]
FlightInfo = pd.DataFrame({
    'flightID': flightIDs,
    'originAirport': originAirports,
    'departureTime': departureTimes,
    'destinationAirport': destinationAirports,
    'flightTime': flightTimes
})

# --- Flight ID together with its count from the reduce above ---------------
FlightData = pd.DataFrame(
    list(dist_data_flight_id_count.items()),
    columns=['flightID', 'passengerCount'],
)

df = FlightInfo.merge(FlightData, on='flightID', how='inner')

# --- Local departure / arrival clock times ----------------------------------
# itertuples gives plain tuples that CalculateTimes can unpack; the result is a
# one-shot iterator, consumed by the reduce below.
_time_fields = ['flightID', 'originAirport', 'departureTime', 'destinationAirport', 'flightTime']
FlightInfoTuples = FlightInfo[_time_fields].itertuples(index=False)

CalcTime = reduce(CalculateTimes, FlightInfoTuples, [])
CalcTimeFI = [entry['flightID'] for entry in CalcTime]
CalcTimeDT = [entry['departureTime'] for entry in CalcTime]
CalcTimeAT = [entry['arrivalTime'] for entry in CalcTime]

CalcArrTime = pd.DataFrame({
    'flightID': CalcTimeFI,
    'departureTime': CalcTimeDT,
    'arrivalTime': CalcTimeAT
})

# Both frames carry 'departureTime', so the merge suffixes them. The formatted
# value ('_new') then replaces the raw one ('_orig') in its original column
# position and the column gets its plain name back.
df = df.merge(CalcArrTime, on='flightID', how='inner', suffixes=('_orig', '_new'))
df = (
    df.assign(departureTime_orig=df['departureTime_new'])
      .drop(columns=['departureTime_new'])
      .rename(columns={"departureTime_orig": "departureTime"})
)


                                                                                