# Data Challenge to find nearest Airport.

The goal is to build an efficient script that finds the closest airport to each user, based on the user's geolocation and the geolocation of every airport.

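Some important libraries: to calculate geographic distances we use the geopy library, whose default distance formula is the Vincenty distance (newer geopy releases default to the geodesic formula instead), with the great-circle distance as a fallback. For parallelization, `Pool` is imported from the `multiprocessing` library.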

In [15]:
#Import necessary modules

import gzip
import csv
import pandas as pd
from collections import defaultdict
import geopy, geopy.distance
from multiprocessing import Pool
import shutil
import os

In [47]:
## Load the airport locations into a hash keyed by IATA code:
## {iata_code: [latitude, longitude]}

data_location = defaultdict(list)
# Open the archive directly in the `with` statement so the handle is closed
# exactly once, even when a row fails to parse.
with gzip.open("data/optd-sample-20161201.csv.gz", "rt") as file_location:
    reader = csv.DictReader(file_location)
    for row in reader:
        iata_code = row['iata_code']
        # Keep only the first coordinate pair seen for a code. Appending every
        # occurrence would grow the list past two values, which breaks the
        # two-column DataFrame below and any consumer expecting [lat, lon].
        if iata_code in data_location:
            continue
        data_location[iata_code] = [float(row['latitude']), float(row['longitude'])]

## For visualising airport data
df = pd.DataFrame.from_dict(data_location, orient='index')
df.columns = ['Latitude', 'Longitude']
df.head()

Unnamed: 0,Latitude,Longitude
AER,43.449928,39.956589
SJM,18.83333,-71.23333
ACV,40.97164,-124.10709
TIE,7.333,35.58
LIL,50.57037,3.10643


In [57]:
## Load the user coordinates: one [uuid, latitude, longitude] record per user

with gzip.open("data/sample_data.csv.gz", "rt") as file_user:
    rows = csv.reader(file_user)
    next(rows)  # the first line is the column header, not a user
    user_list = list(rows)

## For visualising user
user_columns = ["User_uuid", "geoip_latitude", "geoip_longitude"]
user_df = pd.DataFrame(user_list, columns=user_columns)
user_df.head()

Unnamed: 0,User_uuid,geoip_latitude,geoip_longitude
0,DDEFEBEA-98ED-49EB-A4E7-9D7BFDB7AA0B,-37.83330154418945,145.0500030517578
1,DAEF2221-14BE-467B-894A-F101CDCC38E4,52.51670074462891,4.666699886322021
2,31971B3E-2F80-4F8D-86BA-1F2077DF36A2,35.68500137329102,139.7514038085938
3,1A29A45C-D560-43D8-ADAB-C2F0AD068FFE,44.84040069580078,-0.5805000066757202
4,A6EC281B-B8EC-465A-8933-F127472DB0A3,51.96329879760742,4.49970006942749


In [32]:
## Function for Geodistance calculations of User coordinates with each airport coordinate and return the nearest location to the user.

def geodistance_calculation(user):
    """Return the IATA code of the airport closest to one user.

    Args:
        user: Sequence whose first three items are the user's uuid, latitude
            and longitude. The coordinates are handed to ``geopy.Point``
            unchanged.

    Returns:
        Tuple ``(uuid, iata_code)`` naming the entry of the module-level
        ``data_location`` mapping with the smallest distance to the user.
        On a tie the entry that comes first in ``data_location`` wins.

    Raises:
        ValueError: If ``data_location`` is empty, because there is nothing
            to take the minimum of.
    """
    user_point = geopy.Point(user[1], user[2])

    # Keep each IATA code next to its point. Recovering the code afterwards by
    # searching the dict values for the point's coordinates costs a full scan
    # per user and fails as soon as the stored list and the Point disagree
    # (extra values in the list, or a coordinate altered by Point).
    airports = [
        (iata_code, geopy.Point(coor[0], coor[1]))
        for iata_code, coor in data_location.items()
    ]

    ## Calculating geodistance between user coordinate and all the airports coordinate
    try:
        distances = [
            (iata_code, geopy.distance.distance(point, user_point).km)
            for iata_code, point in airports
        ]
    except Exception:
        # Best-effort fallback: if the default formula fails for any airport,
        # recompute every distance with the great-circle formula so that all
        # candidates are compared using the same metric.
        distances = [
            (iata_code, geopy.distance.great_circle(point, user_point).km)
            for iata_code, point in airports
        ]

    # Single pass for the minimum; only the code is needed by the caller.
    nearest_code, _ = min(distances, key=lambda item: item[1])
    return (user[0], nearest_code)
    


In [33]:
# Main function
if __name__ == "__main__":
    # The header row was already skipped while loading, so the sample has to
    # start at index 0: slicing from 1 silently dropped the first user and
    # shrank the list further on every re-run of this cell.
    user_list = user_list[:10]  # Change size of user list to view the results

    ## Parallelise the lookups over 4 worker processes. No explicit chunksize
    ## is passed: a chunk of 10 would hand this whole sample to one worker,
    ## so Pool.map is left to split the work across the processes itself.
    with Pool(4) as pro:
        result_list = pro.map(geodistance_calculation, user_list)

result_df = pd.DataFrame.from_records(result_list,columns=["User_uuid", "IATA_code"])
result_df.head()

Unnamed: 0,User_uuid,IATA_code
0,DDEFEBEA-98ED-49EB-A4E7-9D7BFDB7AA0B,MBW
1,DAEF2221-14BE-467B-894A-F101CDCC38E4,AMS
2,31971B3E-2F80-4F8D-86BA-1F2077DF36A2,HND
3,1A29A45C-D560-43D8-ADAB-C2F0AD068FFE,BOD
4,A6EC281B-B8EC-465A-8933-F127472DB0A3,RTM


In [None]:
## Writing the list of uuid and its corresponding nearest iata_code to a gzip-compressed csv file

# Create the target directory up front; without it the export aborts.
os.makedirs('output', exist_ok=True)

# Write straight into the gzip stream in text mode instead of producing an
# uncompressed file, copying it and deleting it afterwards. newline='' stops
# the text layer from translating the '\n' line terminator chosen below.
with gzip.open('output/result_file.csv.gz', 'wt', newline='') as outcsv:
    writer = csv.writer(outcsv, delimiter=',', quoting=csv.QUOTE_ALL, lineterminator='\n') #configure writer to write standard csv file
    writer.writerow(['User_uuid', 'IATA_code'])
    # Only the first two fields of each result (uuid, IATA code) are exported.
    writer.writerows([value[0], value[1]] for value in result_list)