Author
======

Ali Yasin Akalin

20492

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

## Custom Functions

In [None]:
def init():
    """Define the module-level path globals used by the file helpers.

    Two globals are (re)assigned on every call:

    * ``DIR_SRC``  -- the ``src`` directory located under the parent of the
      current working directory, i.e. ``Path.cwd().parent / "src"``.
    * ``DIR_DATA`` -- the ``data`` directory inside ``DIR_SRC``.

    The paths are computed from the working directory at call time; the
    working directory itself is not changed and nothing is created on disk.
    """
    from pathlib import Path

    global DIR_SRC, DIR_DATA

    DIR_SRC = Path.cwd().parent / "src"
    DIR_DATA = DIR_SRC / 'data'

### Basic Functions
Open a data file, save a data frame as CSV, and print reports.

In [None]:
def openDataFile(file_name):
    """Load a CSV file from the project data directory.

    Args:
        file_name: Name of the file, relative to ``DIR_DATA``.

    Returns:
        The data frame produced by ``pd.read_csv`` for that file.
    """
    init()  # (re)define DIR_DATA before it is used
    csv_path = DIR_DATA / file_name
    return pd.read_csv(csv_path)


def saveDataFrame(data_frame, file_name, verbose=False):
    """Write a data frame as CSV into the project data directory.

    Args:
        data_frame: Object whose ``to_csv`` method is called.
        file_name: Name of the output file, relative to ``DIR_DATA``.
        verbose: When true, print the directory the file was saved into.
    """
    init()  # (re)define DIR_DATA before it is used
    target_path = DIR_DATA / file_name
    data_frame.to_csv(target_path)
    if not verbose:
        return
    print("File saved into", str(DIR_DATA))


def printInfo(info, description='', info_name=''):
    """Print a separator line, an optional description, and a value.

    Args:
        info: Value to print after the separator (and the description).
        description: Text printed before ``info``; skipped when falsy.
        info_name: Accepted but not used by this function.
    """
    report = ["============================================"]
    if description:
        report.append(description)
    report.append(info)
    # One print call per entry, in order: separator, description, value.
    for entry in report:
        print(entry)

In [None]:
def showInfo(data_frame):
    """Print an overview of a data frame.

    Four reports are printed through ``printInfo``, in this order: the
    shape, the result of ``head()``, the result of ``describe()`` and the
    column dtypes.

    Args:
        data_frame: Data frame to summarise.
    """
    printInfo(data_frame.shape, "Shape of the data (rows, columns)")
    # The label previously ended with an unmatched ")".
    printInfo(data_frame.head(), "First 5 rows of the data")
    printInfo(data_frame.describe(), "Statistical analysis")
    printInfo(data_frame.dtypes, "Data type of columns")


def addColumn(data_frame, column_name, default_value=np.nan):
    """Assign a single value to a whole column of the data frame, in place.

    Args:
        data_frame: Data frame that receives the column.
        column_name: Label of the column to assign.
        default_value: Value assigned to the column; NaN by default.
    """
    data_frame[column_name] = default_value


def dealNanValues(data_frame):
    """Print the number of null values found in each column.

    This only reports the counts through ``printInfo``; the data frame is
    not modified.

    Args:
        data_frame: Data frame to inspect.
    """
    null_counts = data_frame.isnull().sum()
    printInfo(null_counts)

### Create coordinate tuples
Example: `(latitude, longitude)`

#### Reverse Geocoder API

In [None]:
import reverse_geocoder as rg


def makeCoordinateTuple(latitute, longitude):
    """Return a ``(latitude, longitude)`` tuple, converting strings to floats.

    If either value is a string, both values are passed through ``float``;
    otherwise they are returned unchanged.

    Args:
        latitute: Latitude as a number or a numeric string.
        longitude: Longitude as a number or a numeric string.

    Returns:
        A 2-tuple ``(latitude, longitude)``.

    Raises:
        ValueError: If a string value cannot be parsed by ``float``.
    """
    # The original condition tested the latitude twice, so a string longitude
    # paired with a numeric latitude was never converted.
    if isinstance(latitute, str) or isinstance(longitude, str):
        latitute = float(latitute)
        longitude = float(longitude)
    return (latitute, longitude)


def getDistrictResult(*args):
    """Run a reverse-geocoder search for the given coordinates.

    All positional arguments are forwarded together, as one tuple, to
    ``rg.search`` and its result is returned unchanged.
    """
    coordinates = args
    return rg.search(coordinates)


def getDistrictName(result):
    """Collect the ``'name'`` entry of every item in a search result.

    Args:
        result: Iterable of mappings that each contain a ``'name'`` key.

    Returns:
        A list with one name per item, in the same order.
    """
    return [entry['name'] for entry in result]



In [None]:
def getCoords(lat_col, lng_col):
    """Pair up latitudes and longitudes into a list of coordinate tuples.

    The two iterables are walked in lockstep with ``zip`` (so the result is
    as long as the shorter one) and every pair is passed through
    ``makeCoordinateTuple``.

    Args:
        lat_col: Iterable of latitudes.
        lng_col: Iterable of longitudes.

    Returns:
        A list of coordinate tuples.
    """
    return [
        makeCoordinateTuple(latitude, longitude)
        for latitude, longitude in zip(lat_col, lng_col)
    ]



### Get districts and fill them into the data frame

In [None]:
def getDistricts(coordinate_list, verbose=False):
    """Look up a district name for every coordinate.

    Each coordinate is looked up on its own with ``getDistrictResult`` and
    the first name returned by ``getDistrictName`` is kept.

    Args:
        coordinate_list: Sized iterable of coordinates.
        verbose: When true, print the amount of work up front, a progress
            report after every 500th coordinate, and a final count.

    Returns:
        A list of district names, one per coordinate, in input order.
    """
    if verbose:
        print("There are", len(coordinate_list), "coordinates to work on.")

    districts = []
    # ``done`` always equals len(districts) after the append below, so the
    # separate counter and its always-true comparison are not needed.
    for done, coordinate in enumerate(coordinate_list, start=1):
        result = getDistrictResult(coordinate)
        districts.append(getDistrictName(result)[0])
        if verbose and done % 500 == 0:
            print('500 more rows are done.')
            print("In total:", done)

    if verbose:
        print(len(districts), "districts are taken")
    return districts


def fillDistrict(data_frame, district_col, districts_list, verbose=False):
    """Store a list of districts in a column of the data frame, in place.

    Args:
        data_frame: Data frame to update.
        district_col: Label of the column that receives the values.
        districts_list: Values assigned to the column.
        verbose: When true, print how many districts were written.

    Returns:
        The same ``data_frame`` object that was passed in.
    """
    data_frame[district_col] = districts_list
    if verbose:
        filled_count = len(districts_list)
        print(filled_count, "districts are filled")
    return data_frame


### Print popular districts

In [None]:
def printPopularDistrict(data_frame, district_column_name, rows=5):
    """Print the most frequent values of a district column.

    Args:
        data_frame: Data frame holding the district column.
        district_column_name: Label of the column to count.
        rows: How many of the top entries of ``value_counts()`` to print.
    """
    district_counts = data_frame[district_column_name].value_counts()
    top_counts = district_counts.head(rows)
    title = 'Most popular ' + str(rows) + ' districts.'
    printInfo(top_counts, title)



### Get distances and fill them into the data frame

In [None]:
import geopy.distance as dist


def measureDist(*args, method='geodesic'):
    """Return the distance in kilometres between two coordinates.

    Args:
        *args: At least two coordinates; only the first two are used.
        method: Which ``geopy.distance`` calculation to use. Accepted
            spellings are ``'geodesic'``, ``'Geodesic'``, ``'GEODESIC'``,
            ``'Geo-desic'`` for the geodesic distance and ``'great_circle'``,
            ``'Great Circle'``, ``'Great circle'``, ``'GREAT_CIRCLE'`` for
            the great-circle distance.

    Returns:
        The distance as a ``float`` number of kilometres.

    Raises:
        ValueError: If fewer than two coordinates are given, or if
            ``method`` is not one of the accepted spellings.
    """
    great_circle_names = ('great_circle', 'Great Circle', 'Great circle',
                          'GREAT_CIRCLE')
    geodesic_names = ('geodesic', 'Geodesic', 'GEODESIC', 'Geo-desic')

    if len(args) < 2:
        raise ValueError("measureDist needs two coordinates, got "
                         + str(len(args)))

    use_geodesic = method in geodesic_names
    # An unknown method used to fall through and return None silently, which
    # only surfaced later as missing distances; fail at the source instead.
    if not use_geodesic and method not in great_circle_names:
        raise ValueError("Unknown distance method: " + repr(method))

    coord1, coord2 = args[:2]
    if use_geodesic:
        return float(dist.geodesic(coord1, coord2).kilometers)
    return float(dist.great_circle(coord1, coord2).kilometers)


def getDistances(coordinate_list_1, coordinate_list_2, verbose=False):
    """Measure the distance between corresponding coordinates of two lists.

    The two iterables are walked in lockstep with ``zip`` and each pair is
    passed to ``measureDist`` with its default method.

    Args:
        coordinate_list_1: Iterable of start coordinates.
        coordinate_list_2: Iterable of end coordinates.
        verbose: Accepted but not used by this function.

    Returns:
        A list with one distance per coordinate pair.
    """
    return [
        measureDist(start, end)
        for start, end in zip(coordinate_list_1, coordinate_list_2)
    ]


def fillDistances(data_frame, distance_col, distance_list, verbose=False):
    """Store a list of distances in a column of the data frame, in place.

    Args:
        data_frame: Data frame to update.
        distance_col: Label of the column that receives the values.
        distance_list: Values assigned to the column.
        verbose: When true, print a confirmation message.

    Returns:
        The same ``data_frame`` object that was passed in.
    """
    data_frame[distance_col] = distance_list
    if not verbose:
        return data_frame
    print("Distances are filled")
    return data_frame



### Day-Time Operations

In [None]:
def getDayTime(pickup_times):
    """Label every pickup timestamp with its time-of-day category.

    Each timestamp is split on whitespace; its second token is taken as the
    clock time and the part before the first ``:`` as the hour. The hour is
    mapped to one of: ``'late night'`` (23 and 0-6), ``'rush hour morning'``
    (7-8), ``'afternoon'`` (9-15), ``'rush hour evening'`` (16-17) or
    ``'evening'`` (18-22).

    Args:
        pickup_times: Iterable of timestamp strings.

    Returns:
        A list of category labels, one per timestamp, in input order.

    Raises:
        KeyError: If a parsed hour is outside 0-23.
    """
    # (first hour, last hour, label) -- both bounds are inclusive.
    periods = (
        (0, 6, 'late night'),
        (7, 8, 'rush hour morning'),
        (9, 15, 'afternoon'),
        (16, 17, 'rush hour evening'),
        (18, 22, 'evening'),
        (23, 23, 'late night'),
    )
    label_by_hour = {}
    for first_hour, last_hour, label in periods:
        for hour in range(first_hour, last_hour + 1):
            label_by_hour[hour] = label

    labels = []
    for stamp in pickup_times:
        clock = stamp.split()[1]
        hour = int(clock.partition(':')[0])
        labels.append(label_by_hour[hour])
    return labels

In [None]:
# Seaborn parameters: select the "darkgrid" plot style
sns.set(style="darkgrid")

# Matplotlib settings: IPython magic that shows figures inline in the notebook
%matplotlib inline

## Execution

In [None]:
# Set up the data directory globals and load the raw trip records.
init()
df = openDataFile("taxi-trips.csv")

showInfo(df)

# Create the district columns (filled with NaN) before they are populated.
addColumn(df, "pickup_district")
addColumn(df, "dropoff_district")

# Build (latitude, longitude) tuples for both ends of every trip.
pickup_coords = getCoords(df.pickup_latitude, df.pickup_longitude)
drop_coords = getCoords(df.dropoff_latitude, df.dropoff_longitude)

# Reverse-geocode the pickup points and store the district names.
dl = getDistricts(pickup_coords, verbose=True)
fillDistrict(df, "pickup_district", dl)

# Same for the drop-off points.
dl = getDistricts(drop_coords, verbose=True)
fillDistrict(df, "dropoff_district", dl)

# Optional: save the updated data frame (disabled).
# saveDataFrame(df, 'taxi-trips_2.csv')

for d in ['pickup_district', 'dropoff_district']:
    printPopularDistrict(df, d)

# Distance between the pickup and drop-off point of every trip.
dl = getDistances(pickup_coords, drop_coords)
df['distance'] = dl

# Time-of-day category derived from the pickup timestamp.
times = getDayTime(df.pickup_datetime)
df['time_of_day'] = times