In [3]:
import pandas as pd
from geopy import distance, geocoders
import numpy as np

###  Import du dataset

In [38]:
df = pd.read_csv("data/taxi_fare_extract.csv")
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-06-21 10:33:57.0000001,10.0,2015-06-21 10:33:57+00:00,-73.966209,40.761841,-73.991287,40.74501,1
1,2010-01-13 08:13:14.0000007,10.9,2010-01-13 08:13:14+00:00,-73.932603,40.763805,-73.932603,40.763805,1
2,2011-10-08 10:38:10.0000002,14.9,2011-10-08 10:38:10+00:00,-74.008679,40.711449,-73.990906,40.742073,1
3,2010-11-09 16:09:00.00000015,5.7,2010-11-09 16:09:00+00:00,-73.975663,40.791653,-73.982267,40.774968,1
4,2013-09-22 21:27:47.0000001,11.0,2013-09-22 21:27:47+00:00,-73.985324,40.744291,-73.993366,40.719451,1


### Ajout d'une colonne 'distance' qui calcule entre le drop_on et le drop_off

In [39]:
df['distance(km)'] = df.apply(lambda row:
                          distance.distance(
                              (row['pickup_latitude'], row['pickup_longitude']),
                              (row['dropoff_latitude'], row['dropoff_longitude'])).km,
                          axis = 1)

df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance(km)
0,2015-06-21 10:33:57.0000001,10.0,2015-06-21 10:33:57+00:00,-73.966209,40.761841,-73.991287,40.74501,1,2.82456
1,2010-01-13 08:13:14.0000007,10.9,2010-01-13 08:13:14+00:00,-73.932603,40.763805,-73.932603,40.763805,1,0.0
2,2011-10-08 10:38:10.0000002,14.9,2011-10-08 10:38:10+00:00,-74.008679,40.711449,-73.990906,40.742073,1,3.717473
3,2010-11-09 16:09:00.00000015,5.7,2010-11-09 16:09:00+00:00,-73.975663,40.791653,-73.982267,40.774968,1,1.934904
4,2013-09-22 21:27:47.0000001,11.0,2013-09-22 21:27:47+00:00,-73.985324,40.744291,-73.993366,40.719451,1,2.840874


### Ajout des infos DateTime

In [48]:
df['pickup_weekday'] = df['pickup_datetime'].dt.day_name()
df['pickup_month'] = df['pickup_datetime'].dt.month_name()

df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance(km),pickup_weekday,pickup_month
0,2015-06-21 10:33:57.0000001,10.0,2015-06-21 10:33:57+00:00,-73.966209,40.761841,-73.991287,40.74501,1,2.82456,Sunday,June
1,2010-01-13 08:13:14.0000007,10.9,2010-01-13 08:13:14+00:00,-73.932603,40.763805,-73.932603,40.763805,1,0.0,Wednesday,January
2,2011-10-08 10:38:10.0000002,14.9,2011-10-08 10:38:10+00:00,-74.008679,40.711449,-73.990906,40.742073,1,3.717473,Saturday,October
3,2010-11-09 16:09:00.00000015,5.7,2010-11-09 16:09:00+00:00,-73.975663,40.791653,-73.982267,40.774968,1,1.934904,Tuesday,November
4,2013-09-22 21:27:47.0000001,11.0,2013-09-22 21:27:47+00:00,-73.985324,40.744291,-73.993366,40.719451,1,2.840874,Sunday,September


### Détermination des districts de NY en fonction de la pickup_location
##### Création d'une liste des districts avec leurs centre en coordonnées lat, long

In [58]:
districts = {
    "Bronx": (40.8517687,-73.9109737),
    "Queens": (40.8517727,-73.9131624),
    "Staten_Island": (40.5647149,-74.2168582),
    "Brooklin": (40.6453531,-74.0150372),
    "Manhattan": (40.7591704,-74.0392707)
}

##### Création d'une fonction de détermination du district pour un imput donnée, en fonction de la liste ci-dessus

In [59]:
def get_district(lat, long):
    distances = {
        key: distance.distance((lat, long), value).km
        for key, value in districts.items()
    }
    return sorted(distances, key = lambda dict_key: distances[dict_key])[0]

##### Détermination du district en fonction du pickup_location

In [62]:
df['pickup_district'] = df.apply(lambda row: get_district(row['pickup_latitude'], 
                                                          row['pickup_longitude']), 
                                 axis = 1)
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance(km),pickup_weekday,pickup_month,pickup_district
0,2015-06-21 10:33:57.0000001,10.0,2015-06-21 10:33:57+00:00,-73.966209,40.761841,-73.991287,40.74501,1,2.82456,Sunday,June,Manhattan
1,2010-01-13 08:13:14.0000007,10.9,2010-01-13 08:13:14+00:00,-73.932603,40.763805,-73.932603,40.763805,1,0.0,Wednesday,January,Manhattan
2,2011-10-08 10:38:10.0000002,14.9,2011-10-08 10:38:10+00:00,-74.008679,40.711449,-73.990906,40.742073,1,3.717473,Saturday,October,Manhattan
3,2010-11-09 16:09:00.00000015,5.7,2010-11-09 16:09:00+00:00,-73.975663,40.791653,-73.982267,40.774968,1,1.934904,Tuesday,November,Manhattan
4,2013-09-22 21:27:47.0000001,11.0,2013-09-22 21:27:47+00:00,-73.985324,40.744291,-73.993366,40.719451,1,2.840874,Sunday,September,Manhattan
