# 1 - IMPORTS

In [250]:
import pandas as pd
import plotly.express as px
from haversine import haversine
import folium

# 2 - Data

In [251]:
# Load the raw Uber rides dataset from the project's data folder
data = pd.read_csv('data/uber.csv')
# Work on a separate copy so changes made to `df` do not affect the raw frame `data`
df = data.copy()
# Print column names, non-null counts and dtypes of the raw frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


In [252]:
# Preview the first rows of the raw data
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [253]:
# Unfinished ride: drop-off coordinates are missing and the passenger count is zero
missing_dropoff = data['dropoff_latitude'].isna()
data.loc[missing_dropoff]

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
87946,32736015,2013-07-02 03:51:57.0000001,24.1,2013-07-02 03:51:57 UTC,-73.950581,40.779692,,,0


In [254]:
# The 'key' column can be ignored: it carries the same data as 'pickup_datetime',
# and 'Unnamed: 0' already provides a ride id. Both have one distinct value per row.
tuple(data[column].nunique() for column in ('Unnamed: 0', 'key'))

(200000, 200000)

# 3 - Data Transform

In [255]:
# Build the cleaned frame from the untouched raw frame `data`, so this cell gives the
# same result every time it is re-run (the previous in-place drop failed on a second
# run because 'key' had already been removed from `df`).
# Columns are renamed by name rather than by position, so a change in column order
# upstream cannot silently mislabel the data.
column_renames = {
    'Unnamed: 0': 'id',
    'fare_amount': 'fare',
    'pickup_datetime': 'pickup_date',
}

df = (
    data
    # 'key' duplicates 'pickup_datetime', and 'Unnamed: 0' already identifies each ride
    .drop(columns='key')
    # Drop rows with missing values (the unfinished ride without drop-off coordinates)
    .dropna()
    # Parse the pickup timestamp into a timezone-aware datetime column
    .assign(pickup_datetime=lambda d: pd.to_datetime(d['pickup_datetime']))
    # Derive year / month / day with the vectorized .dt accessor;
    # cast to int64 to keep the same dtypes the row-wise apply used to produce
    .assign(
        year=lambda d: d['pickup_datetime'].dt.year.astype('int64'),
        month=lambda d: d['pickup_datetime'].dt.month.astype('int64'),
        day=lambda d: d['pickup_datetime'].dt.day.astype('int64'),
    )
    .rename(columns=column_renames)
)

In [256]:
# Check column names, non-null counts and dtypes after the transformations
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199999 entries, 0 to 199999
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   id                 199999 non-null  int64              
 1   fare               199999 non-null  float64            
 2   pickup_date        199999 non-null  datetime64[ns, UTC]
 3   pickup_longitude   199999 non-null  float64            
 4   pickup_latitude    199999 non-null  float64            
 5   dropoff_longitude  199999 non-null  float64            
 6   dropoff_latitude   199999 non-null  float64            
 7   passenger_count    199999 non-null  int64              
 8   year               199999 non-null  int64              
 9   month              199999 non-null  int64              
 10  day                199999 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(5), int64(5)
memory usage: 18.3 MB


# 4 - Explore

## 4.1 - Passengers

In [257]:
# Number of rides for each distinct passenger count
df['passenger_count'].value_counts()

passenger_count
1      138425
2       29428
5       14009
3        8881
4        4276
6        4271
0         708
208         1
Name: count, dtype: int64

708 corridas não tiveram passageiros, 1 corrida teve 208 passageiros

In [258]:
# Ride recorded with 208 passengers
has_208_passengers = df['passenger_count'].eq(208)
df.loc[has_208_passengers]

Unnamed: 0,id,fare,pickup_date,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day
113038,35893772,11.7,2010-12-28 08:20:00+00:00,-73.937795,40.758498,-73.937835,40.758415,208,2010,12,28


In [259]:
# Summary statistics for the rides recorded with zero passengers
no_passengers = df['passenger_count'].eq(0)
zero_passengers = df.loc[no_passengers]
zero_passengers.describe()

Unnamed: 0,id,fare,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day
count,708.0,708.0,708.0,708.0,708.0,708.0,708.0,708.0,708.0,708.0
mean,26759770.0,9.439266,-71.887276,39.600657,-71.989742,39.658641,0.0,2011.355932,5.899718,15.524011
std,15962550.0,6.73169,12.26538,6.756683,11.963165,6.590469,0.0,0.642996,3.554363,8.86761
min,13714.0,2.5,-74.0173,0.0,-74.016154,0.0,0.0,2009.0,1.0,1.0
25%,13253890.0,5.3,-73.992202,40.734526,-73.991022,40.733205,0.0,2011.0,3.0,8.0
50%,25996710.0,7.3,-73.98095,40.75472,-73.97965,40.7532,0.0,2011.0,5.0,15.0
75%,40536390.0,11.3,-73.965799,40.767425,-73.962868,40.7698,0.0,2012.0,9.0,23.0
max,55316570.0,57.3,0.0,40.8334,0.0,40.864822,0.0,2015.0,12.0,31.0


## 4.2 - Coordinates

In [None]:
# Flag rows (rides) where any pickup or drop-off coordinate is exactly zero.
# A vectorized comparison replaces the previous row-by-row `apply`.
zero_coordenates = (
    df[['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']]
    .eq(0)
    .any(axis=1)
)
print(f'Existem {zero_coordenates.sum()} corridas com coordenadas zeradas')

# Remove the rows with zeroed coordinates into a new frame, leaving `df` in its previous state.
# The explicit .copy() makes df1 independent of df; the former `df.copy().reset_index()`
# assignment was dropped because its result was overwritten on the very next line.
df1 = df[~zero_coordenates].copy()

Existem 3968 corridas com coordenadas zeradas


De acordo com uma fonte confiável da Wikipédia (módulo de mapa, que define os limites para fins cartográficos):

Limites da cidade de Nova York

Latitude máxima (topo): aproximadamente 40.92° N

Latitude mínima (base): aproximadamente 40.49° N

Longitude mínima (lado esquerdo): aproximadamente –74.27° (mais a oeste)

Longitude máxima (lado direito): aproximadamente –73.68° (mais a leste) 

In [None]:
# Keep only rides whose pickup and drop-off both fall inside the city bounding box
# (bounds are inclusive on both ends)
NYC_LON_BOUNDS = (-74.27, -73.68)
NYC_LAT_BOUNDS = (40.49, 40.92)

mask = (
    df1['pickup_longitude'].between(*NYC_LON_BOUNDS)
    & df1['dropoff_longitude'].between(*NYC_LON_BOUNDS)
    & df1['pickup_latitude'].between(*NYC_LAT_BOUNDS)
    & df1['dropoff_latitude'].between(*NYC_LAT_BOUNDS)
)
df1 = df1[mask]


In [None]:
# Ride distance
coord_cols = ['pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude']

# Haversine distance between pickup and drop-off for every ride.
# haversine() expects (latitude, longitude) tuples, while coord_cols lists longitude first,
# so the values are unpacked in coord_cols order and re-paired below.
# NOTE(review): the result is in haversine's default unit (kilometres per the library docs) - confirm.
distances = [
    haversine((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon))
    for pickup_lon, pickup_lat, dropoff_lon, dropoff_lat
    in df1[coord_cols].itertuples(index=False, name=None)
]

# assign() returns a new frame, so the column is not written into a filtered slice
# (the previous `df1['distance'] = ...` triggered SettingWithCopyWarning).
df1 = df1.assign(distance=distances)
# Discard rides whose pickup and drop-off are the same point
df1 = df1[df1['distance'] > 0]

In [283]:
# Scatter plot of fare against ride distance.
# NOTE(review): the saved output of this cell is a ValueError saying that rendering
# requires nbformat>=4.2.0 - that package must be installed in the kernel environment.
px.scatter(df1, x='distance', y='fare')

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Plot a sample of the pickup and drop-off positions of the rides.
# A fixed random_state makes the sampled rides (and therefore the map) reproducible
# across kernel restarts.
df_test = df1.sample(1500, random_state=42)

def get_coordinates(df):
    """Return all pickup points followed by all drop-off points.

    Args:
        df: Mapping-like object (e.g. a DataFrame) with the columns
            'pickup_latitude', 'pickup_longitude', 'dropoff_latitude'
            and 'dropoff_longitude'.

    Returns:
        A list of (latitude, longitude) tuples: every pickup first, in row
        order, then every drop-off, in row order.
    """
    pickups = list(zip(df['pickup_latitude'], df['pickup_longitude']))
    dropoffs = list(zip(df['dropoff_latitude'], df['dropoff_longitude']))
    return pickups + dropoffs

# Collect the sampled pickup/drop-off points and drop one marker per point
# on a map centred on the mean pickup position of df1
coordenadas = get_coordinates(df_test)
map_center = [df1['pickup_latitude'].mean(), df1['pickup_longitude'].mean()]
mapa = folium.Map(location=map_center, zoom_start=5)
for point in coordenadas:
    marker = folium.Marker(location=point)
    marker.add_to(mapa)
mapa

In [292]:
# Longest ride (row or rows whose distance equals the maximum)
max_distance = df1['distance'].max()
df1.loc[df1['distance'].eq(max_distance)]

Unnamed: 0,id,fare,pickup_date,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,distance
195343,24456114,81.81,2010-10-06 22:15:00+00:00,-73.970932,40.757446,-74.2368,40.511859,1,2010,10,6,35.341909


In [291]:

# Pickup and drop-off positions of the longest ride.
# The coordinates are read from df1 instead of being hard-coded from an earlier output,
# so the markers stay correct if the upstream filters (and thus the longest ride) change.
# Assumes df1's index labels are unique, so .loc returns a single row.
longest_ride = df1.loc[df1['distance'].idxmax()]
pickup_point = [float(longest_ride['pickup_latitude']), float(longest_ride['pickup_longitude'])]
dropoff_point = [float(longest_ride['dropoff_latitude']), float(longest_ride['dropoff_longitude'])]

mapa = folium.Map(location=[df1['pickup_latitude'].mean(), df1['pickup_longitude'].mean()], zoom_start=10)
folium.Marker(location=dropoff_point, popup='dropoff').add_to(mapa)
folium.Marker(location=pickup_point, popup='pickup').add_to(mapa)
mapa

In [282]:
# Absolute value of the negative fares, computed with a vectorized negation
# instead of a row-wise apply (also returns a Series even when no fare is negative).
# NOTE(review): the result is only displayed; df1 itself still holds the negative fares.
df1.loc[df1['fare'] < 0, 'fare'].mul(-1)

63395      5.00
71246      3.30
79903      3.50
89322     49.57
92063     23.70
98875     52.00
104080     7.30
139272     6.90
148803     5.70
150301     3.00
151681    10.90
157412     3.50
164056    50.50
179111     3.50
180444     3.00
190925     5.50
dtype: float64