# 1 - IMPORTS

In [371]:
import pandas as pd
import plotly.express as px
from haversine import haversine
import folium

# 2 - Data

In [372]:
data = pd.read_csv('data/uber.csv')
df = data.copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


In [373]:
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [374]:
# Corrida não finalizada, sem passageiros
data[data['dropoff_latitude'].isna()]

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
87946,32736015,2013-07-02 03:51:57.0000001,24.1,2013-07-02 03:51:57 UTC,-73.950581,40.779692,,,0


In [375]:
# A coluna 'key' pode ser desconsiderada, contém os mesmos dados de 'pickup_datetime', e já temos uma coluna com o id das corridas
data['Unnamed: 0'].nunique(), data['key'].nunique()

(200000, 200000)

# 3 - Data Transform

In [376]:
# Drop da coluna 'key'
df.drop(columns='key', axis=1, inplace=True)
# Drop da corrida não finalizada
df.dropna(inplace=True)

# Conversão da coluna da data para o formato datetime e criação das colunas de ano, mês e dia,
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['year'] = df['pickup_datetime'].apply(lambda x: x.year)
df['month'] = df['pickup_datetime'].apply(lambda x: x.month)
df['day'] = df['pickup_datetime'].apply(lambda x: x.day)

# Renomeando as colunas 'Unnamed: 0 ' e 'fare_amount'
new_cols = ['id',
 'fare',
 'pickup_date',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count',
 'year',
 'month',
 'day']

df.columns = new_cols

# Eliminando taxas menores que zero
df = df[df['fare'] > 0]

# Convertendo a data para TimeStamp
df['pickup_timestamp'] = df['pickup_date'].dt.strftime('%Y-%m-%d')


In [377]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199977 entries, 0 to 199999
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   id                 199977 non-null  int64              
 1   fare               199977 non-null  float64            
 2   pickup_date        199977 non-null  datetime64[ns, UTC]
 3   pickup_longitude   199977 non-null  float64            
 4   pickup_latitude    199977 non-null  float64            
 5   dropoff_longitude  199977 non-null  float64            
 6   dropoff_latitude   199977 non-null  float64            
 7   passenger_count    199977 non-null  int64              
 8   year               199977 non-null  int64              
 9   month              199977 non-null  int64              
 10  day                199977 non-null  int64              
 11  pickup_timestamp   199977 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(

# 4 - Explore

## 4.1 - Passengers

In [378]:
df['passenger_count'].value_counts()

passenger_count
1      138414
2       29424
5       14005
3        8878
4        4276
6        4271
0         708
208         1
Name: count, dtype: int64

708 corridas não tiveram passageiros, 1 corrida teve 208 passageiros

In [379]:
# Corrida com 208 passageiros
df[df['passenger_count'] == 208]

Unnamed: 0,id,fare,pickup_date,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,pickup_timestamp
113038,35893772,11.7,2010-12-28 08:20:00+00:00,-73.937795,40.758498,-73.937835,40.758415,208,2010,12,28,2010-12-28


In [380]:
# Stats geral das corridas com zero passageiros
zero_passengers = df[df['passenger_count'] == 0]
zero_passengers.describe()

Unnamed: 0,id,fare,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day
count,708.0,708.0,708.0,708.0,708.0,708.0,708.0,708.0,708.0,708.0
mean,26759770.0,9.439266,-71.887276,39.600657,-71.989742,39.658641,0.0,2011.355932,5.899718,15.524011
std,15962550.0,6.73169,12.26538,6.756683,11.963165,6.590469,0.0,0.642996,3.554363,8.86761
min,13714.0,2.5,-74.0173,0.0,-74.016154,0.0,0.0,2009.0,1.0,1.0
25%,13253890.0,5.3,-73.992202,40.734526,-73.991022,40.733205,0.0,2011.0,3.0,8.0
50%,25996710.0,7.3,-73.98095,40.75472,-73.97965,40.7532,0.0,2011.0,5.0,15.0
75%,40536390.0,11.3,-73.965799,40.767425,-73.962868,40.7698,0.0,2012.0,9.0,23.0
max,55316570.0,57.3,0.0,40.8334,0.0,40.864822,0.0,2015.0,12.0,31.0


## 4.2 - Coordinates

In [381]:
# Localizando colunas com coordenadas (latitude, longitude) iguais à zero
zero_coordenates = df.apply(lambda x: True if any([x['pickup_latitude'] == 0, x['pickup_longitude'] == 0, x['dropoff_latitude'] == 0, x['dropoff_longitude'] == 0]) else False, axis=1)
print(f'Existem {sum(zero_coordenates)} corridas com coordenadas zeradas')

# Eliminando as colunas com coordenadas (latitude, longitude) iguais à zero
df1 = df.copy().reset_index() # criando um novo dataframe para permanecer o estado anterior a estas alterações
df1 = df[~zero_coordenates]

Existem 3965 corridas com coordenadas zeradas


De acordo com uma fonte confiável da Wikipédia (módulo de mapa, que define os limites para fins cartográficos):

Limintes da cidade de NEW YORK

Latitude máxima (topo): aproximadamente 40.92° N

Latitude mínima (base): aproximadamente 40.49° N

Longitude mínima (lado esquerdo): aproximadamente –74.27° (mais a oeste)

Longitude máxima (lado direito): aproximadamente –73.68° (mais a leste) 

In [382]:
# Aplicando os filtros de limites das coordenadas 
mask = (
    (df1['pickup_longitude']  >= -74.27) & (df1['pickup_longitude']  <= -73.68) &
    (df1['dropoff_longitude'] >= -74.27) & (df1['dropoff_longitude'] <= -73.68) &
    (df1['pickup_latitude']   >=  40.49) & (df1['pickup_latitude']   <=  40.92) &
    (df1['dropoff_latitude']  >=  40.49) & (df1['dropoff_latitude']  <=  40.92)
)
df1 = df1[mask]


In [383]:
# Distancia das corridas
coord_cols = ['pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude']

df1['distance'] = df1.loc[:,coord_cols].apply(lambda x: haversine( (x['pickup_latitude'], x['pickup_longitude']), 
                                                                         (x['dropoff_latitude'], x['dropoff_longitude']) ),axis=1)

# Eliminando distancias iguais a zero
df1 = df1[df1['distance'] > 0.5]

In [384]:
px.scatter(df1, x='distance', y='fare')

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [385]:
# Plotando amostras das posições geográficas de partida e destino das corridas
df_test = df1.sample(1500)

def get_coordinates(df):
    coord_list = []
    for i in zip(df['pickup_latitude'], df['pickup_longitude']):
        coord_list.append(i)
    for i in zip(df['dropoff_latitude'], df['dropoff_longitude']):
        coord_list.append(i)
    
    return coord_list

coordenadas = get_coordinates(df_test)
mapa = folium.Map(location=[df1['pickup_latitude'].mean(), df1['pickup_longitude'].mean()], zoom_start=5)
for i in coordenadas:
    folium.Marker(location=i).add_to(mapa)
# mapa

In [386]:
# Corrida mais distante
df1[df1['distance'] == df1['distance'].max()]

Unnamed: 0,id,fare,pickup_date,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,pickup_timestamp,distance
195343,24456114,81.81,2010-10-06 22:15:00+00:00,-73.970932,40.757446,-74.2368,40.511859,1,2010,10,6,2010-10-06,35.341909


In [387]:

# Posição de partida e destino da corrida mais distante
mapa = folium.Map(location=[df1['pickup_latitude'].mean(), df1['pickup_longitude'].mean()], zoom_start=10)
folium.Marker(location=[40.511859, -74.2368], popup='dropoff').add_to(mapa)
folium.Marker(location=[40.757446, -73.970932], popup='pickup').add_to(mapa)

#mapa

<folium.map.Marker at 0x7502d22400b0>

# 5 - OPEN METEO API

In [388]:
import openmeteo_requests

import pandas as pd
import requests_cache
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

In [389]:
print(type(cache_session))
print(cache_session)

<class 'requests_cache.session.CachedSession'>
<CachedSession(cache=<SQLiteCache(name=.cache)>, settings=CacheSettings(expire_after=3600))>


In [390]:
# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://api.open-meteo.com/v1/forecast"
params = {
	"latitude": 40.757446,
	"longitude": -73.970932,
	"daily": "weather_code",
	"hourly": ["temperature_2m", "weather_code", "rain"],
	"timezone": "America/New_York",
	"forecast_days": 1,
}
responses = openmeteo.weather_api(url, params=params)

# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation: {response.Elevation()} m asl")
print(f"Timezone: {response.Timezone()}{response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")

# Process hourly data. The order of variables needs to be the same as requested.
hourly = response.Hourly()
hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
hourly_weather_code = hourly.Variables(1).ValuesAsNumpy()
hourly_rain = hourly.Variables(2).ValuesAsNumpy()

hourly_data = {"date": pd.date_range(
	start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
	end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = hourly.Interval()),
	inclusive = "left"
)}

hourly_data["temperature_2m"] = hourly_temperature_2m
hourly_data["weather_code"] = hourly_weather_code
hourly_data["rain"] = hourly_rain

hourly_dataframe = pd.DataFrame(data = hourly_data)
print("\nHourly data\n", hourly_dataframe)

# Process daily data. The order of variables needs to be the same as requested.
daily = response.Daily()
daily_weather_code = daily.Variables(0).ValuesAsNumpy()

daily_data = {"date": pd.date_range(
	start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
	end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = daily.Interval()),
	inclusive = "left"
)}

daily_data["weather_code"] = daily_weather_code

daily_dataframe = pd.DataFrame(data = daily_data)
print("\nDaily data\n", daily_dataframe)

Coordinates: 40.76250457763672°N -73.97511291503906°E
Elevation: 60.0 m asl
Timezone: b'America/New_York'b'GMT-4'
Timezone difference to GMT+0: -14400s

Hourly data
                         date  temperature_2m  weather_code  rain
0  2025-08-10 04:00:00+00:00       19.796499           0.0   0.0
1  2025-08-10 05:00:00+00:00       19.296499           0.0   0.0
2  2025-08-10 06:00:00+00:00       18.546499           0.0   0.0
3  2025-08-10 07:00:00+00:00       18.696501           0.0   0.0
4  2025-08-10 08:00:00+00:00       18.046499           0.0   0.0
5  2025-08-10 09:00:00+00:00       16.846500           0.0   0.0
6  2025-08-10 10:00:00+00:00       16.996500           0.0   0.0
7  2025-08-10 11:00:00+00:00       17.746500           0.0   0.0
8  2025-08-10 12:00:00+00:00       21.646500           0.0   0.0
9  2025-08-10 13:00:00+00:00       24.146500           0.0   0.0
10 2025-08-10 14:00:00+00:00       26.796499           0.0   0.0
11 2025-08-10 15:00:00+00:00       29.346500          

In [391]:
df1.columns

Index(['id', 'fare', 'pickup_date', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year',
       'month', 'day', 'pickup_timestamp', 'distance'],
      dtype='object')

In [397]:
latitude,longitude,date = df1[['pickup_latitude','pickup_longitude','pickup_timestamp']].iloc[0,:]
latitude,longitude,date

(np.float64(40.73835372924805), np.float64(-73.99981689453125), '2015-05-07')

In [398]:
date

'2015-05-07'

In [None]:
from open_meteo_api import Weather

day_test = Weather(latitude,longitude)

day_test.forecast(date,date)


Unnamed: 0,data,clima,temperatura media
0,2015-05-07 03:00:00+00:00,Mainly Sunny,17.016499
