In [20]:
import pandas as pd
from geopy.distance import geodesic

# 1. Load the source CSV files (semicolon-separated, every column read as text)
dfRide, dfRideAdd, dfRideEst, dfProduct = (
    pd.read_csv(arquivo, sep=";", dtype=str)
    for arquivo in (
        "ride_v2.csv",
        "rideaddress_v1.csv",
        "rideestimative_v3.csv",
        "product.csv",
    )
)

# 2. Normalisation: parse the schedule timestamp and clean up RideID.
# Schedule values that cannot be parsed become NaT (errors="coerce").
dfRide["Schedule"] = pd.to_datetime(dfRide["Schedule"], errors="coerce")
for df in [dfRide, dfRideAdd, dfRideEst]:
    # Strip only a *trailing* float suffix ("1183200.0" -> "1183200").
    # A plain substring replace of ".0" would also rewrite the middle of an ID
    # (e.g. "12.05" -> "125") and leave a stray zero behind on "7.00" -> "70".
    df["RideID"] = df["RideID"].astype(str).str.replace(r"\.0+$", "", regex=True)

# 3. Derive time features from the scheduled timestamp.
# Later keyword arguments may read columns created by earlier ones, so
# HoraDecimal is built from the freshly derived Hora and Minuto.
dfRide = dfRide.assign(
    Dia=lambda d: d["Schedule"].dt.weekday,
    Hora=lambda d: d["Schedule"].dt.hour,
    Minuto=lambda d: d["Schedule"].dt.minute,
    HoraDecimal=lambda d: d["Hora"] + d["Minuto"] / 60,
    Faixa15min=lambda d: d["Schedule"].dt.floor("15min"),
)

# Time table keyed by RideID; rows with any missing value are discarded.
dfTempo = dfRide.loc[
    :, ["RideID", "Dia", "Hora", "Minuto", "HoraDecimal", "Faixa15min"]
].dropna()

# 4. Extract origin and destination (Lat, Lng, Address) for each ride
dfRideAdd = dfRideAdd.rename(columns={"RideAddressTypeID": "OrigDest"})

# Rows with OrigDest == "1" are used as origins, rows with "2" as destinations.
dfOrigem = dfRideAdd.loc[
    dfRideAdd["OrigDest"] == "1", ["RideID", "Lat", "Lng", "Address"]
].rename(columns={"Lat": "Lat1", "Lng": "Lng1", "Address": "AddressOrig"})
dfDestino = dfRideAdd.loc[
    dfRideAdd["OrigDest"] == "2", ["RideID", "Lat", "Lng", "Address"]
].rename(columns={"Lat": "Lat2", "Lng": "Lng2", "Address": "AddressDest"})

# Only rides that have both an origin and a destination row survive the inner join.
dfCoords = pd.merge(dfOrigem, dfDestino, on="RideID", how="inner")

# Replace decimal commas with dots and convert the coordinates to float.
# regex=False is spelled out because the default of `regex` is not the same in
# every pandas version; the comma must always be treated as a literal character.
for col in ["Lat1", "Lng1", "Lat2", "Lng2"]:
    dfCoords[col] = dfCoords[col].str.replace(",", ".", regex=False).astype(float).round(6)

# 5. Attach product details to the ride estimates
dfRideEst["ProductID"] = dfRideEst["ProductID"].astype(str)
dfProduct["ProductID"] = dfProduct["ProductID"].astype(str)

# Left join: every estimate is kept, even when its product is not in dfProduct.
dfEstimadaComProduto = pd.merge(dfRideEst, dfProduct, on="ProductID", how="left")

# Keep only the estimate flagged as selected (Selected == "1").
# .copy() makes the result an independent frame, so assigning to its columns
# afterwards does not raise SettingWithCopyWarning.
dfEstimadaSelecionada = dfEstimadaComProduto.loc[
    dfEstimadaComProduto["Selected"] == "1",
    [
        "RideID", "ProductID", "Description", "Price", "WaitingTime",
        "ProviderID", "CategoryID", "Fee",
    ],
].copy()

# 6. Restrict every table to the RideIDs present in all three of them
dfCoords = dfCoords.assign(RideID=dfCoords["RideID"].astype(str))
dfEstimadaSelecionada = dfEstimadaSelecionada.assign(
    RideID=dfEstimadaSelecionada["RideID"].astype(str)
)

ids_comuns = set(dfTempo["RideID"]) & set(dfCoords["RideID"]) & set(dfEstimadaSelecionada["RideID"])


def _uma_linha_por_corrida(tabela):
    """Return the rows of `tabela` whose RideID is in `ids_comuns`.

    At most one row per RideID is kept (the first one encountered), sorted by
    RideID and with a fresh 0..n-1 index.
    """
    return (
        tabela[tabela["RideID"].isin(ids_comuns)]
        .drop_duplicates(subset="RideID", keep="first")
        .sort_values("RideID")
        .reset_index(drop=True)
    )


dfTempo = _uma_linha_por_corrida(dfTempo)
dfCoords = _uma_linha_por_corrida(dfCoords)
dfEstimadaSelecionada = _uma_linha_por_corrida(dfEstimadaSelecionada)

# 7. Combine the three tables on the RideID key.
# Gluing the frames side by side by row position is only correct when each
# table has exactly one row per RideID; a single duplicate shifts every row
# after it and pairs data from different rides. Joining on the key cannot
# misalign, and validate="one_to_one" fails loudly if uniqueness is violated.
dfDerivado = (
    dfTempo
    .merge(dfCoords, on="RideID", how="inner", validate="one_to_one")
    .merge(dfEstimadaSelecionada, on="RideID", how="inner", validate="one_to_one")
)

# 8. Drop rows with missing coordinates
dfDerivado = dfDerivado.dropna(subset=["Lat1", "Lng1", "Lat2", "Lng2"]).reset_index(drop=True)

# 9. Geodesic distance between origin and destination, in kilometres.
# Walking the four coordinate columns in parallel avoids materialising one
# Series per row (DataFrame.apply with axis=1) and still works when
# dfDerivado has no rows; the explicit dtype keeps the column float either way.
dfDerivado["Distancia_km"] = pd.Series(
    [
        geodesic((lat1, lng1), (lat2, lng2)).kilometers
        for lat1, lng1, lat2, lng2 in zip(
            dfDerivado["Lat1"], dfDerivado["Lng1"], dfDerivado["Lat2"], dfDerivado["Lng2"]
        )
    ],
    index=dfDerivado.index,
    dtype=float,
)

# 10. Preview
dfDerivado.head()

Unnamed: 0,RideID,Dia,Hora,Minuto,HoraDecimal,Faixa15min,Lat1,Lng1,AddressOrig,Lat2,Lng2,AddressDest,ProductID,Description,Price,WaitingTime,ProviderID,CategoryID,Fee,Distancia_km
0,1183200,1.0,10.0,9.0,10.15,2021-08-17 10:00:00,-26.329754,-48.840428,"Rua João Pinheiro, 585 - Rua João Pinheiro - B...",-26.255466,-48.64342,"Av. Dr. Nereu Ramos, 450 - Rocio Grande, São F...",regular-taxi,Táxi Comum,151.05,6,3,5,0.0,21.327034
1,1183201,1.0,10.0,9.0,10.15,2021-08-17 10:00:00,-27.491979,-48.528288,"Rodovia Rafael da Rocha Pires, 1883 - Rodovia ...",-27.437149,-48.398243,Angeloni Ingleses (Florianópolis) - Supermerca...,UberX,UberX,31.5,9,2,2,0.0,14.217724
2,1183202,1.0,10.0,10.0,10.166667,2021-08-17 10:00:00,-19.84958,-44.019916,"Rua Barão do Rio Branco, 12 - Rua Barão do Rio...",-19.936899,-43.94016,"R. Antônio de Albuquerque, 1080 - Funcionários...",UberX,UberX,42.0,3,2,2,0.0,12.77474
3,1183203,1.0,10.0,10.0,10.166667,2021-08-17 10:00:00,-23.962423,-46.254658,"Tv. Duzentos e Sessenta e Um, 72, 72",-23.837307,-46.132172,"Semar Supermercados Bertioga, 2141",UberX,UberX,47.5,3,2,2,0.0,18.644013
4,1183204,1.0,10.0,10.0,10.166667,2021-08-17 10:00:00,-10.919802,-37.077442,"Rua Argentina, 160 - Rua Argentina - Brasil",-10.907129,-37.087719,"R. Simeão Aguiar, 430 - Novo Paraíso, Aracaju ...",UberX,UberX,7.5,4,2,2,0.0,1.796461


In [21]:
# sqlite3 já vem embutido no Python, então não é necessário instalar manualmente.
import sqlite3

In [22]:
# Open a connection to the SQLite database file and get a cursor on it
conn = sqlite3.connect(database="PRECIFICA.DB")
cursor = conn.cursor()

In [23]:
# Write dfDerivado to the DADOSUBER table, replacing the table if it already exists.
# strSQL simply keeps whatever DataFrame.to_sql returns.
try:
    strSQL = dfDerivado.to_sql('DADOSUBER', conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Close the connection even when the write fails, so the file handle is not leaked
    conn.close()

In [24]:
# Sanity check: read the first 5 rows of DADOSUBER back from the database
conn = sqlite3.connect("PRECIFICA.DB")
try:
    df_verificacao = pd.read_sql_query("SELECT * FROM DADOSUBER LIMIT 5", conn)
finally:
    # Close the connection even if the query fails
    conn.close()
print(df_verificacao)

    RideID  Dia  Hora  Minuto  HoraDecimal           Faixa15min       Lat1  \
0  1183200  1.0  10.0     9.0    10.150000  2021-08-17 10:00:00 -26.329754   
1  1183201  1.0  10.0     9.0    10.150000  2021-08-17 10:00:00 -27.491979   
2  1183202  1.0  10.0    10.0    10.166667  2021-08-17 10:00:00 -19.849580   
3  1183203  1.0  10.0    10.0    10.166667  2021-08-17 10:00:00 -23.962423   
4  1183204  1.0  10.0    10.0    10.166667  2021-08-17 10:00:00 -10.919802   

        Lng1                                        AddressOrig       Lat2  \
0 -48.840428  Rua João Pinheiro, 585 - Rua João Pinheiro - B... -26.255466   
1 -48.528288  Rodovia Rafael da Rocha Pires, 1883 - Rodovia ... -27.437149   
2 -44.019916  Rua Barão do Rio Branco, 12 - Rua Barão do Rio... -19.936899   
3 -46.254658               Tv. Duzentos e Sessenta e Um, 72, 72 -23.837307   
4 -37.077442        Rua Argentina, 160 - Rua Argentina - Brasil -10.907129   

        Lng2                                        AddressDes

In [25]:
# Read the whole DADOSUBER table back into memory (no LIMIT on the query)
conn = sqlite3.connect("PRECIFICA.DB")
try:
    df_verificacao = pd.read_sql_query("SELECT * FROM DADOSUBER", conn)
finally:
    # Close the connection even if the query fails
    conn.close()
print(df_verificacao)

         RideID  Dia  Hora  Minuto  HoraDecimal           Faixa15min  \
0       1183200  1.0  10.0     9.0    10.150000  2021-08-17 10:00:00   
1       1183201  1.0  10.0     9.0    10.150000  2021-08-17 10:00:00   
2       1183202  1.0  10.0    10.0    10.166667  2021-08-17 10:00:00   
3       1183203  1.0  10.0    10.0    10.166667  2021-08-17 10:00:00   
4       1183204  1.0  10.0    10.0    10.166667  2021-08-17 10:00:00   
...         ...  ...   ...     ...          ...                  ...   
234014  1425238  1.0  20.0    52.0    20.866667  2022-06-14 20:45:00   
234015  1425239  1.0  20.0    52.0    20.866667  2022-06-14 20:45:00   
234016  1425240  1.0  20.0    52.0    20.866667  2022-06-14 20:45:00   
234017  1425241  1.0  20.0    53.0    20.883333  2022-06-14 20:45:00   
234018  1425243  1.0  20.0    53.0    20.883333  2022-06-14 20:45:00   

             Lat1       Lng1  \
0      -26.329754 -48.840428   
1      -27.491979 -48.528288   
2      -19.849580 -44.019916   
3      

In [26]:
!pip install pycryptodome geopy



In [27]:
from Crypto.Cipher import AES
from Crypto.Random import get_random_bytes
import base64
import hashlib

In [28]:
def gerar_chave(senha):
    """Derive a 32-byte key from a password.

    The key is the raw SHA-256 digest of the password's encoded bytes.

    Args:
        senha: Password as a string.

    Returns:
        bytes: The 32-byte SHA-256 digest.
    """
    resumo = hashlib.sha256()
    resumo.update(senha.encode())
    return resumo.digest()

def adicionar_padding(texto):
    """Encode `texto` and pad it to a multiple of the AES block size.

    Each padding byte holds the number of bytes added. Between 1 and
    ``AES.block_size`` bytes are always appended, so input that is already
    block-aligned grows by one full block.

    Args:
        texto: Text to encode and pad.

    Returns:
        bytes: The encoded text followed by the padding bytes.
    """
    dados = texto.encode()
    faltam = AES.block_size - len(dados) % AES.block_size
    return dados + bytes([faltam]) * faltam

def remover_padding(dados):
    """Strip the padding appended by `adicionar_padding`.

    The last byte gives the padding length ``n``; the final ``n`` bytes must
    all be equal to ``n``. The padding is validated before it is removed, so
    corrupted data (for example the result of decrypting with the wrong
    password) is reported instead of being silently truncated.

    Args:
        dados: Padded bytes.

    Returns:
        bytes: `dados` without the trailing padding bytes.

    Raises:
        ValueError: If `dados` is empty, the padding length is 0 or larger
            than `dados`, or the trailing bytes are not all equal to it.
    """
    if not dados:
        raise ValueError("cannot remove padding from empty data")

    tamanho = dados[-1]
    # A length of 0 would make dados[:-0] return an empty result.
    if tamanho < 1 or tamanho > len(dados):
        raise ValueError("invalid padding length: %d" % tamanho)
    if dados[-tamanho:] != bytes([tamanho]) * tamanho:
        raise ValueError("invalid padding bytes")

    return dados[:-tamanho]

def criptografar(texto, senha):
    """Encrypt `texto` with AES in CBC mode using a key derived from `senha`.

    A fresh random IV of one AES block is generated on every call and placed
    in front of the ciphertext; the combined bytes are returned Base64-encoded.

    Args:
        texto: Plain text to encrypt.
        senha: Password handed to `gerar_chave`.

    Returns:
        str: Base64 text of ``IV + ciphertext``.
    """
    segredo = gerar_chave(senha)
    vetor_inicial = get_random_bytes(AES.block_size)
    cifrador = AES.new(segredo, AES.MODE_CBC, vetor_inicial)
    blocos_cifrados = cifrador.encrypt(adicionar_padding(texto))
    pacote = vetor_inicial + blocos_cifrados
    return base64.b64encode(pacote).decode('utf-8')

def descriptografar(criptografado_base64, senha):
    """Decrypt a value produced by `criptografar`.

    The Base64 payload is decoded and split into the leading IV (one AES
    block) and the ciphertext. The ciphertext is decrypted with AES-CBC using
    the key derived from `senha`, then the padding is removed.

    Args:
        criptografado_base64: Base64 text of ``IV + ciphertext``.
        senha: Password handed to `gerar_chave`.

    Returns:
        str: The decrypted text, decoded as UTF-8.
    """
    segredo = gerar_chave(senha)
    bruto = base64.b64decode(criptografado_base64)
    vetor_inicial, corpo = bruto[:AES.block_size], bruto[AES.block_size:]
    decifrador = AES.new(segredo, AES.MODE_CBC, vetor_inicial)
    return remover_padding(decifrador.decrypt(corpo)).decode('utf-8')

In [29]:
# Small synthetic sample (three rides), written one ride per row.
# Note: this rebinds dfDerivado, replacing whatever frame it held before.
dfDerivado = pd.DataFrame(
    [
        ("1", 1, 14, 30, 14.5, "2023-01-01 14:30", -23.5505, -46.6333, "Rua A, 100",
         -23.5595, -46.6253, "Rua X, 900", "101", "UberX", "20.50", "5", "1", "A", "2.50"),
        ("2", 2, 15, 45, 15.75, "2023-01-01 15:45", -23.5510, -46.6340, "Av. B, 200",
         -23.5600, -46.6260, "Av. Y, 800", "102", "Uber Comfort", "30.00", "6", "1", "B", "3.00"),
        ("3", 3, 16, 0, 16.0, "2023-01-01 16:00", -23.5520, -46.6350, "Travessa C, 300",
         -23.5610, -46.6270, "Travessa Z, 700", "103", "Uber Black", "45.00", "7", "1", "C", "4.00"),
    ],
    columns=[
        "RideID", "Dia", "Hora", "Minuto", "HoraDecimal", "Faixa15min",
        "Lat1", "Lng1", "AddressOrig", "Lat2", "Lng2", "AddressDest",
        "ProductID", "Description", "Price", "WaitingTime",
        "ProviderID", "CategoryID", "Fee",
    ],
).assign(Faixa15min=lambda d: pd.to_datetime(d["Faixa15min"]))

# Geodesic distance between origin and destination, in kilometres
dfDerivado["Distancia_km"] = [
    geodesic((lat1, lng1), (lat2, lng2)).kilometers
    for lat1, lng1, lat2, lng2 in zip(
        dfDerivado["Lat1"], dfDerivado["Lng1"], dfDerivado["Lat2"], dfDerivado["Lng2"]
    )
]


In [30]:
# Encrypt the sensitive address columns and store the result in SQLite
import getpass
import os

# The passphrase is not kept in the notebook: it comes from the PRECIFICA_SENHA
# environment variable or, when that is unset or empty, from an interactive
# prompt. Set the variable for non-interactive runs.
senha = os.environ.get("PRECIFICA_SENHA") or getpass.getpass("Senha de criptografia: ")

# Build a separate encrypted frame instead of overwriting dfDerivado in place:
# dfDerivado keeps its plaintext, so running this cell again does not encrypt
# values that are already ciphertext.
dfDerivadoCrypt = dfDerivado.assign(
    AddressOrig=dfDerivado["AddressOrig"].apply(lambda x: criptografar(x, senha)),
    AddressDest=dfDerivado["AddressDest"].apply(lambda x: criptografar(x, senha)),
)

# Save to the database, replacing the table if it already exists
conn = sqlite3.connect("PRECIFICA.DB")
try:
    dfDerivadoCrypt.to_sql("DADOSUBER_CRYPT", conn, if_exists="replace", index=False)
    conn.commit()
finally:
    # Close the connection even when the write fails
    conn.close()

In [31]:
# Read the encrypted table back and decrypt the address columns
conn = sqlite3.connect("PRECIFICA.DB")
try:
    df_lido = pd.read_sql_query("SELECT * FROM DADOSUBER_CRYPT", conn)
finally:
    # Close the connection even if the query fails
    conn.close()

# `senha` must be the same passphrase that was used to encrypt the columns
df_lido["AddressOrig"] = df_lido["AddressOrig"].apply(lambda x: descriptografar(x, senha))
df_lido["AddressDest"] = df_lido["AddressDest"].apply(lambda x: descriptografar(x, senha))

df_lido.head()

Unnamed: 0,RideID,Dia,Hora,Minuto,HoraDecimal,Faixa15min,Lat1,Lng1,AddressOrig,Lat2,Lng2,AddressDest,ProductID,Description,Price,WaitingTime,ProviderID,CategoryID,Fee,Distancia_km
0,1,1,14,30,14.5,2023-01-01 14:30:00,-23.5505,-46.6333,"Rua A, 100",-23.5595,-46.6253,"Rua X, 900",101,UberX,20.5,5,1,A,2.5,1.288677
1,2,2,15,45,15.75,2023-01-01 15:45:00,-23.551,-46.634,"Av. B, 200",-23.56,-46.626,"Av. Y, 800",102,Uber Comfort,30.0,6,1,B,3.0,1.288675
2,3,3,16,0,16.0,2023-01-01 16:00:00,-23.552,-46.635,"Travessa C, 300",-23.561,-46.627,"Travessa Z, 700",103,Uber Black,45.0,7,1,C,4.0,1.288671
