## Script de Importación de Tráfico de NYC a MySQL

In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
import mysql.connector
from datetime import datetime

# Load environment variables from the .env file.
load_dotenv()

# Read the database connection settings from the environment.
db_host = os.getenv('DB_HOST')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_name = os.getenv('DB_NAME')
db_port = os.getenv('DB_PORT')
csv_file_path = 'Automated_Traffic_Volume_Counts.csv'  # Update this to point at the CSV file to import

# Fail early when any required setting is missing or empty.
if not all([db_host, db_user, db_password, db_name, db_port]):
    raise ValueError("No se pudieron cargar todas las variables de entorno. Por favor verifica el archivo .env.")

# Load the CSV file into a pandas DataFrame.
try:
    df = pd.read_csv(csv_file_path)
except FileNotFoundError as e:
    # Chain the original error so the traceback keeps the OS-level details.
    raise FileNotFoundError(f"El archivo {csv_file_path} no se encuentra. Por favor verifica la ruta.") from e

# Keep readings from January 2023 through August 2024.
start_date = datetime(2023, 1, 1)
end_date = datetime(2024, 8, 31)  # last calendar day that must be included
# datetime(2024, 8, 31) is midnight at the START of that day, so comparing with
# "<= end_date" dropped every reading taken later on August 31. Filter against
# an exclusive upper bound instead.
end_exclusive = datetime(2024, 9, 1)

# Make sure the date/time component columns are integers before combining them.
for component in ('Yr', 'M', 'D', 'HH', 'MM'):
    df[component] = df[component].astype(int)

# Build a single timestamp column; pd.to_datetime expects the component
# columns to be named year/month/day/hour/minute.
df['datetime'] = pd.to_datetime(
    df[['Yr', 'M', 'D', 'HH', 'MM']].rename(
        columns={'Yr': 'year', 'M': 'month', 'D': 'day', 'HH': 'hour', 'MM': 'minute'}
    )
)

df_filtered = df[(df['datetime'] >= start_date) & (df['datetime'] < end_exclusive)].copy()

# Keep only the columns needed for the analysis.
columns_to_keep = ['RequestID', 'Boro', 'Yr', 'M', 'D', 'HH', 'MM', 'Vol', 'SegmentID', 'WktGeom', 'street', 'Direction']
df_filtered = df_filtered[columns_to_keep]

# Connect to the MySQL database.
try:
    db_connection = mysql.connector.connect(
        host=db_host,
        user=db_user,
        password=db_password,
        database=db_name,
        port=int(db_port)
    )
except mysql.connector.Error as err:
    raise mysql.connector.Error(f"Error al conectar a la base de datos: {err}") from err

# Columns written to the table, in the same order as the INSERT placeholders.
insert_columns = ['RequestID', 'Boro', 'Yr', 'M', 'D', 'HH', 'MM', 'Vol', 'SegmentID', 'WktGeom', 'Direction']
insert_query = """
    INSERT INTO trafico (RequestID, Boro, Yr, M, D, HH, MM, Vol, SegmentID, WktGeom, Direction)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

# Insert the filtered rows in blocks of 100,000.
chunk_size = 100000
rows_inserted = 0

cursor = db_connection.cursor()
try:
    for start in range(0, len(df_filtered), chunk_size):
        chunk = df_filtered.iloc[start:start + chunk_size]

        # itertuples yields plain tuples of Python scalars and avoids building
        # one Series per row the way iterrows does.
        data = list(chunk[insert_columns].itertuples(index=False, name=None))

        cursor.executemany(insert_query, data)
        rows_inserted += len(data)

        # Report progress only every 10 blocks.
        if start // chunk_size % 10 == 0:
            print(f"Se han insertado {len(data)} filas en este bloque. Total filas insertadas: {rows_inserted}")

    # Commit once, after every block succeeded; a failure above leaves the
    # transaction uncommitted so nothing is persisted.
    db_connection.commit()
finally:
    # Always release the cursor and connection, even when an insert fails.
    cursor.close()
    db_connection.close()

# Show the total number of inserted rows.
print(f"Se han insertado un total de {rows_inserted} filas en la base de datos.")


## Script de Importación de Temperaturas Promedio de NYC a MySQL


In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
import mysql.connector

# Load environment variables from the .env file.
load_dotenv()

# Read the database connection settings from the environment.
db_host = os.getenv('DB_HOST')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_name = os.getenv('DB_NAME')
db_port = os.getenv('DB_PORT')
csv_file_path = 'temperaturas_promedio_nyc_mensual.csv'  # Update this to point at the CSV file to import

# Fail early when any required setting is missing or empty.
if not all([db_host, db_user, db_password, db_name, db_port]):
    raise ValueError("No se pudieron cargar todas las variables de entorno. Por favor verifica el archivo .env.")

# Load the CSV file into a pandas DataFrame.
try:
    df = pd.read_csv(csv_file_path)
except FileNotFoundError as e:
    # Chain the original error so the traceback keeps the OS-level details.
    raise FileNotFoundError(f"El archivo {csv_file_path} no se encuentra. Por favor verifica la ruta.") from e

# Connect to the MySQL database.
try:
    db_connection = mysql.connector.connect(
        host=db_host,
        user=db_user,
        password=db_password,
        database=db_name,
        port=int(db_port)
    )
except mysql.connector.Error as err:
    raise mysql.connector.Error(f"Error al conectar a la base de datos: {err}") from err

# Insert every DataFrame row into the MySQL table. The CSV columns must be in
# the same order as the column list of the INSERT statement.
insert_query = """
INSERT INTO temperaturas (Mes, Manhattan, Brooklyn, Queens, The_Bronx, Staten_Island)
VALUES (%s, %s, %s, %s, %s, %s)
"""
# itertuples yields tuples of plain Python scalars; iterating df.values can
# hand NumPy scalar objects to the driver when every column is numeric.
data = list(df.itertuples(index=False, name=None))

cursor = db_connection.cursor()
try:
    cursor.executemany(insert_query, data)
    # Commit only after the whole batch was accepted.
    db_connection.commit()
finally:
    # Always release the cursor and connection, even when the insert fails.
    cursor.close()
    db_connection.close()

# Show the number of inserted rows.
print(f"Se han insertado {len(data)} filas en la base de datos.")


## Script de Importación de Taxi_Zones de NYC a MySQL


In [None]:
import pandas as pd
import os
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
from shapely import wkt

# Populate the process environment from the .env file before reading settings.
load_dotenv()

# Database connection settings taken from environment variables.
# DB_PORT falls back to 3306 when it is not set.
DB_CONFIG = dict(
    host=os.getenv("DB_HOST"),
    port=int(os.getenv("DB_PORT", 3306)),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    database=os.getenv("DB_NAME"),
)

def create_and_populate_taxi_zones_table(csv_file_path):
    """Create the ``taxi_zones`` table if needed and append the CSV rows to it.

    The CSV is expected to provide the columns ``LocationID``, ``borough``,
    ``zone`` and ``the_geom`` (WKT text). Rows are de-duplicated on
    ``LocationID`` before being appended.

    Errors are reported on stdout together with a traceback; they are not
    re-raised.

    Args:
        csv_file_path: Path of the taxi zones CSV file.
    """
    from urllib.parse import quote_plus

    engine = None
    try:
        # Read the CSV file into a pandas DataFrame.
        df = pd.read_csv(csv_file_path)

        # Parse every WKT string with shapely so malformed geometry aborts the
        # load. The parsed objects are not stored: the table keeps the raw text.
        df['the_geom'].apply(lambda x: wkt.loads(x) if pd.notnull(x) else None)

        # Keep only the columns that exist in the 'taxi_zones' table.
        df = df[['LocationID', 'borough', 'zone', 'the_geom']]

        # LocationID is the primary key, so drop repeated ids.
        df = df.drop_duplicates(subset='LocationID')

        # Percent-encode the credentials: characters such as '@', ':' or '/'
        # in a password would otherwise corrupt the connection URL.
        user = quote_plus(DB_CONFIG['user'] or '')
        password = quote_plus(DB_CONFIG['password'] or '')
        engine = create_engine(
            f"mysql+pymysql://{user}:{password}@"
            f"{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
        )

        create_table_query = text("""
        CREATE TABLE IF NOT EXISTS taxi_zones (
            LocationID INT PRIMARY KEY,
            Borough VARCHAR(50) NOT NULL,
            Zone VARCHAR(100) NOT NULL,
            the_geom TEXT NOT NULL
        );
        """)

        # engine.begin() commits on success, so the DDL does not depend on
        # driver-level autocommit behaviour.
        with engine.begin() as connection:
            connection.execute(create_table_query)

        # Append the prepared rows to the table.
        df.to_sql('taxi_zones', engine, if_exists='append', index=False)

        print("🚀 Tabla taxi_zones creada y datos insertados exitosamente")

    except Exception as e:
        print(f"❌ Error durante la creación o inserción de datos en la tabla taxi_zones: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Release pooled connections held by the engine.
        if engine is not None:
            engine.dispose()

# Path to the taxi zones CSV file.
csv_file_path = 'taxi_zones.csv'

# Create the table and load the CSV only when this code runs as the main
# program.
if __name__ == "__main__":
    create_and_populate_taxi_zones_table(csv_file_path)


## Consultas en MySQL

In [6]:
import mysql.connector
from dotenv import load_dotenv
import os
import logging

# Logging configuration: INFO level and above, timestamped records, written to
# the file 'database_query.log'.
logging.basicConfig(
    level=logging.INFO, 
    format='%(asctime)s - %(levelname)s: %(message)s',
    filename='database_query.log'
)
# Module-level logger used by the code below.
logger = logging.getLogger(__name__)

class DatabaseManager:
    """Manage MySQL connections and run simple queries.

    Connection settings are read from environment variables (loaded from a
    ``.env`` file when present) at construction time: ``DB_HOST``, ``DB_USER``,
    ``DB_PASSWORD``, ``DB_NAME`` and, optionally, ``DB_PORT``.
    """

    def __init__(self):
        # Load environment variables from the .env file.
        load_dotenv()

        # Database credentials.
        self.db_host = os.getenv('DB_HOST')
        self.db_user = os.getenv('DB_USER')
        self.db_password = os.getenv('DB_PASSWORD')
        self.db_name = os.getenv('DB_NAME')
        # Honour DB_PORT like the other scripts that share this .env file;
        # fall back to 3306 when it is unset or empty.
        self.db_port = int(os.getenv('DB_PORT') or 3306)

    def _create_connection(self):
        """Open and return a new MySQL connection.

        Raises:
            mysql.connector.Error: If the connection cannot be established
                (the error is logged and re-raised).
        """
        try:
            conn = mysql.connector.connect(
                host=self.db_host,
                port=self.db_port,
                user=self.db_user,
                password=self.db_password,
                database=self.db_name
            )
            logger.info("🟢 Conexión establecida exitosamente")
            return conn
        except mysql.connector.Error as err:
            logger.error(f"🔴 Error de conexión: {err}")
            raise

    def count_records(self):
        """Return the number of rows in the ``taxi_fhv_data`` table.

        The count is also logged and printed. Any exception is logged, printed
        and re-raised; the cursor and connection are closed in every case.
        """
        conn = None
        cursor = None
        try:
            conn = self._create_connection()
            cursor = conn.cursor()

            # Run the COUNT(1) query; the single result row holds the count.
            query = "SELECT COUNT(1) FROM taxi_fhv_data"
            cursor.execute(query)
            count = cursor.fetchone()[0]
            logger.info(f"📈 Total de registros: {count}")
            print(f"📈 Total de registros: {count}")
            return count

        except Exception as e:
            logger.error(f"❌ Error en la consulta: {e}")
            print(f"❌ Error en la consulta: {e}")
            raise

        finally:
            # Release resources whether or not the query succeeded.
            if cursor:
                cursor.close()
            if conn:
                conn.close()
                logger.info("🔌 Conexión cerrada")

            # Emitted on both the success and the failure path.
            print("✅ Proceso de consulta completado")
            logger.info("✅ Proceso de consulta completado")

def main():
    """Count the rows of the table and print the outcome.

    Any exception raised while counting is reported on stdout rather than
    propagated.
    """
    manager = DatabaseManager()

    try:
        print(f"🎉 Conteo exitoso: {manager.count_records()} registros")
    except Exception as exc:
        print(f"❌ El proceso falló: {exc}")


if __name__ == "__main__":
    main()

📈 Total de registros: 1472569
✅ Proceso de consulta completado
🎉 Conteo exitoso: 1472569 registros


## Nueva tabla para machine learning

In [8]:
import pandas as pd
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv
import time

# Populate the process environment from the .env file before reading settings.
load_dotenv()

# Database connection settings taken from environment variables.
# DB_PORT falls back to 3306 when it is not set.
DB_CONFIG = dict(
    host=os.getenv("DB_HOST"),
    port=int(os.getenv("DB_PORT", 3306)),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    database=os.getenv("DB_NAME"),
)

def fetch_and_process_batch(offset, batch_size):
    """Aggregate one page of trips and append it to ``enriched_taxi_data``.

    Trips from the last 44 months are counted per pickup date, zone, borough,
    day of month and hour. The page ``[offset, offset + batch_size)`` of that
    aggregated result is read and appended to the ``enriched_taxi_data`` table.

    Args:
        offset: Number of aggregated rows to skip.
        batch_size: Maximum number of aggregated rows to copy.

    Returns:
        The number of rows copied, or 0 when the batch failed (the error is
        printed, not raised).
    """
    from urllib.parse import quote_plus

    engine = None
    try:
        # Percent-encode the credentials: characters such as '@', ':' or '/'
        # in a password would otherwise corrupt the connection URL.
        user = quote_plus(DB_CONFIG['user'] or '')
        password = quote_plus(DB_CONFIG['password'] or '')
        engine = create_engine(
            f"mysql+pymysql://{user}:{password}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
        )

        # ORDER BY makes the paging deterministic: without it the server may
        # return groups in a different order on every call, so consecutive
        # LIMIT/OFFSET pages could repeat some groups and skip others.
        # int() guarantees only numbers are interpolated into the SQL text.
        query = f"""
        SELECT 
            DATE(t.Pickup_datetime) AS pickup_date,
            t.PULocationID,
            z.Borough AS pickup_borough,
            DAY(t.Pickup_datetime) AS pickup_day,
            HOUR(t.Pickup_datetime) AS pickup_hour,
            COUNT(*) AS trip_count
        FROM 
            taxi_fhv_data t
        JOIN 
            taxi_zones z ON t.PULocationID = z.LocationID
        WHERE 
            t.Pickup_datetime >= DATE_SUB(CURDATE(), INTERVAL 44 MONTH)
        GROUP BY
            pickup_date, t.PULocationID, pickup_borough, pickup_day, pickup_hour
        ORDER BY
            pickup_date, t.PULocationID, pickup_borough, pickup_day, pickup_hour
        LIMIT {int(batch_size)} OFFSET {int(offset)}
        """

        df = pd.read_sql(query, engine)

        df.to_sql('enriched_taxi_data',
                  engine,
                  if_exists='append',
                  index=False,
                  chunksize=25000)

        return len(df)

    except Exception as e:
        print(f"❌ Error en el procesamiento del lote: {e}")
        return 0

    finally:
        # A new engine is created on every call; dispose of it so its pooled
        # connections do not accumulate across batches.
        if engine is not None:
            engine.dispose()

def populate_enriched_taxi_data_table(batch_size=25000):
    """Fill the enriched table batch by batch, printing progress.

    The number of raw trips in the last 44 months is used as an upper bound
    for the paging loop. Each page is delegated to ``fetch_and_process_batch``.
    Errors are printed with a traceback and not re-raised.

    Args:
        batch_size: Number of aggregated rows requested per batch.
    """
    from urllib.parse import quote_plus

    engine = None
    try:
        start_time = time.time()
        # Percent-encode the credentials: characters such as '@', ':' or '/'
        # in a password would otherwise corrupt the connection URL.
        user = quote_plus(DB_CONFIG['user'] or '')
        password = quote_plus(DB_CONFIG['password'] or '')
        engine = create_engine(
            f"mysql+pymysql://{user}:{password}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
        )
        with engine.connect() as connection:
            count_query = """
            SELECT COUNT(*) as total_rows 
            FROM taxi_fhv_data 
            WHERE Pickup_datetime >= DATE_SUB(CURDATE(), INTERVAL 44 MONTH)
            """
            total_rows = int(pd.read_sql(count_query, connection)['total_rows'][0])

        # total_rows counts raw trips, while each batch returns aggregated
        # rows, so it is only an upper bound on the work to do.
        processed_rows = 0

        for offset in range(0, total_rows, batch_size):
            batch_start_time = time.time()
            processed = fetch_and_process_batch(offset, batch_size)
            processed_rows += processed

            # Time the batch and estimate the remaining duration from it.
            batch_time = time.time() - batch_start_time
            remaining_batches = (total_rows - processed_rows) // batch_size
            estimated_remaining = batch_time * remaining_batches

            print(f"🚦 Processed batch: {processed_rows}/{total_rows} rows "
                  f"({processed_rows/total_rows*100:.2f}%) "
                  f"| Last batch time: {batch_time:.2f}s "
                  f"| Est. remaining: {estimated_remaining/60:.2f} mins")

            # A non-empty but short page means the aggregated result is
            # exhausted; further offsets would only re-run the expensive
            # query to fetch nothing. A result of 0 is ambiguous (failed
            # batch or empty page), so the loop keeps going in that case.
            if 0 < processed < batch_size:
                break

        total_time = time.time() - start_time
        print("✅ Data enrichment completed successfully")
        print(f"⏱️ Total processing time: {total_time/60:.2f} minutes")
        print(f"📊 Total rows processed: {processed_rows}")

    except Exception as e:
        print(f"❌ Error during data enrichment: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Release pooled connections held by the engine.
        if engine is not None:
            engine.dispose()

# Run the enrichment process only when this code runs as the main program.
if __name__ == "__main__":
    populate_enriched_taxi_data_table()


🚦 Processed batch: 25000/1472569 rows (1.70%) | Last batch time: 24.54s | Est. remaining: 23.32 mins
🚦 Processed batch: 50000/1472569 rows (3.40%) | Last batch time: 26.81s | Est. remaining: 25.02 mins
🚦 Processed batch: 75000/1472569 rows (5.09%) | Last batch time: 25.36s | Est. remaining: 23.25 mins
🚦 Processed batch: 100000/1472569 rows (6.79%) | Last batch time: 25.96s | Est. remaining: 23.36 mins
🚦 Processed batch: 125000/1472569 rows (8.49%) | Last batch time: 28.73s | Est. remaining: 25.37 mins
🚦 Processed batch: 150000/1472569 rows (10.19%) | Last batch time: 30.63s | Est. remaining: 26.54 mins
🚦 Processed batch: 175000/1472569 rows (11.88%) | Last batch time: 28.11s | Est. remaining: 23.89 mins
🚦 Processed batch: 200000/1472569 rows (13.58%) | Last batch time: 26.59s | Est. remaining: 22.16 mins
🚦 Processed batch: 225000/1472569 rows (15.28%) | Last batch time: 26.39s | Est. remaining: 21.56 mins
🚦 Processed batch: 250000/1472569 rows (16.98%) | Last batch time: 25.51s | Est. 

In [14]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from dotenv import load_dotenv
import os

from urllib.parse import quote_plus

# Load environment variables.
load_dotenv()

# Database connection settings (DB_PORT defaults to 3306).
DB_CONFIG = {
    'host': os.getenv("DB_HOST"),
    'port': int(os.getenv("DB_PORT", 3306)),
    'user': os.getenv("DB_USER"),
    'password': os.getenv("DB_PASSWORD"),
    'database': os.getenv("DB_NAME")
}

# Create the SQLAlchemy engine. The credentials are percent-encoded because
# characters such as '@', ':' or '/' in a password would otherwise corrupt
# the connection URL.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(DB_CONFIG['user'] or '')}:{quote_plus(DB_CONFIG['password'] or '')}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

# Read the whole enriched_taxi_data table.
query = "SELECT * FROM enriched_taxi_data"
df = pd.read_sql(query, engine)

# Select the features and the target variable.
X = df[['PULocationID', 'pickup_borough', 'pickup_day', 'pickup_hour']]
y = df['trip_count']

# One-hot encode 'pickup_borough'; drop_first removes the first category,
# which is then represented by all dummy columns being 0.
X = pd.get_dummies(X, columns=['pickup_borough'], drop_first=True)

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate the model.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.2f}")

# Predict the demand for a given zone, day, hour and borough.
def predict_demand(pulocationid, pickup_day, pickup_hour, pickup_borough):
    """Return the model's predicted trip count for one pickup scenario.

    The feature row is laid out like the training matrix ``X``: the three
    numeric features first, then one dummy column per borough known to ``X``.
    A borough without a dummy column leaves every dummy at 0.
    """
    features = {
        'PULocationID': [pulocationid],
        'pickup_day': [pickup_day],
        'pickup_hour': [pickup_hour],
    }

    # Fill the borough dummies in training order, flagging only the requested one.
    wanted_dummy = f'pickup_borough_{pickup_borough}'
    for name in X.columns:
        if name.startswith('pickup_borough_'):
            features[name] = [1 if name == wanted_dummy else 0]

    return model.predict(pd.DataFrame(features))[0]

# Example call: zone 11, pickup_day 11, hour 13, borough 'Manhattan'.
predicted_demand = predict_demand(11, 11, 13, 'Manhattan')
print(f"Predicted Demand: {predicted_demand:.2f}")


MAE: 3.46
R²: 0.14
Predicted Demand: 9.20


In [15]:
import pandas as pd
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv
import time

# Populate the process environment from the .env file before reading settings.
load_dotenv()

# Database connection settings taken from environment variables.
# DB_PORT falls back to 3306 when it is not set.
DB_CONFIG = dict(
    host=os.getenv("DB_HOST"),
    port=int(os.getenv("DB_PORT", 3306)),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    database=os.getenv("DB_NAME"),
)

def fetch_and_process_batch(offset, batch_size):
    """Aggregate one page of trips and append it to ``enriched_taxi_data2``.

    Trips from the last 44 months are counted per pickup date, zone, weekday
    (MySQL ``DAYOFWEEK``) and hour. The page ``[offset, offset + batch_size)``
    of that aggregated result is read and appended to ``enriched_taxi_data2``.

    Args:
        offset: Number of aggregated rows to skip.
        batch_size: Maximum number of aggregated rows to copy.

    Returns:
        The number of rows copied, or 0 when the batch failed (the error is
        printed, not raised).
    """
    from urllib.parse import quote_plus

    engine = None
    try:
        # Percent-encode the credentials: characters such as '@', ':' or '/'
        # in a password would otherwise corrupt the connection URL.
        user = quote_plus(DB_CONFIG['user'] or '')
        password = quote_plus(DB_CONFIG['password'] or '')
        engine = create_engine(
            f"mysql+pymysql://{user}:{password}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
        )

        # ORDER BY makes the paging deterministic: without it the server may
        # return groups in a different order on every call, so consecutive
        # LIMIT/OFFSET pages could repeat some groups and skip others.
        # int() guarantees only numbers are interpolated into the SQL text.
        query = f"""
        SELECT 
            DATE(t.Pickup_datetime) AS pickup_date,
            t.PULocationID,
            DAYOFWEEK(t.Pickup_datetime) AS pickup_weekday,
            HOUR(t.Pickup_datetime) AS pickup_hour,
            COUNT(*) AS trip_count
        FROM 
            taxi_fhv_data t
        JOIN 
            taxi_zones z ON t.PULocationID = z.LocationID
        WHERE 
            t.Pickup_datetime >= DATE_SUB(CURDATE(), INTERVAL 44 MONTH)
        GROUP BY
            pickup_date, t.PULocationID, pickup_weekday, pickup_hour
        ORDER BY
            pickup_date, t.PULocationID, pickup_weekday, pickup_hour
        LIMIT {int(batch_size)} OFFSET {int(offset)}
        """

        df = pd.read_sql(query, engine)

        df.to_sql('enriched_taxi_data2',
                  engine,
                  if_exists='append',
                  index=False,
                  chunksize=25000)

        return len(df)

    except Exception as e:
        print(f"❌ Error en el procesamiento del lote: {e}")
        return 0

    finally:
        # A new engine is created on every call; dispose of it so its pooled
        # connections do not accumulate across batches.
        if engine is not None:
            engine.dispose()

def populate_enriched_taxi_data_table(batch_size=25000):
    """Fill the enriched table batch by batch, printing progress.

    The number of raw trips in the last 44 months is used as an upper bound
    for the paging loop. Each page is delegated to ``fetch_and_process_batch``.
    Errors are printed with a traceback and not re-raised.

    Args:
        batch_size: Number of aggregated rows requested per batch.
    """
    from urllib.parse import quote_plus

    engine = None
    try:
        start_time = time.time()
        # Percent-encode the credentials: characters such as '@', ':' or '/'
        # in a password would otherwise corrupt the connection URL.
        user = quote_plus(DB_CONFIG['user'] or '')
        password = quote_plus(DB_CONFIG['password'] or '')
        engine = create_engine(
            f"mysql+pymysql://{user}:{password}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
        )
        with engine.connect() as connection:
            count_query = """
            SELECT COUNT(*) as total_rows 
            FROM taxi_fhv_data 
            WHERE Pickup_datetime >= DATE_SUB(CURDATE(), INTERVAL 44 MONTH)
            """
            total_rows = int(pd.read_sql(count_query, connection)['total_rows'][0])

        # total_rows counts raw trips, while each batch returns aggregated
        # rows, so it is only an upper bound on the work to do.
        processed_rows = 0

        for offset in range(0, total_rows, batch_size):
            batch_start_time = time.time()
            processed = fetch_and_process_batch(offset, batch_size)
            processed_rows += processed

            # Time the batch and estimate the remaining duration from it.
            batch_time = time.time() - batch_start_time
            remaining_batches = (total_rows - processed_rows) // batch_size
            estimated_remaining = batch_time * remaining_batches

            print(f"🚦 Processed batch: {processed_rows}/{total_rows} rows "
                  f"({processed_rows/total_rows*100:.2f}%) "
                  f"| Last batch time: {batch_time:.2f}s "
                  f"| Est. remaining: {estimated_remaining/60:.2f} mins")

            # A non-empty but short page means the aggregated result is
            # exhausted; further offsets would only re-run the expensive
            # query to fetch nothing. A result of 0 is ambiguous (failed
            # batch or empty page), so the loop keeps going in that case.
            if 0 < processed < batch_size:
                break

        total_time = time.time() - start_time
        print("✅ Data enrichment completed successfully")
        print(f"⏱️ Total processing time: {total_time/60:.2f} minutes")
        print(f"📊 Total rows processed: {processed_rows}")

    except Exception as e:
        print(f"❌ Error during data enrichment: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Release pooled connections held by the engine.
        if engine is not None:
            engine.dispose()

# Run the enrichment process only when this code runs as the main program.
if __name__ == "__main__":
    populate_enriched_taxi_data_table()


🚦 Processed batch: 25000/1472569 rows (1.70%) | Last batch time: 24.33s | Est. remaining: 23.11 mins
🚦 Processed batch: 50000/1472569 rows (3.40%) | Last batch time: 20.45s | Est. remaining: 19.09 mins
🚦 Processed batch: 75000/1472569 rows (5.09%) | Last batch time: 14.40s | Est. remaining: 13.20 mins
🚦 Processed batch: 100000/1472569 rows (6.79%) | Last batch time: 16.22s | Est. remaining: 14.59 mins
🚦 Processed batch: 125000/1472569 rows (8.49%) | Last batch time: 19.26s | Est. remaining: 17.01 mins
🚦 Processed batch: 150000/1472569 rows (10.19%) | Last batch time: 16.21s | Est. remaining: 14.05 mins
🚦 Processed batch: 175000/1472569 rows (11.88%) | Last batch time: 18.15s | Est. remaining: 15.43 mins
🚦 Processed batch: 200000/1472569 rows (13.58%) | Last batch time: 18.38s | Est. remaining: 15.32 mins
🚦 Processed batch: 225000/1472569 rows (15.28%) | Last batch time: 18.26s | Est. remaining: 14.91 mins
🚦 Processed batch: 250000/1472569 rows (16.98%) | Last batch time: 16.89s | Est. 

In [14]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from dotenv import load_dotenv
import os

from urllib.parse import quote_plus

# Load environment variables.
load_dotenv()

# Database connection settings (DB_PORT defaults to 3306).
DB_CONFIG = {
    'host': os.getenv("DB_HOST"),
    'port': int(os.getenv("DB_PORT", 3306)),
    'user': os.getenv("DB_USER"),
    'password': os.getenv("DB_PASSWORD"),
    'database': os.getenv("DB_NAME")
}

# Create the SQLAlchemy engine. The credentials are percent-encoded because
# characters such as '@', ':' or '/' in a password would otherwise corrupt
# the connection URL.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(DB_CONFIG['user'] or '')}:{quote_plus(DB_CONFIG['password'] or '')}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

# Read the whole enriched_taxi_data2 table.
query = "SELECT * FROM enriched_taxi_data2"
df = pd.read_sql(query, engine)

# Select the features and the target variable.
X = df[['PULocationID', 'pickup_weekday', 'pickup_hour']]
y = df['trip_count']

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate the model.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.2f}")

# Predict the demand for a given zone, weekday and hour.
def predict_demand(pulocationid, pickup_weekday, pickup_hour):
    """Return the model's predicted trip count for one pickup scenario.

    A one-row frame is built with the same column names the model was
    trained on, then passed to ``model.predict``.
    """
    named_values = (
        ('PULocationID', pulocationid),
        ('pickup_weekday', pickup_weekday),
        ('pickup_hour', pickup_hour),
    )
    features = pd.DataFrame({name: [value] for name, value in named_values})
    return model.predict(features)[0]

# Example calls to predict_demand. Arguments are (PULocationID, pickup_weekday,
# pickup_hour); every call below uses weekday code 3 and hour 12.
# NOTE(review): enriched_taxi_data2 is filled with MySQL DAYOFWEEK(), which
# numbers days from 1 = Sunday, so code 3 would be Tuesday rather than
# Wednesday. Confirm the intended day.
predicted_demand = predict_demand(84, 3, 12)  # zone 84, weekday code 3, hour 12
print(f"Predicted Demand: {predicted_demand:.2f}")

predicted_demand1 = predict_demand(99, 3, 12)  # zone 99, weekday code 3, hour 12
print(f"Predicted Demand: {predicted_demand1:.2f}")

predicted_demand2 = predict_demand(204, 3, 12)  # zone 204, weekday code 3, hour 12
print(f"Predicted Demand: {predicted_demand2:.2f}")


MAE: 3.67
R²: 0.03
Predicted Demand: 4.63
Predicted Demand: 4.40
Predicted Demand: 2.75


In [10]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from dotenv import load_dotenv
import os
import ast

from urllib.parse import quote_plus

# Load environment variables.
load_dotenv()

# Database connection settings (DB_PORT defaults to 3306).
DB_CONFIG = {
    'host': os.getenv("DB_HOST"),
    'port': int(os.getenv("DB_PORT", 3306)),
    'user': os.getenv("DB_USER"),
    'password': os.getenv("DB_PASSWORD"),
    'database': os.getenv("DB_NAME")
}

# Create the SQLAlchemy engine. The credentials are percent-encoded because
# characters such as '@', ':' or '/' in a password would otherwise corrupt
# the connection URL.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(DB_CONFIG['user'] or '')}:{quote_plus(DB_CONFIG['password'] or '')}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

# Read the whole enriched_taxi_data2 table.
query = "SELECT * FROM enriched_taxi_data2"
df = pd.read_sql(query, engine)

# Read the adjacent-zones file. The 'adjacent_zones' column holds Python
# literals stored as text; ast.literal_eval parses them without executing code.
adjacent_zones_df = pd.read_csv('adjacent_zones.csv')
adjacent_zones_df['adjacent_zones'] = adjacent_zones_df['adjacent_zones'].apply(ast.literal_eval)

# Look up the zones adjacent to a given zone.
def get_adjacent_zones(location_id):
    """Return the adjacency entry stored for ``location_id``.

    The first row of ``adjacent_zones_df`` whose ``LocationID`` matches is
    used; an empty list is returned when no row matches.
    """
    matches = adjacent_zones_df.loc[adjacent_zones_df['LocationID'] == location_id]
    if matches.empty:
        return []
    return matches.iloc[0]['adjacent_zones']

# Select the features and the target variable.
X = df[['PULocationID', 'pickup_weekday', 'pickup_hour']]
y = df['trip_count']

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate the model with mean absolute error and R².
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.2f}")

# Predict the demand for a zone together with its adjacent zones.
def predict_demand_with_adjacent(pulocationid, pickup_weekday, pickup_hour):
    """Return predicted demand per zone for a zone and its neighbours.

    The adjacent zones come first and the requested zone last; all of them
    share the same weekday and hour. The result maps each zone id to the
    model's prediction for it.
    """
    zones = get_adjacent_zones(pulocationid) + [pulocationid]
    zone_count = len(zones)

    # One feature row per zone.
    features = pd.DataFrame({
        'PULocationID': zones,
        'pickup_weekday': [pickup_weekday] * zone_count,
        'pickup_hour': [pickup_hour] * zone_count,
    })

    return dict(zip(zones, model.predict(features)))

# Example call to predict_demand_with_adjacent.
# NOTE(review): the training table's weekday codes come from MySQL
# DAYOFWEEK() (1 = Sunday), so code 3 would be Tuesday. Confirm the intended day.
predicted_demands = predict_demand_with_adjacent(26, 3, 19)  # zone 26, weekday code 3, hour 19

# Pick the zone with the highest predicted demand.
max_zone = max(predicted_demands, key=predicted_demands.get)
print(f"Zone with highest demand: {max_zone}, Demand: {predicted_demands[max_zone]:.2f}")


MAE: 3.67
R²: 0.03
Zone with highest demand: 21, Demand: 6.25


## Crear Archivo de Machine Learning PKL

In [11]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from dotenv import load_dotenv
import os
import ast
import joblib  # Importar joblib para guardar y cargar el modelo

from urllib.parse import quote_plus

# Load environment variables.
load_dotenv()

# Database connection settings (DB_PORT defaults to 3306).
DB_CONFIG = {
    'host': os.getenv("DB_HOST"),
    'port': int(os.getenv("DB_PORT", 3306)),
    'user': os.getenv("DB_USER"),
    'password': os.getenv("DB_PASSWORD"),
    'database': os.getenv("DB_NAME")
}

# Create the SQLAlchemy engine. The credentials are percent-encoded because
# characters such as '@', ':' or '/' in a password would otherwise corrupt
# the connection URL.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(DB_CONFIG['user'] or '')}:{quote_plus(DB_CONFIG['password'] or '')}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

# Read the whole enriched_taxi_data2 table.
query = "SELECT * FROM enriched_taxi_data2"
df = pd.read_sql(query, engine)

# Read the adjacent-zones file. The 'adjacent_zones' column holds Python
# literals stored as text; ast.literal_eval parses them without executing code.
adjacent_zones_df = pd.read_csv('adjacent_zones.csv')
adjacent_zones_df['adjacent_zones'] = adjacent_zones_df['adjacent_zones'].apply(ast.literal_eval)

# Look up the zones adjacent to a given zone.
def get_adjacent_zones(location_id):
    """Return the adjacency entry stored for ``location_id``.

    The first row of ``adjacent_zones_df`` whose ``LocationID`` matches is
    used; an empty list is returned when no row matches.
    """
    matches = adjacent_zones_df.loc[adjacent_zones_df['LocationID'] == location_id]
    if matches.empty:
        return []
    return matches.iloc[0]['adjacent_zones']

# Select the features and the target variable.
X = df[['PULocationID', 'pickup_weekday', 'pickup_hour']]
y = df['trip_count']

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model.
model = LinearRegression()
model.fit(X_train, y_train)

# Persist the trained model to 'linear_regression_model.pkl' with joblib.
joblib.dump(model, 'linear_regression_model.pkl')

# Evaluate the model on the test set with mean absolute error and R².
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.2f}")


MAE: 3.67
R²: 0.03


## Prueba modelo 2

In [None]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from dotenv import load_dotenv
import os

from urllib.parse import quote_plus

# Load environment variables.
load_dotenv()

# Database connection settings (DB_PORT defaults to 3306).
DB_CONFIG = {
    'host': os.getenv("DB_HOST"),
    'port': int(os.getenv("DB_PORT", 3306)),
    'user': os.getenv("DB_USER"),
    'password': os.getenv("DB_PASSWORD"),
    'database': os.getenv("DB_NAME")
}

# Create the SQLAlchemy engine. The credentials are percent-encoded because
# characters such as '@', ':' or '/' in a password would otherwise corrupt
# the connection URL.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(DB_CONFIG['user'] or '')}:{quote_plus(DB_CONFIG['password'] or '')}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

# Read the raw trips from the taxi_fhv_data table.
query = "SELECT * FROM taxi_fhv_data"
df = pd.read_sql(query, engine)

# Extract the time features; parse the timestamp column only once.
pickup_ts = pd.to_datetime(df['Pickup_datetime'])
df['pickup_weekday'] = pickup_ts.dt.dayofweek + 1  # 1 (Monday) to 7 (Sunday)
df['pickup_hour'] = pickup_ts.dt.hour

# The target used below is 'trip_count'. Elsewhere in this notebook that
# column is produced by COUNT(*) over the raw trips, so when the raw table
# does not provide it, derive it here by counting trips per
# (date, zone, weekday, hour) instead of failing with a KeyError.
if 'trip_count' not in df.columns:
    df = (
        df.assign(pickup_date=pickup_ts.dt.normalize())
        .groupby(['pickup_date', 'PULocationID', 'pickup_weekday', 'pickup_hour'])
        .size()
        .reset_index(name='trip_count')
    )

# Select the features and the target variable.
X = df[['PULocationID', 'pickup_weekday', 'pickup_hour']]
y = df['trip_count']

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate the model.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.2f}")

# Predict the demand for a given zone, weekday and hour.
def predict_demand(pulocationid, pickup_weekday, pickup_hour):
    """Return the model's predicted trip count for one pickup scenario.

    A one-row frame is built with the same column names the model was
    trained on, then passed to ``model.predict``.
    """
    named_values = (
        ('PULocationID', pulocationid),
        ('pickup_weekday', pickup_weekday),
        ('pickup_hour', pickup_hour),
    )
    features = pd.DataFrame({name: [value] for name, value in named_values})
    return model.predict(features)[0]

# Example calls to predict_demand. Arguments are (PULocationID, pickup_weekday,
# pickup_hour). With this cell's 1 = Monday numbering, weekday 3 is Wednesday.
predicted_demand = predict_demand(84, 3, 12)  # zone 84, Wednesday, hour 12
print(f"Predicted Demand: {predicted_demand:.2f}")

predicted_demand1 = predict_demand(99, 3, 12)  # zone 99, Wednesday, hour 12
print(f"Predicted Demand: {predicted_demand1:.2f}")

predicted_demand2 = predict_demand(204, 3, 12)  # zone 204, Wednesday, hour 12
print(f"Predicted Demand: {predicted_demand2:.2f}")
