## Eliminar fechas fuera del rango necesario en MySQL

Se utiliza un código, ejecutado localmente o en Colab, para la limpieza de la tabla taxi_fhv_data.

In [None]:
import mysql.connector
from dotenv import load_dotenv
import os
import time
import logging
from datetime import datetime

# Configure logging.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it basicConfig() silently does nothing when logging was already
# configured (notebook kernels, or an earlier cell that called basicConfig), and
# nothing would be written to the log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    filename='database_deletion.log',
    force=True,
)
logger = logging.getLogger(__name__)

class DatabaseManager:
    """Open MySQL connections from environment settings and prune ``taxi_fhv_data``.

    Connection settings are read from the ``DB_HOST``, ``DB_USER``,
    ``DB_PASSWORD`` and ``DB_NAME`` environment variables after ``load_dotenv()``
    has been called.
    """

    # MySQL error number for "Lock wait timeout exceeded".
    _LOCK_WAIT_TIMEOUT_ERRNO = 1205
    # Attempts per batch before a persistent lock timeout is propagated.
    _MAX_RETRIES = 5

    def __init__(self):
        """Read the database credentials from the environment."""
        load_dotenv()

        self.db_host = os.getenv('DB_HOST')
        self.db_user = os.getenv('DB_USER')
        self.db_password = os.getenv('DB_PASSWORD')
        self.db_name = os.getenv('DB_NAME')

    def _create_connection(self):
        """Return a new MySQL connection; log and re-raise connector errors."""
        try:
            return mysql.connector.connect(
                host=self.db_host,
                user=self.db_user,
                password=self.db_password,
                database=self.db_name
            )
        except mysql.connector.Error as err:
            logger.error(f"Connection error: {err}")
            raise

    @staticmethod
    def _keep_window(start_date, end_date):
        """Return ``(lower, upper)`` datetimes bounding the rows to keep.

        Rows are kept when ``lower <= Pickup_datetime < upper``. ``upper`` is the
        midnight *after* ``end_date`` so the whole final day is retained; a plain
        ``BETWEEN start AND end`` on date strings compares against midnight of
        ``end_date`` and would drop almost all of that day.

        :raises ValueError: if either date is not in ``YYYY-MM-DD`` format
        :raises TypeError: if either date is not a string
        """
        # Local import: only ``datetime`` is imported at the top of this cell.
        from datetime import timedelta

        lower = datetime.strptime(start_date, '%Y-%m-%d')
        upper = datetime.strptime(end_date, '%Y-%m-%d') + timedelta(days=1)
        return lower, upper

    def _delete_batch(self, conn, cursor, query, params):
        """Execute one DELETE batch and commit it; return the number of rows removed.

        Lock wait timeouts are retried with exponential backoff (1, 2, 4, ...
        seconds). Once the attempts are exhausted the last error is re-raised
        instead of being dropped, so the caller never works with a stale count.
        Any other connector error is re-raised immediately.
        """
        for attempt in range(self._MAX_RETRIES):
            try:
                cursor.execute(query, params)
                conn.commit()
                return cursor.rowcount
            except mysql.connector.Error as err:
                if err.errno != self._LOCK_WAIT_TIMEOUT_ERRNO:
                    raise
                # Roll back first so the failed transaction is not held open
                # while sleeping.
                conn.rollback()
                if attempt + 1 == self._MAX_RETRIES:
                    raise
                wait_time = 2 ** attempt
                logger.warning(f"Lock timeout. Retry {attempt + 1}/{self._MAX_RETRIES}. Waiting {wait_time} seconds")
                print(f"Lock timeout. Retry {attempt + 1}/{self._MAX_RETRIES}. Waiting {wait_time} seconds")
                time.sleep(wait_time)

    def delete_records_outside_date_range(self, start_date, end_date, batch_size=10000):
        """
        Delete records outside the specified date range, in committed batches.

        Both dates are inclusive calendar days: rows whose ``Pickup_datetime``
        falls anywhere on ``start_date`` through ``end_date`` are kept. Rows with
        a NULL ``Pickup_datetime`` match neither comparison and are left alone.

        :param start_date: First day to keep, as 'YYYY-MM-DD'
        :param end_date: Last day to keep, as 'YYYY-MM-DD'
        :param batch_size: Maximum number of records deleted per batch/commit
        :return: Total number of deleted rows; 0 (nothing deleted) when the dates
                 are malformed or start_date is after end_date
        :raises Exception: any database error is logged, printed and re-raised
        """
        conn = None
        cursor = None
        total_deleted = 0

        try:
            # Print and log start of deletion process
            print("Starting deletion process...")
            logger.info("Starting deletion process...")

            # Validate the dates before opening a connection.
            try:
                keep_from, keep_until = self._keep_window(start_date, end_date)
            except (TypeError, ValueError):
                logger.error("Invalid date format. Use YYYY-MM-DD")
                print("Invalid date format. Use YYYY-MM-DD")
                return 0

            # A reversed range would match (and delete) every row in the table.
            if keep_from >= keep_until:
                logger.error("Invalid date range. start_date must not be after end_date")
                print("Invalid date range. start_date must not be after end_date")
                return 0

            conn = self._create_connection()
            conn.autocommit = False  # Each batch is committed explicitly
            cursor = conn.cursor()

            # Values are passed as parameters rather than formatted into the SQL.
            outside_range = "Pickup_datetime < %s OR Pickup_datetime >= %s"
            bounds = (keep_from, keep_until)

            # Initial check to estimate total records to delete
            cursor.execute(
                f"SELECT COUNT(*) FROM taxi_fhv_data WHERE {outside_range}",
                bounds,
            )
            total_records = cursor.fetchone()[0]
            logger.info(f"Total records to delete: {total_records}")
            print(f"Total records to delete: {total_records}")

            delete_query = f"DELETE FROM taxi_fhv_data WHERE {outside_range} LIMIT %s"
            delete_params = bounds + (int(batch_size),)

            # Delete in batches to reduce lock contention
            while True:
                rows_deleted = self._delete_batch(conn, cursor, delete_query, delete_params)
                total_deleted += rows_deleted

                logger.info(f"Batch delete - Rows deleted: {rows_deleted}")
                print(f"Batch delete - Rows deleted: {rows_deleted}")

                # An empty batch means nothing outside the range is left
                if rows_deleted == 0:
                    break

            logger.info(f"Total records deleted: {total_deleted}")
            print(f"Total records deleted: {total_deleted}")
            return total_deleted

        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"Deletion error: {e}")
            print(f"Deletion error: {e}")
            raise
        finally:
            # Ensure resources are closed
            if cursor:
                cursor.close()
            if conn:
                conn.close()

            # Print and log end of deletion process
            print("Deletion process completed.")
            logger.info("Deletion process completed.")

def main():
    """Prune taxi_fhv_data to the hard-coded 2024 pickup range and report the result."""
    keep_from, keep_to = '2024-01-01', '2024-08-31'
    manager = DatabaseManager()

    try:
        removed = manager.delete_records_outside_date_range(keep_from, keep_to)
        print(f"Successfully deleted {removed} records.")
    except Exception as exc:
        # Failures are reported on stdout instead of propagating out of the script.
        print(f"Deletion process failed: {exc}")


if __name__ == "__main__":
    main()


Starting deletion process...


## Eliminar FHV en taxi_fhv_data

Se elimina por fecha cada 15 días para optimizar el proceso.

In [None]:
import mysql.connector
from dotenv import load_dotenv
import os
import time
import logging
from datetime import datetime, timedelta

# Configure logging.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it basicConfig() silently does nothing when logging was already
# configured (notebook kernels, or an earlier cell that called basicConfig), and
# nothing would be written to the log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    filename='database_deletion.log',
    force=True,
)
logger = logging.getLogger(__name__)

class DatabaseManager:
    """Open MySQL connections from environment settings and delete source 'U' rows.

    Connection settings are read from the ``DB_HOST``, ``DB_USER``,
    ``DB_PASSWORD`` and ``DB_NAME`` environment variables after ``load_dotenv()``
    has been called.
    """

    # MySQL error number for "Lock wait timeout exceeded".
    _LOCK_WAIT_TIMEOUT_ERRNO = 1205
    # Attempts per batch before a persistent lock timeout is propagated.
    _MAX_RETRIES = 5

    def __init__(self):
        """Read the database credentials from the environment."""
        load_dotenv()

        self.db_host = os.getenv('DB_HOST')
        self.db_user = os.getenv('DB_USER')
        self.db_password = os.getenv('DB_PASSWORD')
        self.db_name = os.getenv('DB_NAME')

    def _create_connection(self):
        """Return a new MySQL connection; log and re-raise connector errors."""
        try:
            return mysql.connector.connect(
                host=self.db_host,
                user=self.db_user,
                password=self.db_password,
                database=self.db_name
            )
        except mysql.connector.Error as err:
            logger.error(f"🔴 Connection error: {err}")
            raise

    @staticmethod
    def _date_windows(first_day, last_day, window_days):
        """Yield contiguous half-open ``(lower, upper)`` datetime windows.

        The windows cover ``first_day`` through the end of ``last_day`` with no
        gaps and no overlap: each ``upper`` is the next ``lower``, and the final
        ``upper`` is clamped to the midnight after ``last_day``. Nothing is
        yielded when ``first_day`` is after ``last_day``.
        """
        step = timedelta(days=window_days)
        stop = last_day + timedelta(days=1)
        lower = first_day
        while lower < stop:
            upper = min(lower + step, stop)
            yield lower, upper
            lower = upper

    def _delete_batch(self, conn, cursor, query, params):
        """Execute one DELETE batch and commit it; return the number of rows removed.

        Lock wait timeouts are retried with exponential backoff (1, 2, 4, ...
        seconds). Once the attempts are exhausted the last error is re-raised
        instead of being dropped, so the caller never works with a stale count.
        Any other connector error is re-raised immediately.
        """
        for attempt in range(self._MAX_RETRIES):
            try:
                cursor.execute(query, params)
                conn.commit()
                return cursor.rowcount
            except mysql.connector.Error as err:
                if err.errno != self._LOCK_WAIT_TIMEOUT_ERRNO:
                    raise
                # Roll back first so the failed transaction is not held open
                # while sleeping.
                conn.rollback()
                if attempt + 1 == self._MAX_RETRIES:
                    raise
                wait_time = 2 ** attempt
                logger.warning(f"⏳ Lock timeout. Retry {attempt + 1}/{self._MAX_RETRIES}. Waiting {wait_time} seconds")
                print(f"⏳ Lock timeout. Retry {attempt + 1}/{self._MAX_RETRIES}. Waiting {wait_time} seconds")
                time.sleep(wait_time)

    def delete_records_with_source_U_in_date_range(self, start_date, end_date, batch_size=100000, window_days=5):
        """
        Delete records whose source is 'U' within a date range, one window at a time.

        The range is walked in consecutive windows of ``window_days`` days. Both
        dates are inclusive calendar days: every row with a ``Pickup_datetime``
        on ``start_date`` through ``end_date`` is eligible, and nothing after
        ``end_date`` is touched. Inside each window rows are deleted in
        committed batches.

        :param start_date: First day of the range in 'YYYY-MM-DD' format
        :param end_date: Last day of the range in 'YYYY-MM-DD' format
        :param batch_size: Maximum number of records deleted per batch/commit
        :param window_days: Length of each date window in days (at least 1)
        :return: Total number of deleted rows
        :raises ValueError: if a date is malformed or window_days is below 1
        :raises Exception: any database error is logged, printed and re-raised
        """
        conn = None
        cursor = None
        total_deleted = 0

        try:
            # A non-positive window would never advance through the range.
            if window_days < 1:
                raise ValueError("window_days must be at least 1")

            # Parse the dates before connecting so bad input fails fast.
            first_day = datetime.strptime(start_date, '%Y-%m-%d')
            last_day = datetime.strptime(end_date, '%Y-%m-%d')

            conn = self._create_connection()
            conn.autocommit = False  # Each batch is committed explicitly
            cursor = conn.cursor()

            # Print and log start of deletion process
            print("🚀 Starting deletion process...")
            logger.info("🚀 Starting deletion process...")

            # Half-open bounds keep adjacent windows from sharing a boundary
            # instant; values are passed as parameters, not formatted into SQL.
            in_window = "source = 'U' AND Pickup_datetime >= %s AND Pickup_datetime < %s"
            count_query = f"SELECT COUNT(*) FROM taxi_fhv_data WHERE {in_window}"
            delete_query = f"DELETE FROM taxi_fhv_data WHERE {in_window} LIMIT %s"

            for lower, upper in self._date_windows(first_day, last_day, window_days):
                # Messages show the inclusive first and last day of the window.
                label_from = lower.strftime('%Y-%m-%d')
                label_to = (upper - timedelta(days=1)).strftime('%Y-%m-%d')

                # Initial check to estimate total records to delete in this window
                cursor.execute(count_query, (lower, upper))
                total_records = cursor.fetchone()[0]
                logger.info(f"📊 Total records to delete from {label_from} to {label_to}: {total_records}")
                print(f"📊 Total records to delete from {label_from} to {label_to}: {total_records}")

                # Delete in batches to reduce lock contention
                delete_params = (lower, upper, int(batch_size))
                while True:
                    rows_deleted = self._delete_batch(conn, cursor, delete_query, delete_params)
                    total_deleted += rows_deleted

                    logger.info(f"🗑️ Batch delete - Rows deleted from {label_from} to {label_to}: {rows_deleted}")
                    print(f"🗑️ Batch delete - Rows deleted from {label_from} to {label_to}: {rows_deleted}")

                    # An empty batch means this window is clean
                    if rows_deleted == 0:
                        break

            logger.info(f"📈 Total records deleted: {total_deleted}")
            print(f"📈 Total records deleted: {total_deleted}")
            return total_deleted

        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"❌ Deletion error: {e}")
            print(f"❌ Deletion error: {e}")
            raise
        finally:
            # Ensure resources are closed
            if cursor:
                cursor.close()
            if conn:
                conn.close()

            # Print and log end of deletion process
            print("✅ Deletion process completed.")
            logger.info("✅ Deletion process completed.")

def main():
    """Delete source 'U' rows for the hard-coded late-August 2024 range and report the count."""
    date_range = ('2024-08-26', '2024-08-31')
    manager = DatabaseManager()

    try:
        removed = manager.delete_records_with_source_U_in_date_range(*date_range)
        print(f"✅ Successfully deleted {removed} records.")
    except Exception as exc:
        # Failures are reported on stdout instead of propagating out of the script.
        print(f"❌ Deletion process failed: {exc}")


if __name__ == "__main__":
    main()

🚀 Starting deletion process...


## Crear la nueva tabla

In [None]:
import pymysql
import pandas as pd
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
import numpy as np

# 🌟 Load environment variables (python-dotenv)
load_dotenv()

# 🔐 Database connection settings read from the environment;
# DB_PORT falls back to 3306 when it is not set
DB_CONFIG = dict(
    host=os.getenv("DB_HOST"),
    port=int(os.getenv("DB_PORT", 3306)),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    database=os.getenv("DB_NAME"),
)

def populate_enriched_taxi_data_table(batch_size=100000):
    """Build ``enriched_taxi_data`` from ``taxi_fhv_data`` joined with zones and traffic.

    Every trip is joined to its pickup zone (inner join on ``taxi_zones``) and,
    when one exists, to the ``trafico`` row with the same year/month/day/hour/
    minute. Missing traffic volumes are stored as 0. Rows are appended to
    ``enriched_taxi_data``.

    The join is executed once and streamed in chunks of ``batch_size`` rows.
    Paging the same query with ``LIMIT ... OFFSET`` and no ``ORDER BY`` gives no
    stable row order between pages, so rows could be duplicated or skipped.

    :param batch_size: Number of rows read from the server and appended per chunk
    :return: None. Errors are printed rather than raised; rows appended before
             a failure remain in the target table.
    """
    # Local import: escapes special characters (e.g. '@', ':', '/') in the
    # credentials so they cannot corrupt the connection URL.
    from urllib.parse import quote_plus

    engine = None
    try:
        # 🚀 Create SQLAlchemy engine used for both reading and bulk inserts
        engine = create_engine(
            "mysql+pymysql://{user}:{password}@{host}:{port}/{database}?charset=utf8mb4".format(
                user=quote_plus(str(DB_CONFIG['user'])),
                password=quote_plus(str(DB_CONFIG['password'])),
                host=DB_CONFIG['host'],
                port=DB_CONFIG['port'],
                database=DB_CONFIG['database'],
            )
        )

        # 📊 Main extraction query
        query = """
        SELECT 
            t.Pickup_datetime,
            t.DropOff_datetime,
            t.PULocationID,
            t.DOLocationID,
            z.Borough AS pickup_borough,
            tr.Vol AS traffic_volume,
            YEAR(t.Pickup_datetime) AS pickup_year,
            MONTH(t.Pickup_datetime) AS pickup_month,
            DAY(t.Pickup_datetime) AS pickup_day,
            HOUR(t.Pickup_datetime) AS pickup_hour,
            MINUTE(t.Pickup_datetime) AS pickup_minute,
            t.source
        FROM 
            taxi_fhv_data t
        JOIN 
            taxi_zones z ON t.PULocationID = z.LocationID
        LEFT JOIN 
            trafico tr ON YEAR(t.Pickup_datetime) = tr.Yr 
                       AND MONTH(t.Pickup_datetime) = tr.M 
                       AND DAY(t.Pickup_datetime) = tr.D 
                       AND HOUR(t.Pickup_datetime) = tr.HH 
                       AND MINUTE(t.Pickup_datetime) = tr.MM
        """

        # 🔄 Stream the result set in batches over a dedicated read connection;
        # stream_results asks the driver for a server-side cursor so the full
        # result is not buffered in client memory.
        # NOTE(review): a slow insert can stall the streaming read long enough to
        # hit the server's write timeout — confirm against the server settings.
        processed = 0
        with engine.connect() as read_conn:
            streaming_conn = read_conn.execution_options(stream_results=True)
            for df in pd.read_sql(query, streaming_conn, chunksize=batch_size):
                # 🧹 Handle potential None/NaN values
                df['traffic_volume'] = df['traffic_volume'].fillna(0)

                # 💾 Bulk insert using SQLAlchemy (separate pooled connection)
                df.to_sql('enriched_taxi_data',
                          engine,
                          if_exists='append',
                          index=False,
                          chunksize=10000)

                # ⏩ Report the number of rows actually written so far
                processed += len(df)
                print(f"🚦 Processed batch: {processed} rows")

        print("✅ Data enrichment completed successfully")

    except Exception as e:
        print(f"❌ Error during data enrichment: {e}")
    finally:
        # Release pooled connections on failure as well as on success
        if engine is not None:
            engine.dispose()

# 🏁 Execute the enrichment process
populate_enriched_taxi_data_table()

## 🗄️ Database Manager - Sistema de Gestión de Datos de Taxis FHV

🚕 Sistema de Consulta y Logging para Base de Datos de Taxis

In [None]:
import mysql.connector
from dotenv import load_dotenv
import os
import logging

# 📝 Logging configuration.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it basicConfig() silently does nothing when logging was already
# configured (notebook kernels, or an earlier cell that called basicConfig), and
# 'database_query.log' would never be written.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    filename='database_query.log',
    force=True,
)
logger = logging.getLogger(__name__)

class DatabaseManager:
    """🗄️ Manage database connections and queries."""

    def __init__(self):
        """🔑 Load the environment and read the database credentials from it."""
        load_dotenv()

        read_env = os.getenv
        self.db_host = read_env('DB_HOST')
        self.db_user = read_env('DB_USER')
        self.db_password = read_env('DB_PASSWORD')
        self.db_name = read_env('DB_NAME')

    def _create_connection(self):
        """🔌 Open and return a database connection; log and re-raise connector errors."""
        settings = {
            'host': self.db_host,
            'user': self.db_user,
            'password': self.db_password,
            'database': self.db_name,
        }
        try:
            connection = mysql.connector.connect(**settings)
            logger.info("🟢 Conexión establecida exitosamente")
        except mysql.connector.Error as err:
            logger.error(f"🔴 Error de conexión: {err}")
            raise
        return connection

    def count_records(self):
        """📊 Count the rows of taxi_fhv_data whose source is 'U'.

        Returns the count. Any error is logged, printed and re-raised; the
        cursor and connection are closed in every case.
        """
        connection = None
        db_cursor = None
        try:
            connection = self._create_connection()
            db_cursor = connection.cursor()

            # 🔍 Run the COUNT(1) query
            db_cursor.execute("SELECT COUNT(1) FROM taxi_fhv_data WHERE source = 'U'")
            first_row = db_cursor.fetchone()
            total = first_row[0]

            summary = f"📈 Total de registros: {total}"
            logger.info(summary)
            print(summary)
            return total

        except Exception as exc:
            failure = f"❌ Error en la consulta: {exc}"
            logger.error(failure)
            print(failure)
            raise

        finally:
            # 🧹 Resource cleanup
            if db_cursor:
                db_cursor.close()
            if connection:
                connection.close()
                logger.info("🔌 Conexión cerrada")

            # ✅ Completion record
            print("✅ Proceso de consulta completado")
            logger.info("✅ Proceso de consulta completado")

def main():
    """🎯 Entry point: run the record count and print the outcome."""
    manager = DatabaseManager()

    try:
        total = manager.count_records()
        print(f"🎉 Conteo exitoso: {total} registros")
    except Exception as exc:
        # Failures are reported on stdout instead of propagating out of the script.
        print(f"❌ El proceso falló: {exc}")


if __name__ == "__main__":
    main()

In [None]:
import mysql.connector
from dotenv import load_dotenv
import os
import logging

# Logging configuration.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it basicConfig() silently does nothing when logging was already
# configured (notebook kernels, or an earlier cell that called basicConfig), and
# 'database_indexing.log' would never be written.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    filename='database_indexing.log',
    force=True,
)
logger = logging.getLogger(__name__)

class DatabaseManager:
    """Manage database connections and create indexes on taxi_fhv_data."""

    def __init__(self):
        """Load the environment and read the database credentials from it."""
        load_dotenv()
        read_env = os.getenv
        self.db_host = read_env('DB_HOST')
        self.db_user = read_env('DB_USER')
        self.db_password = read_env('DB_PASSWORD')
        self.db_name = read_env('DB_NAME')

    def _create_connection(self):
        """Open and return a database connection; log and re-raise connector errors."""
        settings = {
            'host': self.db_host,
            'user': self.db_user,
            'password': self.db_password,
            'database': self.db_name,
        }
        try:
            connection = mysql.connector.connect(**settings)
            logger.info("🟢 Conexión establecida exitosamente")
        except mysql.connector.Error as err:
            logger.error(f"🔴 Error de conexión: {err}")
            raise
        return connection

    def create_indexes(self):
        """🔍 Create the pickup/dropoff indexes on taxi_fhv_data that do not exist yet.

        Index names already listed in information_schema.statistics for this
        schema and table are skipped. Any error is logged, printed and
        re-raised; the cursor and connection are closed in every case.
        """
        connection = None
        db_cursor = None
        try:
            connection = self._create_connection()
            db_cursor = connection.cursor()

            # Index statements to run; the index name is the third word of each
            index_statements = (
                "CREATE INDEX idx_pickup_dropoff ON taxi_fhv_data (pickup_datetime, dropoff_datetime)",
                "CREATE INDEX idx_pickup ON taxi_fhv_data (pickup_datetime)",
                "CREATE INDEX idx_dropoff ON taxi_fhv_data (dropoff_datetime)",
            )

            existing_indexes_query = """
            SELECT index_name
            FROM information_schema.statistics
            WHERE table_schema = %s
            AND table_name = 'taxi_fhv_data'
            """
            db_cursor.execute(existing_indexes_query, (self.db_name,))
            already_present = [record[0] for record in db_cursor.fetchall()]

            # Create only the indexes that are missing
            for statement in index_statements:
                name = statement.split()[2]
                if name in already_present:
                    logger.info(f"🟡 Índice ya existe: {name}")
                    print(f"🟡 Índice ya existe: {name}")
                    continue
                db_cursor.execute(statement)
                logger.info(f"✅ Índice creado: {statement}")
                print(f"✅ Índice creado: {statement}")

            # Commit changes
            connection.commit()
            logger.info("🎉 Todos los índices creados exitosamente")
            print("🎉 Todos los índices creados exitosamente")

        except Exception as exc:
            logger.error(f"❌ Error al crear índices: {exc}")
            print(f"❌ Error al crear índices: {exc}")
            raise

        finally:
            # Release resources
            if db_cursor:
                db_cursor.close()
            if connection:
                connection.close()
                logger.info("🔌 Conexión cerrada")

def main():
    """🎯 Entry point: create the indexes and report any failure."""
    manager = DatabaseManager()

    try:
        manager.create_indexes()
    except Exception as exc:
        # Failures are reported on stdout instead of propagating out of the script.
        print(f"❌ El proceso falló: {exc}")


if __name__ == "__main__":
    main()
