## Eliminar fechas fuera del rango necesario en MYSQL

Se utiliza un codigo ejecutado local o en colab para limpieza de Taxis_fhv_data 

In [1]:
import mysql.connector
from dotenv import load_dotenv
import os
import time
import logging
from datetime import datetime

# Configure logging
logging.basicConfig(
    level=logging.INFO, 
    format='%(asctime)s - %(levelname)s: %(message)s',
    filename='database_deletion.log'
)
logger = logging.getLogger(__name__)

class DatabaseManager:
    def __init__(self):
        # Load environment variables
        load_dotenv()

        # Get database credentials from environment variables
        self.db_host = os.getenv('DB_HOST')
        self.db_user = os.getenv('DB_USER')
        self.db_password = os.getenv('DB_PASSWORD')
        self.db_name = os.getenv('DB_NAME')

    def _create_connection(self):
        """Create a database connection"""
        try:
            conn = mysql.connector.connect(
                host=self.db_host,
                user=self.db_user,
                password=self.db_password,
                database=self.db_name
            )
            return conn
        except mysql.connector.Error as err:
            logger.error(f"Connection error: {err}")
            raise

    def delete_records_outside_date_range(self, start_date, end_date, batch_size=10000):
        """
        Delete records outside specified date range with retry and batching
        
        :param start_date: Start date for keeping records
        :param end_date: End date for keeping records
        :param batch_size: Number of records to delete in each batch
        """
        conn = None
        cursor = None
        total_deleted = 0

        try:
            conn = self._create_connection()
            conn.autocommit = False  # Disable autocommit
            cursor = conn.cursor()
            
            # Print and log start of deletion process
            print("Starting deletion process...")
            logger.info("Starting deletion process...")

            # Validate date format
            try:
                datetime.strptime(start_date, '%Y-%m-%d')
                datetime.strptime(end_date, '%Y-%m-%d')
            except ValueError:
                logger.error("Invalid date format. Use YYYY-MM-DD")
                print("Invalid date format. Use YYYY-MM-DD")
                return 0

            # Initial check to estimate total records to delete
            count_query = f"""
            SELECT COUNT(*) FROM taxi_fhv_data 
            WHERE NOT (Pickup_datetime BETWEEN '{start_date}' AND '{end_date}')
            """
            cursor.execute(count_query)
            total_records = cursor.fetchone()[0]
            logger.info(f"Total records to delete: {total_records}")
            print(f"Total records to delete: {total_records}")

            # Delete in batches to reduce lock contention
            while True:
                delete_query = f"""
                DELETE FROM taxi_fhv_data 
                WHERE NOT (Pickup_datetime BETWEEN '{start_date}' AND '{end_date}')
                LIMIT {batch_size}
                """

                # Retry mechanism with exponential backoff
                max_retries = 5
                for attempt in range(max_retries):
                    try:
                        cursor.execute(delete_query)
                        conn.commit()
                        rows_deleted = cursor.rowcount
                        total_deleted += rows_deleted

                        logger.info(f"Batch delete - Rows deleted: {rows_deleted}")
                        print(f"Batch delete - Rows deleted: {rows_deleted}")

                        # Exit if no more records to delete
                        if rows_deleted == 0:
                            break

                        break  # Successful deletion
                    except mysql.connector.Error as err:
                        if err.errno == 1205:  # Lock wait timeout
                            wait_time = 2 ** attempt
                            logger.warning(f"Lock timeout. Retry {attempt + 1}/{max_retries}. Waiting {wait_time} seconds")
                            print(f"Lock timeout. Retry {attempt + 1}/{max_retries}. Waiting {wait_time} seconds")
                            time.sleep(wait_time)
                            conn.rollback()
                        else:
                            raise

                # Break main loop if no more records
                if rows_deleted == 0:
                    break

            logger.info(f"Total records deleted: {total_deleted}")
            print(f"Total records deleted: {total_deleted}")
            return total_deleted

        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"Deletion error: {e}")
            print(f"Deletion error: {e}")
            raise
        finally:
            # Ensure resources are closed
            if cursor:
                cursor.close()
            if conn:
                conn.close()
            
            # Print and log end of deletion process
            print("Deletion process completed.")
            logger.info("Deletion process completed.")

def main():
    # Execution
    db_manager = DatabaseManager()
    start_date = '2024-01-01'
    end_date = '2024-08-31'
    
    try:
        deleted_count = db_manager.delete_records_outside_date_range(start_date, end_date)
        print(f"Successfully deleted {deleted_count} records.")
    except Exception as e:
        print(f"Deletion process failed: {e}")

if __name__ == "__main__":
    main()


Starting deletion process...
Total records to delete: 126886
Batch delete - Rows deleted: 10000
Batch delete - Rows deleted: 10000
Batch delete - Rows deleted: 10000
Batch delete - Rows deleted: 10000
Batch delete - Rows deleted: 10000
Batch delete - Rows deleted: 10000
Batch delete - Rows deleted: 10000
Batch delete - Rows deleted: 10000
Batch delete - Rows deleted: 10000
Batch delete - Rows deleted: 10000
Batch delete - Rows deleted: 10000
Deletion process completed.


KeyboardInterrupt: 

In [None]:
import mysql.connector
from dotenv import load_dotenv
import os
import time
import logging
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from mysql.connector import pooling

# Configure logging
logging.basicConfig(
    level=logging.INFO, 
    format='%(asctime)s - %(levelname)s: %(message)s',
    filename='database_deletion.log'
)
logger = logging.getLogger(__name__)

class DatabaseManager:
    def __init__(self):
        load_dotenv()
        
        # Connection pool configuration
        self.db_config = {
            'host': os.getenv('DB_HOST'),
            'user': os.getenv('DB_USER'),
            'password': os.getenv('DB_PASSWORD'),
            'database': os.getenv('DB_NAME'),
            'pool_name': 'mypool',
            'pool_size': 5,  # Número de conexiones en el pool
            'buffered': True,  # Usar cursores almacenados en buffer
            'get_warnings': False,  # Deshabilitar advertencias
            'raise_on_warnings': False,
            'compress': True  # Compresión de datos
        }
        
        # Crear el pool de conexiones
        self.connection_pool = mysql.connector.pooling.MySQLConnectionPool(**self.db_config)

    def process_date_range(self, current_date, next_date, batch_size):
        """Procesa un rango de fechas específico"""
        conn = None
        cursor = None
        range_deleted = 0
        
        try:
            conn = self.connection_pool.get_connection()
            cursor = conn.cursor(buffered=True)
            
            # Usar índices en la consulta y optimizar la condición WHERE
            delete_query = f"""
            DELETE FROM taxi_fhv_data FORCE INDEX (idx_source_pickup)
            WHERE source = 'U'
            AND Pickup_datetime >= '{current_date.strftime('%Y-%m-%d')}'
            AND Pickup_datetime < '{next_date.strftime('%Y-%m-%d')}'
            LIMIT {batch_size}
            """
            
            while True:
                try:
                    cursor.execute(delete_query)
                    conn.commit()
                    rows_deleted = cursor.rowcount
                    
                    if rows_deleted == 0:
                        break
                        
                    range_deleted += rows_deleted
                    logger.info(f"🗑️ Deleted {rows_deleted} records for {current_date.strftime('%Y-%m-%d')}")
                    
                except mysql.connector.Error as err:
                    if err.errno == 1205:  # Lock timeout
                        conn.rollback()
                        time.sleep(1)
                        continue
                    raise
                    
            return range_deleted
            
        finally:
            if cursor:
                cursor.close()
            if conn:
                conn.close()

    def delete_records_with_source_U_in_date_range(self, start_date, end_date, batch_size=250000):
        """
        Delete records optimized with parallel processing and connection pooling
        """
        total_deleted = 0
        print("🚀 Starting optimized deletion process...")
        logger.info("🚀 Starting optimized deletion process...")

        try:
            current_date = datetime.strptime(start_date, '%Y-%m-%d')
            end_date = datetime.strptime(end_date, '%Y-%m-%d')
            
            # Crear rangos de fechas para procesamiento paralelo
            date_ranges = []
            while current_date <= end_date:
                next_date = current_date + timedelta(days=15)
                if next_date > end_date:
                    next_date = end_date + timedelta(days=1)
                date_ranges.append((current_date, next_date))
                current_date = next_date

            # Procesar rangos en paralelo usando ThreadPoolExecutor
            with ThreadPoolExecutor(max_workers=3) as executor:
                futures = []
                for start, end in date_ranges:
                    future = executor.submit(
                        self.process_date_range,
                        start,
                        end,
                        batch_size
                    )
                    futures.append(future)
                
                # Recolectar resultados
                for future in futures:
                    try:
                        range_deleted = future.result()
                        total_deleted += range_deleted
                    except Exception as e:
                        logger.error(f"❌ Error in worker thread: {e}")
                        raise

            logger.info(f"📈 Total records deleted: {total_deleted}")
            print(f"📈 Total records deleted: {total_deleted}")
            return total_deleted

        except Exception as e:
            logger.error(f"❌ Deletion error: {e}")
            print(f"❌ Deletion error: {e}")
            raise
        finally:
            print("✅ Deletion process completed.")
            logger.info("✅ Deletion process completed.")

def main():
    # Asegurarse de que existe el índice necesario
    # Ejecutar antes de correr el script:
    """
    CREATE INDEX idx_source_pickup ON taxi_fhv_data (source, Pickup_datetime);
    """
    
    db_manager = DatabaseManager()
    start_date = '2024-03-19'
    end_date = '2024-08-31'
    
    try:
        deleted_count = db_manager.delete_records_with_source_U_in_date_range(start_date, end_date)
        print(f"✅ Successfully deleted {deleted_count} records.")
    except Exception as e:
        print(f"❌ Deletion process failed: {e}")

if __name__ == "__main__":
    main()

🚀 Starting optimized deletion process...
❌ Deletion error: 1064 (42000): You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'FORCE INDEX (idx_source_pickup)
            WHERE source = 'U'
            AND P' at line 1
✅ Deletion process completed.
❌ Deletion process failed: 1064 (42000): You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'FORCE INDEX (idx_source_pickup)
            WHERE source = 'U'
            AND P' at line 1
