## Eliminar registros con fechas fuera del rango necesario en MySQL

Se utiliza un código, ejecutado localmente o en Colab, para la limpieza de la tabla taxi_fhv_data.

In [3]:
import mysql.connector
from dotenv import load_dotenv
import os
import time
import logging
from datetime import datetime

# Configure logging: INFO level and above, each record formatted as
# "<timestamp> - <level>: <message>", written to the file
# database_deletion.log (passing filename= sends records to that file).
logging.basicConfig(
    level=logging.INFO, 
    format='%(asctime)s - %(levelname)s: %(message)s',
    filename='database_deletion.log'
)
# Module-level logger used by the code below.
logger = logging.getLogger(__name__)

class DatabaseManager:
    """Open MySQL connections and prune rows of ``taxi_fhv_data`` by date.

    Connection settings are read from the environment variables ``DB_HOST``,
    ``DB_USER``, ``DB_PASSWORD`` and ``DB_NAME`` (after ``load_dotenv()``).
    """

    # MySQL error number reported when a statement gives up waiting for a lock.
    _LOCK_WAIT_TIMEOUT_ERRNO = 1205

    def __init__(self):
        # Load environment variables
        load_dotenv()

        # Get database credentials from environment variables
        self.db_host = os.getenv('DB_HOST')
        self.db_user = os.getenv('DB_USER')
        self.db_password = os.getenv('DB_PASSWORD')
        self.db_name = os.getenv('DB_NAME')

    def _create_connection(self):
        """Create a database connection.

        :return: A new ``mysql.connector`` connection.
        :raises mysql.connector.Error: If connecting fails (logged first).
        """
        try:
            return mysql.connector.connect(
                host=self.db_host,
                user=self.db_user,
                password=self.db_password,
                database=self.db_name
            )
        except mysql.connector.Error as err:
            logger.error(f"Connection error: {err}")
            raise

    @staticmethod
    def _dates_are_valid(*dates):
        """Return True when every given value parses as ``YYYY-MM-DD``."""
        try:
            for value in dates:
                datetime.strptime(value, '%Y-%m-%d')
        except ValueError:
            return False
        return True

    def _delete_batch(self, conn, cursor, query, params, max_retries=5):
        """Execute and commit one DELETE batch, retrying on lock timeouts.

        :param conn: Open connection with autocommit disabled.
        :param cursor: Cursor created from ``conn``.
        :param query: Parameterized DELETE statement.
        :param params: Values bound to the placeholders in ``query``.
        :param max_retries: Maximum number of attempts (at least 1).
        :return: Number of rows removed by the batch.
        :raises mysql.connector.Error: On any non-timeout error, or on a lock
            timeout once every attempt has been used.
        :raises ValueError: If ``max_retries`` is smaller than 1.
        """
        for attempt in range(max_retries):
            try:
                cursor.execute(query, params)
                # Read the count before committing so it always refers to
                # the DELETE that just ran.
                rows_deleted = cursor.rowcount
                conn.commit()
                return rows_deleted
            except mysql.connector.Error as err:
                out_of_attempts = attempt == max_retries - 1
                if err.errno != self._LOCK_WAIT_TIMEOUT_ERRNO or out_of_attempts:
                    # Propagate instead of falling through: the caller must
                    # never continue with an undefined or stale row count.
                    raise
                # Release whatever the failed attempt holds before waiting.
                conn.rollback()
                wait_time = 2 ** attempt  # exponential backoff
                logger.warning(f"Lock timeout. Retry {attempt + 1}/{max_retries}. Waiting {wait_time} seconds")
                print(f"Lock timeout. Retry {attempt + 1}/{max_retries}. Waiting {wait_time} seconds")
                time.sleep(wait_time)
        raise ValueError("max_retries must be at least 1")

    def delete_records_outside_date_range(self, start_date, end_date, batch_size=10000):
        """
        Delete records outside specified date range with retry and batching

        Rows are removed in committed batches; each batch is retried with
        exponential backoff when MySQL reports a lock wait timeout.

        :param start_date: Start date for keeping records (``YYYY-MM-DD``)
        :param end_date: End date for keeping records (``YYYY-MM-DD``)
        :param batch_size: Number of records to delete in each batch
        :return: Total number of deleted rows, or 0 when a date is malformed
        :raises Exception: Any connection or query error is logged, printed
            and re-raised
        """
        conn = None
        cursor = None
        total_deleted = 0

        # Values are bound as parameters rather than formatted into the SQL.
        # NOTE(review): if Pickup_datetime is a DATETIME column, BETWEEN
        # compares end_date as midnight at the start of that day, so rows
        # later on end_date are treated as out of range - confirm intended.
        outside_range = "WHERE NOT (Pickup_datetime BETWEEN %s AND %s)"
        count_query = f"SELECT COUNT(*) FROM taxi_fhv_data {outside_range}"
        delete_query = f"DELETE FROM taxi_fhv_data {outside_range} LIMIT %s"

        try:
            # Print and log start of deletion process
            print("Starting deletion process...")
            logger.info("Starting deletion process...")

            # Validate the dates before opening any connection
            if not self._dates_are_valid(start_date, end_date):
                logger.error("Invalid date format. Use YYYY-MM-DD")
                print("Invalid date format. Use YYYY-MM-DD")
                return 0

            conn = self._create_connection()
            conn.autocommit = False  # Commit explicitly, once per batch
            cursor = conn.cursor()

            # Initial check to estimate total records to delete
            cursor.execute(count_query, (start_date, end_date))
            total_records = cursor.fetchone()[0]
            logger.info(f"Total records to delete: {total_records}")
            print(f"Total records to delete: {total_records}")

            # Delete in batches to reduce lock contention
            batch_params = (start_date, end_date, int(batch_size))
            while True:
                rows_deleted = self._delete_batch(conn, cursor, delete_query, batch_params)
                total_deleted += rows_deleted

                logger.info(f"Batch delete - Rows deleted: {rows_deleted}")
                print(f"Batch delete - Rows deleted: {rows_deleted}")

                # An empty batch means nothing is left to delete
                if rows_deleted == 0:
                    break

            logger.info(f"Total records deleted: {total_deleted}")
            print(f"Total records deleted: {total_deleted}")
            return total_deleted

        except Exception as e:
            if conn is not None:
                try:
                    conn.rollback()
                except mysql.connector.Error as rollback_err:
                    # Do not let a failed rollback hide the original error.
                    logger.warning(f"Rollback failed: {rollback_err}")
            logger.error(f"Deletion error: {e}")
            print(f"Deletion error: {e}")
            raise
        finally:
            # Ensure resources are closed
            if cursor is not None:
                cursor.close()
            if conn is not None:
                conn.close()

            # Print and log end of deletion process
            print("Deletion process completed.")
            logger.info("Deletion process completed.")

def main():
    """Keep only 2023-01-01..2024-08-31 in the table and report the result."""
    keep_from, keep_until = '2023-01-01', '2024-08-31'
    manager = DatabaseManager()

    try:
        removed = manager.delete_records_outside_date_range(keep_from, keep_until)
        print(f"Successfully deleted {removed} records.")
    except Exception as exc:
        # Top-level boundary: report the failure instead of a traceback.
        print(f"Deletion process failed: {exc}")


if __name__ == "__main__":
    main()


Starting deletion process...
Total records to delete: 0
Batch delete - Rows deleted: 0
Total records deleted: 0
Deletion process completed.
Successfully deleted 0 records.


In [None]:
import os
from dotenv import load_dotenv
import mysql.connector
import logging
import time
from datetime import datetime

# Configure logging: INFO level and above, each record formatted as
# "<timestamp> - <level>: <message>", written to the file
# database_deletion.log.
# NOTE: logging.basicConfig does nothing when the root logger already has
# handlers, e.g. if logging was configured earlier in the same session.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    filename='database_deletion.log'
)
# Module-level logger used by the code below.
logger = logging.getLogger(__name__)

def delete_rows(batch_size=500000, source='U', cutoff_date='2023-07-01'):
    """
    Delete, in batches, the ``taxi_fhv_data`` rows of one source dated on or
    after a cutoff.

    Connection settings come from the environment variables ``DB_HOST``,
    ``DB_USER``, ``DB_PASSWORD`` and ``DB_NAME`` (after ``load_dotenv()``).
    Every batch is committed on its own and retried with exponential backoff
    when MySQL reports a lock wait timeout (error 1205).

    :param batch_size: Maximum number of rows removed per DELETE statement
    :param source: Value matched against the ``source`` column
    :param cutoff_date: Rows with ``Pickup_datetime >= cutoff_date`` are deleted
    :return: Total number of deleted rows
    :raises mysql.connector.Error: On connection or query failure, including a
        lock timeout that persists through every retry
    """
    connection = None
    cursor = None

    # ">=" is MySQL's greater-than-or-equal operator; "=>" is a syntax error.
    criteria = "WHERE source = %s AND Pickup_datetime >= %s"
    count_query = f"SELECT COUNT(*) FROM taxi_fhv_data {criteria}"
    delete_query = f"DELETE FROM taxi_fhv_data {criteria} LIMIT %s"
    max_retries = 5

    try:
        print("🔹 Initializing database deletion process...")
        logger.info("Initializing database deletion process")

        # Load environment variables
        load_dotenv()
        print("✓ Environment variables loaded successfully")

        # Database connection variables
        DB_HOST = os.getenv("DB_HOST")
        DB_USER = os.getenv("DB_USER")
        DB_PASSWORD = os.getenv("DB_PASSWORD")
        DB_NAME = os.getenv("DB_NAME")

        print(f"🔌 Connecting to database: {DB_NAME} on host {DB_HOST}")
        # Connect to the database
        connection = mysql.connector.connect(
            host=DB_HOST,
            user=DB_USER,
            password=DB_PASSWORD,
            database=DB_NAME
        )
        connection.autocommit = False  # Commit explicitly, once per batch
        cursor = connection.cursor()
        print("✓ Database connection established")

        # Initial count of rows to delete
        cursor.execute(count_query, (source, cutoff_date))
        total_records = cursor.fetchone()[0]
        print(f"📊 Total records matching deletion criteria: {total_records}")
        logger.info(f"Total records to delete: {total_records}")

        total_deleted = 0
        batch_number = 0

        # Delete in batches to reduce lock contention
        while True:
            batch_number += 1
            print(f"\n🔄 Processing Batch {batch_number}")

            # Retry mechanism with exponential backoff
            for attempt in range(max_retries):
                try:
                    print(f"  Attempting deletion - Attempt {attempt + 1}")
                    cursor.execute(delete_query, (source, cutoff_date, batch_size))
                    # Read the count before committing so it always refers
                    # to the DELETE that just ran.
                    rows_deleted = cursor.rowcount
                    connection.commit()
                    break  # Successful deletion
                except mysql.connector.Error as err:
                    out_of_attempts = attempt == max_retries - 1
                    if err.errno != 1205 or out_of_attempts:
                        # Not a lock wait timeout, or retries are exhausted:
                        # propagate rather than continue with an undefined
                        # or stale row count.
                        raise
                    # Release whatever the failed attempt holds, then wait.
                    connection.rollback()
                    wait_time = 2 ** attempt
                    print(f"  ⏳ Lock timeout detected. Waiting {wait_time} seconds")
                    logger.warning(f"Lock timeout. Retry {attempt + 1}/{max_retries}. Waiting {wait_time} seconds")
                    time.sleep(wait_time)

            total_deleted += rows_deleted
            print(f"  ✅ Batch {batch_number} - Rows deleted: {rows_deleted}")
            print(f"  🔢 Total deleted so far: {total_deleted}")
            logger.info(f"Batch {batch_number} - Rows deleted: {rows_deleted}")

            # An empty batch means nothing is left to delete
            if rows_deleted == 0:
                print("❗ No more records to delete. Terminating process.")
                break

        print(f"\n✨ Deletion Process Complete")
        print(f"📈 Total records deleted: {total_deleted}")
        logger.info(f"Total records deleted: {total_deleted}")
        return total_deleted

    except mysql.connector.Error as err:
        print(f"❌ Database Error: {err}")
        logger.error(f"Error: {err}")
        raise
    finally:
        # Ensure resources are closed
        if cursor is not None:
            cursor.close()
        if connection is not None and connection.is_connected():
            connection.close()
            print("🔌 Database connection closed.")
            logger.info("Database connection closed.")

# Execute the function
def main():
    """Run the batched deletion and print a one-line success or failure summary."""
    try:
        print("🚀 Starting deletion script...")
        removed = delete_rows()
        print(f"✅ Successfully deleted {removed} rows.")
    except Exception as exc:
        # Top-level boundary: report the failure instead of a traceback.
        print(f"❌ Deletion process failed: {exc}")


if __name__ == "__main__":
    main()

Starting deletion process...
