## Eliminar fechas fuera del rango necesario en MYSQL

Se utiliza un codigo ejecutado local o en colab para limpieza de Taxis_fhv_data 

In [1]:
import mysql.connector
from dotenv import load_dotenv
import os
import time
import logging
from datetime import datetime

# Configure logging
logging.basicConfig(
    level=logging.INFO, 
    format='%(asctime)s - %(levelname)s: %(message)s',
    filename='database_deletion.log'
)
logger = logging.getLogger(__name__)

class DatabaseManager:
    def __init__(self):
        # Load environment variables
        load_dotenv()

        # Get database credentials from environment variables
        self.db_host = os.getenv('DB_HOST')
        self.db_user = os.getenv('DB_USER')
        self.db_password = os.getenv('DB_PASSWORD')
        self.db_name = os.getenv('DB_NAME')

    def _create_connection(self):
        """Create a database connection"""
        try:
            conn = mysql.connector.connect(
                host=self.db_host,
                user=self.db_user,
                password=self.db_password,
                database=self.db_name
            )
            return conn
        except mysql.connector.Error as err:
            logger.error(f"Connection error: {err}")
            raise

    def delete_records_outside_date_range(self, start_date, end_date, batch_size=250000):
        """
        Delete records outside specified date range with retry and batching
        
        :param start_date: Start date for keeping records
        :param end_date: End date for keeping records
        :param batch_size: Number of records to delete in each batch
        """
        conn = None
        cursor = None
        total_deleted = 0

        try:
            conn = self._create_connection()
            conn.autocommit = False  # Disable autocommit
            cursor = conn.cursor()
            
            # Print and log start of deletion process
            print("Starting deletion process...")
            logger.info("Starting deletion process...")

            # Validate date format
            try:
                datetime.strptime(start_date, '%Y-%m-%d')
                datetime.strptime(end_date, '%Y-%m-%d')
            except ValueError:
                logger.error("Invalid date format. Use YYYY-MM-DD")
                print("Invalid date format. Use YYYY-MM-DD")
                return 0

            # Initial check to estimate total records to delete
            count_query = f"""
            SELECT COUNT(*) FROM taxi_fhv_data1
            WHERE  (Pickup_datetime BETWEEN '{start_date}' AND '{end_date}')
            """
            cursor.execute(count_query)
            total_records = cursor.fetchone()[0]
            logger.info(f"Total records to delete: {total_records}")
            print(f"Total records to delete: {total_records}")

            # Delete in batches to reduce lock contention
            while True:
                delete_query = f"""
                DELETE FROM taxi_fhv_data1 
                WHERE  (Pickup_datetime BETWEEN '{start_date}' AND '{end_date}')
                LIMIT {batch_size}
                """

                # Retry mechanism with exponential backoff
                max_retries = 5
                for attempt in range(max_retries):
                    try:
                        cursor.execute(delete_query)
                        conn.commit()
                        rows_deleted = cursor.rowcount
                        total_deleted += rows_deleted

                        logger.info(f"Batch delete - Rows deleted: {rows_deleted}")
                        print(f"Batch delete - Rows deleted: {rows_deleted}")

                        # Exit if no more records to delete
                        if rows_deleted == 0:
                            break

                        break  # Successful deletion
                    except mysql.connector.Error as err:
                        if err.errno == 1205:  # Lock wait timeout
                            wait_time = 2 ** attempt
                            logger.warning(f"Lock timeout. Retry {attempt + 1}/{max_retries}. Waiting {wait_time} seconds")
                            print(f"Lock timeout. Retry {attempt + 1}/{max_retries}. Waiting {wait_time} seconds")
                            time.sleep(wait_time)
                            conn.rollback()
                        else:
                            raise

                # Break main loop if no more records
                if rows_deleted == 0:
                    break

            logger.info(f"Total records deleted: {total_deleted}")
            print(f"Total records deleted: {total_deleted}")
            return total_deleted

        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"Deletion error: {e}")
            print(f"Deletion error: {e}")
            raise
        finally:
            # Ensure resources are closed
            if cursor:
                cursor.close()
            if conn:
                conn.close()
            
            # Print and log end of deletion process
            print("Deletion process completed.")
            logger.info("Deletion process completed.")

def main():
    # Execution
    db_manager = DatabaseManager()
    start_date = '2023-05-01'
    end_date = '2024-05-31'
    
    try:
        deleted_count = db_manager.delete_records_outside_date_range(start_date, end_date)
        print(f"Successfully deleted {deleted_count} records.")
    except Exception as e:
        print(f"Deletion process failed: {e}")

if __name__ == "__main__":
    main()


Starting deletion process...
Total records to delete: 3406588
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 250000
Batch delete - Rows deleted: 156588
Batch delete - Rows deleted: 0
Total records deleted: 3406588
Deletion process completed.
Successfully deleted 3406588 records.


## üóÑÔ∏è Database Manager - Sistema de Gesti√≥n de Datos de Taxis FHV

üöï Sistema de Consulta y Logging para Base de Datos de Taxis

In [None]:
from sqlalchemy import create_engine, text

# Configurar la conexi√≥n a la base de datos (aseg√∫rate de tener las credenciales configuradas)
DB_CONFIG = {
    'host': 'database-2.cfou8mqoatn0.us-east-2.rds.amazonaws.com',
    'port': 3306,
    'user': 'admin',
    'password': 'AWSSIEMPRE__',
    'database': 'UrbanTransit'
}

engine = create_engine(
    f"mysql+pymysql://{DB_CONFIG['user']}:{DB_CONFIG['password']}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

# Consulta para contar el n√∫mero de registros en la tabla taxi_fhv_data1
query = "SELECT COUNT(1) AS count FROM taxi_fhv_data;"
with engine.connect() as connection:
    result = connection.execute(text(query))
    count = result.fetchone()[0]  # Acceder al valor usando un √≠ndice entero
print(f'N√∫mero de registros en la tabla taxi_fhv_data: {count}')


In [4]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

# Cargar las variables de entorno desde el archivo .env
load_dotenv()

# Configuraci√≥n de la conexi√≥n a la base de datos usando las variables de entorno
DB_CONFIG = {
    'host': os.getenv('DB_HOST'),
    'port': os.getenv('DB_PORT'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'database': os.getenv('DB_NAME')
}

# Crear el motor de SQLAlchemy
engine = create_engine(
    f"mysql+pymysql://{DB_CONFIG['user']}:{DB_CONFIG['password']}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
)

# Leer la tabla "enriched_taxi_data" en un DataFrame de pandas
query = "SELECT * FROM enriched_taxi_data2"
df = pd.read_sql(query, engine)

# Convertir el DataFrame a un archivo CSV
df.to_csv('enriched_taxi_data.csv', index=False)

print('Tabla convertida a CSV y guardada como enriched_taxi_data2.csv')


Tabla convertida a CSV y guardada como enriched_taxi_data2.csv


### Prueba LocalHost docker con Modelo Machine Learning

In [23]:
import requests

# URL de la API
url = 'http://0.tcp.ngrok.io:16013/predict/'

# Datos a enviar en la solicitud
data = {
    "PULocationID": 7,
    "pickup_weekday": 6,
    "pickup_hour": 12
}

# Realizar la solicitud POST
response = requests.post(url, json=data)

# Verificar la respuesta
if response.status_code == 200:
    print("Predicci√≥n:", response.json())
else:
    print("Error:", response.status_code, response.text)


Predicci√≥n: {'max_zone_id': 7, 'max_zone_name': 'Astoria', 'demand': 5.818957687920667}
