## Script de Importación de Tráfico de NYC a MySQL

In [6]:
import os
from dotenv import load_dotenv
import pandas as pd
import mysql.connector
from datetime import datetime

# Load environment variables from the .env file
load_dotenv()

# Read the database settings from the environment
db_host = os.getenv('DB_HOST')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_name = os.getenv('DB_NAME')
db_port = os.getenv('DB_PORT')
csv_file_path = 'Automated_Traffic_Volume_Counts.csv'  # Make sure this points at the correct CSV file

# Fail fast if any of the settings is missing or empty
if not all([db_host, db_user, db_password, db_name, db_port]):
    raise ValueError("No se pudieron cargar todas las variables de entorno. Por favor verifica el archivo .env.")

# Load the CSV file into a pandas DataFrame
try:
    df = pd.read_csv(csv_file_path)
except FileNotFoundError as e:
    # Chain the original exception so the traceback keeps the underlying cause
    raise FileNotFoundError(f"El archivo {csv_file_path} no se encuentra. Por favor verifica la ruta.") from e

# Keep the data from January 2023 through August 2024
start_date = datetime(2023, 1, 1)
end_date = datetime(2024, 8, 31)

# Ensure the date/time component columns are integers
for component in ['Yr', 'M', 'D', 'HH', 'MM']:
    df[component] = df[component].astype(int)

# Create a datetime column from the component columns
df['datetime'] = pd.to_datetime(df[['Yr', 'M', 'D', 'HH', 'MM']].rename(columns={'Yr': 'year', 'M': 'month', 'D': 'day', 'HH': 'hour', 'MM': 'minute'}))

# end_date is midnight at the START of 31 August, so "<= end_date" would drop
# every count taken later that day. Compare against the start of the next day.
end_exclusive = end_date + pd.Timedelta(days=1)
df_filtered = df[(df['datetime'] >= start_date) & (df['datetime'] < end_exclusive)].copy()

# Keep only the columns needed for the analysis
# NOTE(review): 'street' is kept here but is not part of the INSERT below -
# confirm whether the trafico table is supposed to store it.
columns_to_keep = ['RequestID', 'Boro', 'Yr', 'M', 'D', 'HH', 'MM', 'Vol', 'SegmentID', 'WktGeom', 'street', 'Direction']
df_filtered = df_filtered[columns_to_keep]

# Connect to the MySQL database
try:
    db_connection = mysql.connector.connect(
        host=db_host,
        user=db_user,
        password=db_password,
        database=db_name,
        port=int(db_port)
    )
except mysql.connector.Error as err:
    raise mysql.connector.Error(f"Error al conectar a la base de datos: {err}") from err

# The statement and its column order never change, so build them once
insert_columns = ['RequestID', 'Boro', 'Yr', 'M', 'D', 'HH', 'MM', 'Vol', 'SegmentID', 'WktGeom', 'Direction']
insert_query = """
    INSERT INTO trafico (RequestID, Boro, Yr, M, D, HH, MM, Vol, SegmentID, WktGeom, Direction)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

# Insert the filtered rows in chunks of 100,000
chunk_size = 100000
rows_inserted = 0
cursor = None
try:
    cursor = db_connection.cursor()
    for start in range(0, len(df_filtered), chunk_size):
        end = start + chunk_size
        chunk = df_filtered.iloc[start:end][insert_columns]

        # Cast to object so the driver receives plain Python values, and map
        # missing cells (NaN) to None so they are sent as SQL NULL.
        data = list(
            chunk.astype(object)
            .where(chunk.notna(), None)
            .itertuples(index=False, name=None)
        )

        cursor.executemany(insert_query, data)
        rows_inserted += len(data)

        # Report progress only every 10 chunks
        if start // chunk_size % 10 == 0:
            print(f"Se han insertado {len(data)} filas en este bloque. Total filas insertadas: {rows_inserted}")

    # Commit once, after every chunk went through
    db_connection.commit()
except Exception:
    # Leave the table untouched when any chunk fails
    db_connection.rollback()
    raise
finally:
    # Release the cursor and the connection on success and on failure alike
    if cursor is not None:
        cursor.close()
    db_connection.close()

# Show the total number of inserted rows
print(f"Se han insertado un total de {rows_inserted} filas en la base de datos.")


Se han insertado 100000 filas en este bloque. Total filas insertadas: 100000
Se han insertado un total de 109248 filas en la base de datos.


## Script de Importación de Temperaturas Promedio de NYC a MySQL


In [2]:
import os
from dotenv import load_dotenv
import pandas as pd
import mysql.connector

# Load environment variables from the .env file
load_dotenv()

# Read the database settings from the environment
db_host = os.getenv('DB_HOST')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_name = os.getenv('DB_NAME')
db_port = os.getenv('DB_PORT')
csv_file_path = 'temperaturas_promedio_nyc_mensual.csv'  # Make sure this points at the correct CSV file

# Fail fast if any of the settings is missing or empty
if not all([db_host, db_user, db_password, db_name, db_port]):
    raise ValueError("No se pudieron cargar todas las variables de entorno. Por favor verifica el archivo .env.")

# Load the CSV file into a pandas DataFrame
try:
    df = pd.read_csv(csv_file_path)
except FileNotFoundError as e:
    # Chain the original exception so the traceback keeps the underlying cause
    raise FileNotFoundError(f"El archivo {csv_file_path} no se encuentra. Por favor verifica la ruta.") from e

# Connect to the MySQL database
try:
    db_connection = mysql.connector.connect(
        host=db_host,
        user=db_user,
        password=db_password,
        database=db_name,
        port=int(db_port)
    )
except mysql.connector.Error as err:
    raise mysql.connector.Error(f"Error al conectar a la base de datos: {err}") from err

# One placeholder per CSV column, in file order
insert_query = """
INSERT INTO temperaturas (Mes, Manhattan, Brooklyn, Queens, The_Bronx, Staten_Island)
VALUES (%s, %s, %s, %s, %s, %s)
"""

# Cast to object so the driver receives plain Python values, and map missing
# cells (NaN) to None so they are sent as SQL NULL.
data = list(
    df.astype(object)
    .where(df.notna(), None)
    .itertuples(index=False, name=None)
)

cursor = None
try:
    cursor = db_connection.cursor()
    cursor.executemany(insert_query, data)
    db_connection.commit()
except Exception:
    # Do not leave a half-applied insert behind
    db_connection.rollback()
    raise
finally:
    # Release the cursor and the connection on success and on failure alike
    if cursor is not None:
        cursor.close()
    db_connection.close()

# Show the number of inserted rows
print(f"Se han insertado {len(data)} filas en la base de datos.")


Se han insertado 12 filas en la base de datos.


## Script de Importación de Taxi_Zones de NYC a MySQL


In [None]:
import json
import pandas as pd
import pymysql
import os
from dotenv import load_dotenv

# Load environment variables before the import function below reads them
load_dotenv()

# Import of a taxi zones CSV file into the MySQL taxi_zones table
def procesar_archivo_taxi_zones(ruta_archivo):
    """Load a taxi zones CSV file into the ``taxi_zones`` MySQL table.

    The file must contain the columns OBJECTID, Shape_Leng, the_geom,
    Shape_Area, zone, LocationID and borough. Rows repeating a LocationID are
    reduced to their first occurrence, and rows are written with
    ``INSERT IGNORE``.

    Connection settings are read from the DB_HOST, DB_PORT (default 3306),
    DB_USER, DB_PASSWORD and DB_NAME environment variables.

    :param ruta_archivo: Path of the CSV file to import.
    :return: True when the insert was committed; False on any failure. Errors
        are reported on stdout and never raised.
    """
    try:
        # Database settings
        db_host = os.getenv("DB_HOST")
        db_port = int(os.getenv("DB_PORT", 3306))
        db_user = os.getenv("DB_USER")
        db_password = os.getenv("DB_PASSWORD")
        db_name = os.getenv("DB_NAME")

        # Read the CSV file
        df = pd.read_csv(ruta_archivo)
        print(f"✅ Archivo leído: {ruta_archivo}")
        print(f"📊 Columnas: {df.columns.tolist()}")

        # Validate the columns, reporting only the ones that are actually missing
        expected_columns = ['OBJECTID', 'Shape_Leng', 'the_geom', 'Shape_Area', 'zone', 'LocationID', 'borough']
        missing_columns = [column for column in expected_columns if column not in df.columns]
        if missing_columns:
            print(f"❌ Columnas faltantes: {missing_columns}")
            return False

        # Drop duplicated LocationIDs (the first occurrence is kept)
        duplicados = df[df.duplicated(subset=['LocationID'], keep=False)]
        if not duplicados.empty:
            print(f"🚨 Duplicados encontrados: {len(duplicados)}")
            df = df.drop_duplicates(subset=['LocationID'])

        # Connect to the database
        connection = pymysql.connect(
            host=db_host,
            port=db_port,
            user=db_user,
            password=db_password,
            database=db_name,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )
        print("🔌 Conexión a MySQL establecida")

        # Bound before the try so the finally clause can tell whether a cursor exists
        cursor = None
        try:
            cursor = connection.cursor()

            insert_query = """
            INSERT IGNORE INTO taxi_zones (OBJECTID, Shape_Leng, the_geom, Shape_Area, zone, LocationID, borough)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
            """

            # Cast to object so the driver receives plain Python values, and
            # map missing cells (NaN) to None so they are sent as SQL NULL.
            zone_rows = df[expected_columns]
            data = list(
                zone_rows.astype(object)
                .where(zone_rows.notna(), None)
                .itertuples(index=False, name=None)
            )

            # INSERT IGNORE may skip rows, so report the driver's affected-row
            # count instead of assuming every row in the file was written.
            affected = cursor.executemany(insert_query, data) or 0

            connection.commit()
            print(f"✅ {affected} filas insertadas exitosamente")
            return True

        except Exception as e:
            print(f"❌ Error en inserción: {e}")
            # Discard whatever part of the batch was applied
            connection.rollback()
            return False
        finally:
            if cursor is not None:
                cursor.close()
            connection.close()

    except Exception as e:
        print(f"❌ Error general: {e}")
        return False

# Script entry point: import the zones file and report the outcome
if __name__ == "__main__":
    ruta_archivo = "taxi_zones.csv"  # Change this to the path of your file
    resultado = procesar_archivo_taxi_zones(ruta_archivo)

    print("🎉 Proceso completado con éxito" if resultado else "❌ Proceso fallido")

## Consultas en MySQL

In [None]:
import mysql.connector
from dotenv import load_dotenv
import os
import logging

# Logging configuration: write INFO-and-above records to database_query.log.
# logging.basicConfig() silently does nothing when the root logger already has
# handlers (for example after another cell of this notebook configured
# logging), so the root handlers are replaced explicitly. The file is opened as
# UTF-8 because the log messages contain emoji.
_log_handler = logging.FileHandler('database_query.log', encoding='utf-8')
_log_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s: %(message)s'))
_root_logger = logging.getLogger()
for _old_handler in list(_root_logger.handlers):
    _root_logger.removeHandler(_old_handler)
    _old_handler.close()
_root_logger.addHandler(_log_handler)
_root_logger.setLevel(logging.INFO)
logger = logging.getLogger(__name__)

class DatabaseManager:
    """Manage MySQL connections and run the record-count query."""

    def __init__(self):
        """Call load_dotenv() and read the connection settings from the environment."""
        load_dotenv()

        # Database credentials
        self.db_host = os.getenv('DB_HOST')
        self.db_user = os.getenv('DB_USER')
        self.db_password = os.getenv('DB_PASSWORD')
        self.db_name = os.getenv('DB_NAME')
        # The import cells of this notebook connect through DB_PORT; honour it
        # here too, falling back to MySQL's default port when it is not set.
        self.db_port = int(os.getenv('DB_PORT') or 3306)

    def _create_connection(self):
        """Open and return a new connection; connection errors are logged and re-raised."""
        try:
            conn = mysql.connector.connect(
                host=self.db_host,
                user=self.db_user,
                password=self.db_password,
                database=self.db_name,
                port=self.db_port
            )
            logger.info("🟢 Conexión establecida exitosamente")
            return conn
        except mysql.connector.Error as err:
            logger.error(f"🔴 Error de conexión: {err}")
            raise

    def count_records(self):
        """Count the rows of taxi_fhv_data whose source is 'U'.

        :return: The value of the first column of the COUNT(1) result row.
        :raises Exception: Any connection or query error, re-raised after
            being logged and printed.
        """
        conn = None
        cursor = None
        try:
            conn = self._create_connection()
            cursor = conn.cursor()

            # Run the COUNT(1) query
            query = "SELECT COUNT(1) FROM taxi_fhv_data WHERE source = 'U'"
            cursor.execute(query)
            count = cursor.fetchone()[0]
            logger.info(f"📈 Total de registros: {count}")
            print(f"📈 Total de registros: {count}")
            return count

        except Exception as e:
            logger.error(f"❌ Error en la consulta: {e}")
            print(f"❌ Error en la consulta: {e}")
            raise

        finally:
            # Resource cleanup; runs on success and on failure
            if cursor:
                cursor.close()
            if conn:
                conn.close()
                logger.info("🔌 Conexión cerrada")

            # Completion record (also emitted when the query failed)
            print("✅ Proceso de consulta completado")
            logger.info("✅ Proceso de consulta completado")

def main():
    """Run the record count and report the outcome on stdout."""
    manager = DatabaseManager()

    try:
        total = manager.count_records()
    except Exception as error:
        print(f"❌ El proceso falló: {error}")
    else:
        print(f"🎉 Conteo exitoso: {total} registros")

# Run the count only when this code is executed as the main program
if __name__ == "__main__":
    main()

## Eliminar registros con source 'U'

In [None]:
import mysql.connector
from dotenv import load_dotenv
import os
import time
import logging
from datetime import datetime, timedelta

# Configure logging: write INFO-and-above records to database_deletion.log.
# logging.basicConfig() silently does nothing when the root logger already has
# handlers (for example after another cell of this notebook configured
# logging), so the root handlers are replaced explicitly. The file is opened as
# UTF-8 because the log messages contain emoji.
_log_handler = logging.FileHandler('database_deletion.log', encoding='utf-8')
_log_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s: %(message)s'))
_root_logger = logging.getLogger()
for _old_handler in list(_root_logger.handlers):
    _root_logger.removeHandler(_old_handler)
    _old_handler.close()
_root_logger.addHandler(_log_handler)
_root_logger.setLevel(logging.INFO)
logger = logging.getLogger(__name__)

class DatabaseManager:
    """MySQL connection helper plus a batched purge of source='U' rows from taxi_fhv_data."""

    # Both statements take the window bounds as parameters instead of having
    # them formatted into the SQL text.
    _COUNT_QUERY = """
        SELECT COUNT(*) FROM taxi_fhv_data
        WHERE source = 'U'
        AND Pickup_datetime BETWEEN %s AND %s
    """
    _DELETE_QUERY = """
        DELETE FROM taxi_fhv_data
        WHERE source = 'U'
        AND Pickup_datetime BETWEEN %s AND %s
        LIMIT %s
    """

    # MySQL error number for "Lock wait timeout exceeded"
    _LOCK_WAIT_TIMEOUT = 1205

    def __init__(self):
        """Call load_dotenv() and read the connection settings from the environment."""
        load_dotenv()

        # Database credentials
        self.db_host = os.getenv('DB_HOST')
        self.db_user = os.getenv('DB_USER')
        self.db_password = os.getenv('DB_PASSWORD')
        self.db_name = os.getenv('DB_NAME')
        # The import cells of this notebook connect through DB_PORT; honour it
        # here too, falling back to MySQL's default port when it is not set.
        self.db_port = int(os.getenv('DB_PORT') or 3306)

    def _create_connection(self):
        """Open and return a new connection; connection errors are logged and re-raised."""
        try:
            conn = mysql.connector.connect(
                host=self.db_host,
                user=self.db_user,
                password=self.db_password,
                database=self.db_name,
                port=self.db_port
            )
            return conn
        except mysql.connector.Error as err:
            logger.error(f"🔴 Connection error: {err}")
            raise

    def _delete_batch_with_retry(self, conn, cursor, params, max_retries=5):
        """Run one DELETE batch and commit it, retrying lock-wait timeouts with exponential backoff.

        :param params: (window start, window end, batch size) for _DELETE_QUERY
        :return: Number of rows removed by the batch
        :raises mysql.connector.Error: Immediately for any error other than a
            lock-wait timeout, or the last timeout once max_retries attempts
            have all failed.
        """
        for attempt in range(max_retries):
            try:
                cursor.execute(self._DELETE_QUERY, params)
                rows_deleted = cursor.rowcount
                conn.commit()
                return rows_deleted
            except mysql.connector.Error as err:
                if err.errno != self._LOCK_WAIT_TIMEOUT:
                    raise
                # Undo the failed attempt before waiting
                conn.rollback()
                if attempt == max_retries - 1:
                    # Out of retries: surface the timeout instead of letting
                    # the caller continue with an undefined row count.
                    raise
                wait_time = 2 ** attempt
                logger.warning(f"⏳ Lock timeout. Retry {attempt + 1}/{max_retries}. Waiting {wait_time} seconds")
                print(f"⏳ Lock timeout. Retry {attempt + 1}/{max_retries}. Waiting {wait_time} seconds")
                time.sleep(wait_time)
        # Only reachable when max_retries <= 0: nothing was attempted
        return 0

    def delete_records_with_source_U_in_date_range(self, start_date, end_date, batch_size=100000, window_days=5):
        """
        Delete records where source is 'U' within a date range.

        The range is walked in consecutive windows of ``window_days`` days.
        Each window is emptied in batches of ``batch_size`` rows, committing
        after every batch. The last window is clamped to ``end_date`` so
        nothing beyond the requested range is touched, and each window starts
        where the previous one ended so no days are skipped.

        The bounds reach MySQL as 'YYYY-MM-DD' strings used with BETWEEN.
        NOTE(review): if Pickup_datetime is a DATETIME column, ``end_date``
        presumably matches only up to 00:00:00 of that day - confirm that this
        is the intended upper bound.

        :param start_date: Start date for the range in 'YYYY-MM-DD' format
        :param end_date: End date for the range in 'YYYY-MM-DD' format
        :param batch_size: Number of records to delete in each batch
        :param window_days: Length in days of each window (must be positive)
        :return: Total number of deleted rows
        :raises ValueError: If window_days is not positive or a date is malformed
        :raises Exception: Any database error, re-raised after rollback and logging
        """
        if window_days <= 0:
            raise ValueError("window_days must be a positive number of days")
        batch_size = int(batch_size)

        conn = None
        cursor = None
        total_deleted = 0
        date_format = '%Y-%m-%d'

        try:
            # Parse the bounds first so malformed input never opens a connection
            range_start = datetime.strptime(start_date, date_format)
            range_end = datetime.strptime(end_date, date_format)
            step = timedelta(days=window_days)

            conn = self._create_connection()
            conn.autocommit = False  # Batches are committed explicitly
            cursor = conn.cursor()

            # Print and log start of deletion process
            print("🚀 Starting deletion process...")
            logger.info("🚀 Starting deletion process...")

            window_start = range_start
            while window_start <= range_end:
                # Never let a window run past the requested end date
                window_end = min(window_start + step, range_end)
                first_day = window_start.strftime(date_format)
                last_day = window_end.strftime(date_format)

                # Estimate how many records this window holds
                cursor.execute(self._COUNT_QUERY, (first_day, last_day))
                total_records = cursor.fetchone()[0]
                logger.info(f"📊 Total records to delete from {first_day} to {last_day}: {total_records}")
                print(f"📊 Total records to delete from {first_day} to {last_day}: {total_records}")

                # Delete in batches to reduce lock contention
                while True:
                    rows_deleted = self._delete_batch_with_retry(
                        conn, cursor, (first_day, last_day, batch_size)
                    )
                    total_deleted += rows_deleted

                    logger.info(f"🗑️ Batch delete - Rows deleted from {first_day} to {last_day}: {rows_deleted}")
                    print(f"🗑️ Batch delete - Rows deleted from {first_day} to {last_day}: {rows_deleted}")

                    # The window is empty once a batch removes nothing
                    if rows_deleted == 0:
                        break

                if window_end >= range_end:
                    break
                # The next window starts exactly where this one ended
                window_start = window_end

            logger.info(f"📈 Total records deleted: {total_deleted}")
            print(f"📈 Total records deleted: {total_deleted}")
            return total_deleted

        except Exception as e:
            if conn:
                conn.rollback()
            logger.error(f"❌ Deletion error: {e}")
            print(f"❌ Deletion error: {e}")
            raise
        finally:
            # Ensure resources are closed
            if cursor:
                cursor.close()
            if conn:
                conn.close()

            # Print and log end of deletion process (also emitted on failure)
            print("✅ Deletion process completed.")
            logger.info("✅ Deletion process completed.")

def main():
    """Delete the source='U' records of the hard-coded date range and report the outcome on stdout."""
    manager = DatabaseManager()
    range_start, range_end = '2024-08-26', '2024-08-31'

    try:
        removed = manager.delete_records_with_source_U_in_date_range(range_start, range_end)
    except Exception as error:
        print(f"❌ Deletion process failed: {error}")
    else:
        print(f"✅ Successfully deleted {removed} records.")

# Run the deletion only when this code is executed as the main program
if __name__ == "__main__":
    main()

🚀 Starting deletion process...
