In [1]:
#!pip install pyspark
#!pip install google-cloud
#!pip install google-cloud-pubsub

In [2]:
#!wget https://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-8.0.27.tar.gz
#!tar -xvzf mysql-connector-java-8.0.27.tar.gz
#!ls mysql-connector-java-8.0.27

In [None]:
from pyspark import SparkConf, SparkContext, SQLContext
from time import sleep
import os
from google.cloud import pubsub_v1
from google.api_core.exceptions import NotFound
import threading
import json

In [None]:
# Export the path of the service-account key file through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
os.environ.update(
    {'GOOGLE_APPLICATION_CREDENTIALS': r'/content/drive/MyDrive/Colab Notebooks/PEA/service-account-admin.json'}
)

In [None]:
# Google Cloud project ID used when building Pub/Sub resource paths.
project_id = 'ingenieriadatos-392001'

# Pub/Sub topic and subscription for the customer ("clientes") records.
topic_id, subscription_name = 'clientes-topic', "clientes-subs"

# Pub/Sub topic and subscription for the transaction records.
topic_id_tr, subscription_name_tr = 'transactions-topic', "transactions-topic-subs"

In [None]:
# Funciones

# Subscriber handler: log each incoming message and acknowledge it.
def callback(message):
    """Print the payload of ``message``, acknowledge it, then print a confirmation.

    Args:
        message: object exposing a ``data`` attribute and an ``ack()`` method.
    """
    payload = message.data
    print(f"Mensaje recibido: {payload}")
    # Acknowledge the message before reporting the confirmation.
    message.ack()
    print("Mensaje confirmado.")

# Publish DataFrame rows to a Pub/Sub topic in newline-delimited JSON batches
# while a subscriber acknowledges messages arriving on a subscription.
def process_clientes(project_id, topic_id, subscription_name, df=None, batch_size=10,
                     publish_interval=1.0, listen_timeout=1.0):
    """Publish the rows of ``df`` to Pub/Sub and acknowledge a subscription.

    A streaming pull is opened on ``subscription_name`` with the module-level
    ``callback`` as its handler. When ``df`` is given, each collected row is
    serialised with ``json.dumps(row.asDict())`` and the rows are published to
    ``topic_id`` as messages holding up to ``batch_size`` newline-separated
    JSON documents. After publishing, the subscription stays open for
    ``listen_timeout`` seconds, then the streaming pull is cancelled and the
    subscriber client is closed.

    Args:
        project_id: Google Cloud project ID used to build the resource paths.
        topic_id: ID of the topic to publish to.
        subscription_name: ID of the subscription to pull from.
        df: optional object whose ``collect()`` returns rows exposing
            ``asDict()``; nothing is published when it is ``None``.
        batch_size: maximum number of rows joined into a single message.
        publish_interval: seconds to pause between consecutive publishes;
            ``0`` or ``None`` disables the pause.
        listen_timeout: seconds to keep the streaming pull open after
            publishing; ``None`` waits until the stream ends on its own.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
        Exception: whatever a publish future or the streaming pull raises;
            the subscriber is still cancelled and closed in that case.
    """
    from concurrent.futures import TimeoutError as FutureTimeoutError

    # A zero step would make range() fail and a negative one would silently
    # drop every row, so reject both up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    publisher = pubsub_v1.PublisherClient()
    subscriber = pubsub_v1.SubscriberClient()

    topic_path = publisher.topic_path(project_id, topic_id)
    subscription_path = subscriber.subscription_path(project_id, subscription_name)

    # subscribe() returns a future immediately and delivers messages in the
    # background, so no extra thread is needed; the future is kept so the
    # stream can be shut down explicitly instead of leaking on every call.
    streaming_pull = subscriber.subscribe(subscription_path, callback=callback)

    try:
        if df is not None:
            json_lines = [json.dumps(row.asDict()) for row in df.collect()]
            publish_futures = []
            for start in range(0, len(json_lines), batch_size):
                # Pace consecutive publishes (no pause before the first one).
                if publish_futures and publish_interval:
                    sleep(publish_interval)
                payload = "\n".join(json_lines[start:start + batch_size]).encode("utf-8")
                publish_futures.append(publisher.publish(topic_path, payload))

            # Resolve every publish future so failures surface here instead
            # of being silently discarded.
            for publish_future in publish_futures:
                publish_future.result()

        # Give the callback a bounded window to receive and ack messages.
        try:
            streaming_pull.result(timeout=listen_timeout)
        except FutureTimeoutError:
            pass  # Expected: the listening window simply elapsed.
    finally:
        # Always stop the background stream and release the client.
        streaming_pull.cancel()
        subscriber.close()

In [None]:
# Spark context and SQL entry point: local master, with the MySQL
# Connector/J jar added to the driver classpath.
conf = (
    SparkConf()
    .setAppName("AppName")
    .setMaster("local")
    .set(
        "spark.driver.extraClassPath",
        "/content/mysql-connector-java-8.0.27/mysql-connector-java-8.0.27.jar",
    )
)
sc = SparkContext.getOrCreate(conf=conf)
spark = SQLContext(sparkContext=sc)



In [None]:
# MySQL connection settings for the source database.
# NOTE(review): host, user and password are hard-coded in the notebook;
# consider loading them from environment variables or a secret store.
HOST = '147.182.246.47'
PORT = 3306
USERDB = 'user07'
DATABASE = 'databank'
PASSWORD = 'user07ABC'
URL = f'jdbc:mysql://{HOST}:{PORT}/{DATABASE}'

# Paging state: current OFFSET, rows fetched per page, and the OFFSET at
# which the scan stops even if more rows exist.
offset = 0
tam_proceso = 10
max_registros = 50000

while offset < max_registros:
    # ORDER BY id keeps LIMIT/OFFSET paging deterministic: without an explicit
    # ordering the server may return rows in any order, so successive pages
    # could repeat or skip customers.
    # NOTE(review): assumes clientes.id is unique — confirm against the schema.
    query = f"(SELECT id, sexo, DATE_FORMAT(fecha_alta, '%d-%m-%Y') as fecha_alta, ingreso, legal_person, age, nivel_educativo FROM clientes WHERE id IN (SELECT id_deposit_client FROM transaction) ORDER BY id LIMIT {tam_proceso} OFFSET {offset}) as clientes"
    # Cache the page so the count() below and the collect() performed inside
    # process_clientes share a single JDBC read instead of querying twice.
    df_clientes = spark.read.format("jdbc").options(
        url=URL,
        driver="com.mysql.cj.jdbc.Driver",
        dbtable=query,
        user=USERDB,
        password=PASSWORD).load().cache()

    try:
        # Stop as soon as a page comes back empty.
        if df_clientes.count() == 0:
            break
        # Publish this page to Pub/Sub.
        process_clientes(project_id, topic_id, subscription_name, df_clientes)
    finally:
        # Release the cached page whether or not publishing succeeded.
        df_clientes.unpersist()
    offset += tam_proceso
    sleep(1)

Mensaje recibido: b'{"id": 445, "sexo": "M", "fecha_alta": "30-08-1973", "ingreso": 0.0, "legal_person": false, "age": 68, "nivel_educativo": "1"}\n{"id": 446, "sexo": "M", "fecha_alta": "10-05-1961", "ingreso": 62.985465079274945, "legal_person": false, "age": 80, "nivel_educativo": "3"}\n{"id": 447, "sexo": "M", "fecha_alta": "16-03-1989", "ingreso": 0.0, "legal_person": false, "age": 52, "nivel_educativo": "3"}\n{"id": 448, "sexo": "M", "fecha_alta": "17-02-1975", "ingreso": 0.0, "legal_person": false, "age": 66, "nivel_educativo": "1"}\n{"id": 449, "sexo": "M", "fecha_alta": "15-06-2011", "ingreso": 5911.784006825921, "legal_person": false, "age": 30, "nivel_educativo": "4"}\n{"id": 450, "sexo": "M", "fecha_alta": "20-08-1981", "ingreso": 384.8085168162185, "legal_person": false, "age": 60, "nivel_educativo": "2"}\n{"id": 1586, "sexo": "M", "fecha_alta": "12-11-2000", "ingreso": 0.0, "legal_person": false, "age": 41, "nivel_educativo": "4"}\n{"id": 1588, "sexo": "M", "fecha_alta": 

KeyboardInterrupt: ignored