In [1]:
!pip install psycopg2 --break-system-packages

Defaulting to user installation because normal site-packages is not writeable


# Consumer for sales data stream

This notebook creates a consumer subscribed to the `sales_topic` Kafka topic, which carries the retail company's sales data, and stores each record in the data warehouse.

In [2]:
from json import loads
from confluent_kafka import Consumer, KafkaException, KafkaError

import pandas as pd
from dotenv import load_dotenv
import os

import psycopg2
from psycopg2 import sql

In [3]:
# Load settings from a local .env file into the process environment; the DB_* and
# KAFKA_SERVER values read with os.getenv in later cells are expected to come from it.
load_dotenv()

True

### Defining the connection to the cloud data warehouse

In [4]:
# Keyword arguments for psycopg2.connect, each read from an environment variable.
# NOTE(review): the password is read from 'DB_PASSPORT' -- possibly a typo for
# 'DB_PASSWORD'; confirm against the .env file before renaming.
db_params = dict(
    dbname=os.getenv('DB_NAME'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSPORT'),
    host=os.getenv('DB_HOST'),
    port=os.getenv('DB_PORT'),
)

In [5]:
def insert_row(sales_record):
    """
    Insert one sales row into the data warehouse over a PostgreSQL connection.

    A new connection is opened from `db_params` on every call. After the
    INSERT is committed, the row is read back (matched on site_code, sku and
    date) and the outcome is printed. Errors raised inside the try block are
    printed and the transaction is rolled back; they are not re-raised.

    Params:
    sales_record (dict): Row record of sales, processed before. Must contain
        'sku', 'site_code', 'quantity' and 'date'; 'date' must provide
        strftime (e.g. a datetime or pandas Timestamp).
    """
    # Initialised up front so the cleanup below is safe even when
    # psycopg2.connect (or conn.cursor) fails before these are assigned.
    conn = None
    cursor = None
    try:
        conn = psycopg2.connect(**db_params)
        cursor = conn.cursor()

        # Format the date once; it is used for both the insert and the check.
        sale_date = sales_record['date'].strftime('%Y-%m-%d')

        query = """
            INSERT INTO sales (sku, site_code, quantity, date) 
            VALUES (%s, %s, %s, %s)
        """

        cursor.execute(query, (
            sales_record['sku'],
            sales_record['site_code'],
            sales_record['quantity'],
            sale_date,
        ))

        conn.commit()

        # Read the row back to confirm the insert is visible.
        verify_query = """
            SELECT * FROM sales 
            WHERE site_code = %s AND sku = %s AND date = %s
        """
        cursor.execute(verify_query, (
            sales_record['site_code'],
            sales_record['sku'],
            sale_date,
        ))

        inserted_record = cursor.fetchone()

        if inserted_record:
            print(f"Inserted record successfully: {inserted_record}")
        else:
            print("Record not found after insertion.")
    except Exception as e:
        print(e)
        # Only roll back when a connection was actually established.
        if conn is not None:
            conn.rollback()
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

### Kafka consumer for sales topic

In [6]:
# Kafka consumer for the sales topic. The broker address is read from the
# KAFKA_SERVER environment variable.
consumer = Consumer({
    'bootstrap.servers': os.getenv('KAFKA_SERVER'),
    'group.id': 'sales_consumer_group',
    # Start from the oldest available message when the group has no committed offset.
    'auto.offset.reset': 'earliest',
    # Offsets are committed explicitly after each message has been processed
    # (consumer.commit in consume_messages), so the background auto-commit is
    # disabled to avoid committing an offset before its message is handled.
    'enable.auto.commit': False,
})

topic = 'sales_topic'
consumer.subscribe([topic])

In [7]:
def parse_date(date):
    """
    Convert a raw date value with pandas, falling back to the current date.

    Params:
    date: Value accepted by pd.to_datetime (e.g. a date string), or None.

    Returns:
    The parsed value, or pd.to_datetime('today') when the input parses to
    None or to a missing value (NaT).
    """
    parsed = pd.to_datetime(date)
    is_missing = parsed is None or pd.isna(parsed)
    return pd.to_datetime('today') if is_missing else parsed

In [8]:
def pipeline(data):
    """
    Normalise one sales record in place and return it.

    - Non-dict input yields an empty dict.
    - A missing 'date' defaults to 'today'; the value is then run through
      parse_date.
    - A missing or None 'quantity' defaults to 1; negative values become 0.
    - 'site_code' is upper-cased; a non-None 'store' value is treated as an
      alias and, being checked last, overrides 'site_code' when both exist.

    Params:
    data (dict): Decoded sales message. It is mutated.

    Returns:
    dict: The same dict object after normalisation, or {} for non-dict input.
    """
    if not isinstance(data, dict):
        return {}

    data.setdefault('date', 'today')

    if data.get('quantity') is None:
        data['quantity'] = 1

    data['date'] = parse_date(data['date'])

    # Later aliases win: 'store' is applied after 'site_code'.
    for alias in ('site_code', 'store'):
        raw_site = data.get(alias)
        if raw_site is not None:
            data['site_code'] = raw_site.upper()

    # Clamp negative quantities to zero.
    data['quantity'] = max(data['quantity'], 0)

    return data

In [9]:
def process_message(msg):
    """
    Decode one Kafka message, normalise it and store it in the database.

    The payload is decoded as UTF-8 JSON and passed through `pipeline`. The
    record is handed to `insert_row` only when 'sku', 'date', 'quantity' and
    'site_code' are all present and not None. Messages with no payload, and
    records missing a required field, are reported as skipped rather than
    logged as processed. Any exception is printed and swallowed, so this
    function does not raise.

    Params:
    msg (confluent_kafka.Message): Message to process
    """
    required = ('sku', 'date', 'quantity', 'site_code')
    try:
        payload = msg.value()
        if payload is None:
            # A message can carry no value; there is nothing to decode.
            print('Message skipped: empty payload')
            return

        sales_record = pipeline(loads(payload.decode('utf-8')))

        missing = [key for key in required if sales_record.get(key) is None]
        if missing:
            print(f'Message skipped, missing fields {missing}: {sales_record}')
            return

        insert_row(sales_record)
        print(f'Message processed: {sales_record}')
    except Exception as e:
        print(e)

In [10]:
def consume_messages():
    """
    Poll the module-level `consumer` in an endless loop and process messages.

    Each successfully polled message is passed to `process_message` and its
    offset is then committed synchronously. End-of-partition events are
    logged; any other Kafka error is raised as KafkaException. The loop ends
    on KeyboardInterrupt (or on a raised error), and the consumer is always
    closed on exit, so a new Consumer must be created before calling again.

    Raises:
    KafkaException: For any message error other than end-of-partition.
    """
    try:
        while True:
            msg = consumer.poll(timeout=1.0)

            if msg is None:
                continue  # No new message, keep polling

            error = msg.error()
            if error:
                if error.code() == KafkaError._PARTITION_EOF:
                    # partition() and offset() are methods and must be called.
                    print(f"End of partition reached {msg.partition()}, offset {msg.offset()}")
                else:
                    raise KafkaException(error)
            else:
                process_message(msg)
                # Commit only after the message has been handled.
                consumer.commit(message=msg, asynchronous=False)
    except KeyboardInterrupt:
        print("Consuming interrupted.")
    finally:
        consumer.close()

In [11]:
# Blocks until interrupted (the polling loop has no other normal exit). The consumer
# is closed when the call returns, so re-create it before running this cell again.
consume_messages()

Consuming interrupted.
