# 1. Producing the data
In this task, we will implement Apache Kafka producers to simulate real-time data streaming. Spark is not allowed in this part since it’s simulating a streaming data source.  

1.	Your program should send one batch of browsing behaviour data every 5 seconds. One batch consists of a random number (500-1000) of rows from the browsing behaviour dataset. The CSV shouldn’t be loaded into memory all at once, to conserve memory (i.e. read rows as needed). Keep track of the start and end event_time. (You can assume the dataset is sorted by event_time.)  
2.	Add an integer column named ‘ts’ for each row, a Unix timestamp in seconds since the epoch. Spread your batch out evenly over 5 seconds.  
a.	For example, if you send a batch of 600 records at 2023-09-01 00:00:00 (ISO format: YYYY-MM-DD HH:MM:SS) -> (ts = 1693526400):  
-	Record 1-120: ts = 1693526400   
-	Record 121-240: ts = 1693526401   
-	Record 241-360: ts = 1693526402  
-	….  
3.	Read the transactions between the start and end event_time in 1.1 every 5 seconds (the same frequency as browsing behaviour) and create a batch.  
4.	Send your two batches from 1.1 and 1.3 to Kafka topics with an appropriate name.  
Note 1: In 1.1, “random 500-1000” means the number of rows is random, and the data file is still read sequentially.  
Note 2: All the data except for the ‘ts’ column should be sent in the original String type without changing to any other type. This is because we are simulating a streaming access log and need to reduce the required processing at the source.


In [None]:
from time import sleep
from json import dumps
from kafka3 import KafkaProducer
import random
import datetime as dt
import csv

# Configurations
# Kafka broker host name (or IP address); connect_kafka_producer() appends
# port 9092 to it when building the bootstrap server address.
hostip = "kafka"  # Replace with the correct Kafka service IP if needed

# Function to stream a CSV file as fixed-size lists of rows
def read_csv_chunk(file_name, batch_size):
    """Lazily yield the rows of a CSV file in lists of ``batch_size`` rows.

    Rows are parsed with ``csv.DictReader`` (the first line is the header), so
    each row is a dict keyed by column name. Only the rows of the batch being
    assembled are held in memory. The last batch may be shorter than
    ``batch_size``; a file with no data rows yields nothing.

    Args:
        file_name: Path of the CSV file to read.
        batch_size: Number of rows per yielded batch.

    Yields:
        A list of row dicts, in file order.
    """
    with open(file_name, 'r') as handle:
        pending = []
        for record in csv.DictReader(handle):
            pending.append(record)
            if len(pending) != batch_size:
                continue
            # Batch is full: hand it over and start a fresh list so the
            # caller's batch is never modified afterwards.
            yield pending
            pending = []
        # Whatever is left over at end of file forms a final, shorter batch.
        if pending:
            yield pending

# Function to collect the transactions whose created_at falls inside an event-time window
def get_transactions_in_time_range(file_name, start_event_time, end_event_time):
    """Return transactions created between two event times (both inclusive).

    The file is scanned from its first row on every call. Scanning stops as
    soon as 1000 matching rows have been collected, so at most 1000 rows are
    returned.

    Args:
        file_name: Path of the transactions CSV; needs a ``created_at`` column
            parseable by ``datetime.fromisoformat``.
        start_event_time: Lower bound, as an ISO-format string.
        end_event_time: Upper bound, as an ISO-format string.

    Returns:
        A list of row dicts in file order.
    """
    with open(file_name, 'r') as handle:
        # Parse the window bounds once so each row needs only one conversion.
        lower = dt.datetime.fromisoformat(start_event_time)
        upper = dt.datetime.fromisoformat(end_event_time)
        matched = []
        for record in csv.DictReader(handle):
            created = dt.datetime.fromisoformat(record['created_at'])
            if created < lower or created > upper:
                continue
            matched.append(record)
            # Hard cap on the size of a single transactions batch.
            if len(matched) >= 1000:
                break
    return matched

# Function to push a batch of rows to a Kafka topic, one message per row
def publish_message(producer_instance, topic_name, data_batch):
    """Send every row of ``data_batch`` to ``topic_name`` as its own message.

    Best-effort: any exception raised while sending (or while reporting the
    batch size) is printed rather than propagated, so the caller's loop keeps
    running. Rows already handed to the producer before the failure are not
    retracted.

    Args:
        producer_instance: Object exposing ``send(topic, value=...)``.
        topic_name: Name of the destination topic.
        data_batch: Sized iterable of rows to send.
    """
    try:
        for record in data_batch:
            producer_instance.send(topic_name, value=record)
        print(f'Message batch published to {topic_name}. Batch size: {len(data_batch)}')
    except Exception as error:
        print('Exception in publishing message.')
        print(str(error))

# Function to connect to the Kafka producer
def connect_kafka_producer():
    """Create a KafkaProducer for the broker at ``hostip`` on port 9092.

    Message values are serialized as UTF-8 encoded JSON.

    Returns:
        The connected ``KafkaProducer``, or ``None`` if creating it raised an
        ``Exception`` (the error is printed).
    """
    try:
        return KafkaProducer(
            bootstrap_servers=[f'{hostip}:9092'],
            value_serializer=lambda x: dumps(x).encode('utf-8'),
            api_version=(0, 10)
        )
    except Exception as ex:
        print('Exception while connecting to Kafka.')
        print(str(ex))
        # Returned from here rather than from a `finally` clause: a `return`
        # inside `finally` would also discard KeyboardInterrupt/SystemExit
        # raised while connecting.
        return None

# Function to simulate streaming data from CSV to Kafka
def simulate_streaming(browsing_file, transactions_file, producer):
    """Replay the browsing CSV to Kafka in random-sized batches, one every 5 seconds.

    The browsing file is opened once and consumed sequentially, so each batch
    continues where the previous one stopped (creating a new reader per batch
    would resend the first rows of the file forever). For every browsing
    batch, the transactions whose ``created_at`` lies between the batch's first
    and last ``event_time`` are fetched and published as a second batch.

    Args:
        browsing_file: Path of the browsing CSV; needs an ``event_time`` column.
        transactions_file: Path of the transactions CSV, passed to
            ``get_transactions_in_time_range``.
        producer: Producer handed to ``publish_message``; it is flushed once
            the browsing file is exhausted.

    Returns:
        None, after the last browsing row has been sent.
    """
    # Local import keeps this block self-contained; islice pulls the next N
    # rows from the shared reader without loading the rest of the file.
    from itertools import islice

    with open(browsing_file, 'r') as file:
        reader = csv.DictReader(file)

        while True:
            # 1. Browsing behavior batch: random row count, sequential read
            batch_size = random.randint(500, 1000)
            browsing_batch = list(islice(reader, batch_size))
            if not browsing_batch:
                # End of file (or no data rows at all): stop cleanly.
                print('No more browsing data to stream.')
                break

            # 1.1 Get the start and end event_time from browsing behavior
            start_event_time = browsing_batch[0]['event_time']
            end_event_time = browsing_batch[-1]['event_time']

            # Wall-clock send time (Unix seconds) used as the base for 'ts'
            base_ts = int(dt.datetime.now().timestamp())

            # Add 'ts' column and spread evenly over 5 seconds:
            # row i of n gets an offset of floor(i * 5 / n), i.e. 0..4.
            row_count = len(browsing_batch)
            for i, row in enumerate(browsing_batch):
                row['ts'] = base_ts + (i * 5 // row_count)
                # Ensure all other data is in string format (except 'ts')
                for key in row:
                    if key != 'ts':
                        row[key] = str(row[key])

            # Send browsing batch to Kafka
            publish_message(producer, 'browsing_behavior', browsing_batch)

            # 2. Transactions batch for the same event-time window
            transactions_batch = get_transactions_in_time_range(
                transactions_file, start_event_time, end_event_time
            )

            # Ensure all transaction data remains in string format
            for row in transactions_batch:
                for key in row:
                    row[key] = str(row[key])

            # Send transactions batch to Kafka
            publish_message(producer, 'transactions', transactions_batch)

            # Sleep for 5 seconds before sending the next batch
            sleep(5)

    # Sends are asynchronous; make sure buffered messages leave before returning.
    producer.flush()

if __name__ == "__main__":
    # Input CSV files that feed the two Kafka topics
    browsing_file = "new_browsing_behaviour.csv"
    transactions_file = "new_transactions.csv"

    # connect_kafka_producer() returns None when the connection fails
    producer = connect_kafka_producer()

    if producer is None:
        print("Failed to connect to Kafka producer")
    else:
        print("Starting streaming...")
        simulate_streaming(browsing_file, transactions_file, producer)


Starting streaming...
Message batch published to browsing_behavior. Batch size: 665
Message batch published to transactions. Batch size: 45
Message batch published to browsing_behavior. Batch size: 934
Message batch published to transactions. Batch size: 71
Message batch published to browsing_behavior. Batch size: 599
Message batch published to transactions. Batch size: 39
Message batch published to browsing_behavior. Batch size: 910
Message batch published to transactions. Batch size: 67
Message batch published to browsing_behavior. Batch size: 887
Message batch published to transactions. Batch size: 67
Message batch published to browsing_behavior. Batch size: 774
Message batch published to transactions. Batch size: 52
Message batch published to browsing_behavior. Batch size: 636
Message batch published to transactions. Batch size: 42
Message batch published to browsing_behavior. Batch size: 807
Message batch published to transactions. Batch size: 54
Message batch published to browsin

Message batch published to transactions. Batch size: 52
Message batch published to browsing_behavior. Batch size: 863
Message batch published to transactions. Batch size: 62
Message batch published to browsing_behavior. Batch size: 733
Message batch published to transactions. Batch size: 52
Message batch published to browsing_behavior. Batch size: 720
Message batch published to transactions. Batch size: 50
Message batch published to browsing_behavior. Batch size: 853
Message batch published to transactions. Batch size: 62
Message batch published to browsing_behavior. Batch size: 862
Message batch published to transactions. Batch size: 62
Message batch published to browsing_behavior. Batch size: 820
Message batch published to transactions. Batch size: 56
Message batch published to browsing_behavior. Batch size: 753
Message batch published to transactions. Batch size: 52
Message batch published to browsing_behavior. Batch size: 770
Message batch published to transactions. Batch size: 52


Message batch published to browsing_behavior. Batch size: 994
Message batch published to transactions. Batch size: 74
Message batch published to browsing_behavior. Batch size: 558
Message batch published to transactions. Batch size: 38
Message batch published to browsing_behavior. Batch size: 998
Message batch published to transactions. Batch size: 75
Message batch published to browsing_behavior. Batch size: 880
Message batch published to transactions. Batch size: 65
Message batch published to browsing_behavior. Batch size: 816
Message batch published to transactions. Batch size: 56
Message batch published to browsing_behavior. Batch size: 845
Message batch published to transactions. Batch size: 61
Message batch published to browsing_behavior. Batch size: 654
Message batch published to transactions. Batch size: 43
Message batch published to browsing_behavior. Batch size: 985
Message batch published to transactions. Batch size: 74
Message batch published to browsing_behavior. Batch size