In [8]:
import json
import time

from kafka import KafkaProducer
from pyspark.sql import SparkSession

from logs.CustomLogger import CustomLogger

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Logger
We will set up a logger to log the events. This will help us debug the application and understand the flow of events.  
Logging can also prove very useful in case of errors and providing fault tolerance.

The logger is set up using the `CustomLogger` class from the `logs` package.
Every time the file is run, it creates a new log file with the current timestamp.

In [9]:
logger = CustomLogger('KafkaDistributedProducer')

Logger is set up. Check producer.log for logs.


# Producing data
Now that Kafka and Zookeeper are running, we can start producing data.

In [10]:
spark = SparkSession.builder.appName("KafkaProducer").getOrCreate()

In [11]:
producer = KafkaProducer(bootstrap_servers='localhost:9092', value_serializer=lambda v: json.dumps(v).encode('utf-8'))
topic = 'distributed_transactions'

In [12]:
data = spark.read.csv("./../dataset/creditcard.csv", header=True, inferSchema=True).cache()
data.show(5)

# Log the schema
logger.info(f"Schema: {data.schema}")

# Log the first 5 rows
logger.info(f"First 5 rows: {data.limit(5).collect()}")

24/12/06 20:42:54 WARN CacheManager: Asked to cache already cached data.        


+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|     

- [x] Spark session is created
- [x] Kafka producer is created
- [x] Data is read from the CSV file
- [x] Data is shown

Now, we will send the data to the Kafka topic.

# Sending data to Kafka
## Balancing the data
Given that the dataset is highly imbalanced, and also that we are working with a small subset of data (for testing), we will balance the data before sending it to Kafka.
This would ensure that the consumer receives a balanced dataset for prediction.

In [13]:
from pyspark.sql.functions import col

# Assuming your original DataFrame is named 'df' and the class column is named 'Class'

# Separate the DataFrame into two classes
class_0 = data.filter(col("Class") == 0)
class_1 = data.filter(col("Class") == 1)

# Calculate the number of rows to sample from each class
rows_per_class = 50

# Sample from each class
class_0_sampled = class_0.sample(withReplacement=True, fraction=rows_per_class/class_0.count(), seed=42)
class_1_sampled = class_1.sample(withReplacement=True, fraction=rows_per_class/class_1.count(), seed=42)

# Union the sampled DataFrames
balanced_df = class_0_sampled.union(class_1_sampled)

# Verify the balance and total count
print(balanced_df.groupBy("Class").count().show())
print(f"Total rows: {balanced_df.count()}")

# Log the balanced DataFrame
logger.info(f"Balanced DataFrame: {balanced_df.groupBy('Class').count().limit(5).collect()}")

+-----+-----+
|Class|count|
+-----+-----+
|    0|   43|
|    1|   60|
+-----+-----+

None
Total rows: 103


## Sending the data
Now that we have a balanced dataset, we can send it to the Kafka topic via the `KafkaProducer`.

In [14]:
counter = 0
for row in balanced_df.collect():
    row_dict = row.asDict()
    logger.info(f"Sending row {row_dict} {counter}")
    counter += 1
    producer.send(topic, value=row_dict)
    time.sleep(1)
producer.flush()
producer.close()