In [None]:
from pyspark.sql import SparkSession
from kafka import KafkaProducer
import csv
import threading
import subprocess
from kafka import KafkaConsumer
from pyspark.sql.types import StructType, StructField, DoubleType, StringType
import time
import json
from pyspark.sql import Row
from pyspark.sql.functions import split, col, from_json
import logging
from pyspark.sql.streaming import StreamingQueryListener
from pyspark.ml.classification import RandomForestClassificationModel

In [None]:
# Environment setup: a headless Java 8 JDK plus the Python client libraries used below.
# NOTE(review): the import cell above already imports `pyspark` and `kafka`; on a fresh
# runtime this cell has to be executed first (or moved above the imports), otherwise
# those imports fail before anything is installed.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q pyspark kafka-python



In [None]:
# Download and unpack Kafka 3.7.1 into ./kafka_2.13-3.7.1.
# Safe to re-run: the download and extraction are skipped when the directory already exists.
# The tarball is fetched from archive.apache.org, which keeps past releases available;
# downloads.apache.org only carries the currently supported versions.
![ -d kafka_2.13-3.7.1 ] || wget -q https://archive.apache.org/dist/kafka/3.7.1/kafka_2.13-3.7.1.tgz
# No -v: listing every extracted file floods the cell output.
![ -d kafka_2.13-3.7.1 ] || tar -xf kafka_2.13-3.7.1.tgz
!rm -f kafka_2.13-3.7.1.tgz

--2024-12-07 00:28:58--  https://downloads.apache.org/kafka/3.7.1/kafka_2.13-3.7.1.tgz
Resolving downloads.apache.org (downloads.apache.org)... 88.99.208.237, 135.181.214.104, 2a01:4f8:10a:39da::2, ...
Connecting to downloads.apache.org (downloads.apache.org)|88.99.208.237|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 120235344 (115M) [application/x-gzip]
Saving to: ‘kafka_2.13-3.7.1.tgz’


2024-12-07 00:29:04 (22.9 MB/s) - ‘kafka_2.13-3.7.1.tgz’ saved [120235344/120235344]

kafka_2.13-3.7.1/
kafka_2.13-3.7.1/LICENSE
kafka_2.13-3.7.1/NOTICE
kafka_2.13-3.7.1/bin/
kafka_2.13-3.7.1/bin/kafka-delete-records.sh
kafka_2.13-3.7.1/bin/trogdor.sh
kafka_2.13-3.7.1/bin/kafka-jmx.sh
kafka_2.13-3.7.1/bin/connect-mirror-maker.sh
kafka_2.13-3.7.1/bin/kafka-console-consumer.sh
kafka_2.13-3.7.1/bin/kafka-consumer-perf-test.sh
kafka_2.13-3.7.1/bin/kafka-log-dirs.sh
kafka_2.13-3.7.1/bin/kafka-metadata-quorum.sh
kafka_2.13-3.7.1/bin/zookeeper-server-stop.sh
kafka_2.13-3.7.1/bin

In [None]:
def run_zookeeper():
  """Start ZooKeeper with the zookeeper.properties shipped in the Kafka distribution."""
  # IPython shell escape; the path is relative to the notebook's working directory.
  # NOTE(review): presumably this call blocks for as long as the server runs, which is
  # why it is executed on its own thread below — confirm.
  !{'kafka_2.13-3.7.1/bin/zookeeper-server-start.sh kafka_2.13-3.7.1/config/zookeeper.properties'}

def run_kafka():
  """Start the Kafka broker with the server.properties shipped in the Kafka distribution."""
  # IPython shell escape; the path is relative to the notebook's working directory.
  !{'kafka_2.13-3.7.1/bin/kafka-server-start.sh kafka_2.13-3.7.1/config/server.properties'}

# One thread per server so the cell returns while both processes keep running.
zookeeper_thread = threading.Thread(target=run_zookeeper)
kafka_thread = threading.Thread(target=run_kafka)

# Both threads are started back to back; nothing here waits for ZooKeeper to be ready
# before the broker is launched.
# NOTE(review): confirm the broker's own connection retry covers that start-up gap.
zookeeper_thread.start()
kafka_thread.start()


In [None]:
# Kafka Configuration
KAFKA_BROKER = "localhost:9092"
TOPIC_NAME = "transactions"

def check_and_create_topic():
    """Create the Kafka topic ``TOPIC_NAME`` on ``KAFKA_BROKER`` unless it already exists.

    The existing topics are listed with ``kafka-topics.sh --list``; when ``TOPIC_NAME``
    is not among them it is created with one partition and replication factor 1.

    Nothing is returned. The outcome is reported with ``print``; a failing CLI call or
    a missing CLI script is printed (including the tool's stderr) instead of raised.
    """
    kafka_topics_cli = 'kafka_2.13-3.7.1/bin/kafka-topics.sh'

    try:
        # List existing topics
        result = subprocess.run(
            [kafka_topics_cli, '--list', '--bootstrap-server', KAFKA_BROKER],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, text=True
        )

        # Exact line match, so a topic such as "transactions-v2" does not count.
        if TOPIC_NAME in result.stdout.splitlines():
            print(f"Topic '{TOPIC_NAME}' already exists.")
            return

        print(f"Topic '{TOPIC_NAME}' does not exist. Creating topic...")
        # --if-not-exists keeps the call harmless if the topic appears between the
        # listing above and this create; stderr is captured so it can be reported.
        subprocess.run(
            [kafka_topics_cli, '--create', '--if-not-exists', '--topic', TOPIC_NAME,
             '--bootstrap-server', KAFKA_BROKER, '--partitions', '1', '--replication-factor', '1'],
            stderr=subprocess.PIPE, check=True, text=True
        )
        print(f"Topic '{TOPIC_NAME}' created successfully.")

    except subprocess.CalledProcessError as e:
        print(f"Error checking or creating topic: {e}")
        # The exception text only carries the exit status; the actual reason
        # (e.g. broker not reachable) is in the captured stderr.
        if e.stderr:
            print(e.stderr.strip())
    except FileNotFoundError as e:
        # Raised when the Kafka distribution has not been downloaded/extracted yet.
        print(f"Error checking or creating topic: {e}")

# Run the function to check and create the topic if necessary
check_and_create_topic()


[2024-12-07 00:40:53,022] INFO Reading configuration from: kafka_2.13-3.7.1/config/zookeeper.properties (org.apache.zookeeper.server.quorum.QuorumPeerConfig)
[2024-12-07 00:40:53,039] WARN kafka_2.13-3.7.1/config/zookeeper.properties is relative. Prepend ./ to indicate that you're sure! (org.apache.zookeeper.server.quorum.QuorumPeerConfig)
[2024-12-07 00:40:53,087] INFO clientPortAddress is 0.0.0.0:2181 (org.apache.zookeeper.server.quorum.QuorumPeerConfig)
[2024-12-07 00:40:53,090] INFO secureClientPort is not set (org.apache.zookeeper.server.quorum.QuorumPeerConfig)
[2024-12-07 00:40:53,093] INFO observerMasterPort is not set (org.apache.zookeeper.server.quorum.QuorumPeerConfig)
[2024-12-07 00:40:53,094] INFO metricsProvider.className is org.apache.zookeeper.metrics.impl.DefaultMetricsProvider (org.apache.zookeeper.server.quorum.QuorumPeerConfig)
[2024-12-07 00:40:53,110] INFO autopurge.snapRetainCount set to 3 (org.apache.zookeeper.server.DatadirCleanupManager)
[2024-12-07 00:40:53,1

In [None]:
# Kafka Configuration
KAFKA_BROKER = "localhost:9092"
TOPIC_NAME = "transactions"

csv_path = "/content/drive/MyDrive/credit-card-fraud-detection-pyspark/dataset/creditcard.csv"

# How many rows of the CSV to publish, and the pause between two consecutive sends.
NUM_ROWS_TO_SEND = 10
SEND_INTERVAL_SECONDS = 2

# Each message value is the JSON encoding of one row dict, as UTF-8 bytes.
producer = KafkaProducer(bootstrap_servers=KAFKA_BROKER,value_serializer=lambda v: json.dumps(v).encode('utf-8'))

spark = SparkSession.builder.appName("KafkaProducer").getOrCreate()

data = spark.read.csv(csv_path, header=True, inferSchema=True).cache()

# Publish the first NUM_ROWS_TO_SEND rows, one message per row.
# The previous loop (`range(1, 1)` with `data.take(i)[0]`) never ran, and would have
# re-sent the first row on every iteration if it had.
for row in data.take(NUM_ROWS_TO_SEND):
    producer.send(TOPIC_NAME, value=row.asDict())
    time.sleep(SEND_INTERVAL_SECONDS)
# send() is asynchronous; flush() blocks until the buffered messages are delivered.
producer.flush()


+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|     

In [None]:
# Kafka Configuration
KAFKA_BROKER = "localhost:9092"
TOPIC_NAME = "transactions"

spark = (
    SparkSession.builder
    .appName("KafkaConsumer")
    .getOrCreate()
)

# Transaction record layout: Time, V1..V28 and Amount are nullable doubles,
# followed by the Class column as a nullable string.
schema = StructType(
    [
        StructField(column, DoubleType(), True)
        for column in ("Time", *(f"V{n}" for n in range(1, 29)), "Amount")
    ]
    + [StructField("Class", StringType(), True)]
)

def consume_data_from_kafka():
    """Read the messages currently on ``TOPIC_NAME`` and return them as a Spark DataFrame.

    A consumer without a group id reads the topic from the earliest offset. Polling
    stops once no message has arrived for ``timeout`` seconds. Each message value is
    expected to be the UTF-8 JSON object written by the producer cell
    (``json.dumps`` of a row dict).

    Returns:
        A DataFrame parsed from the JSON payloads using the module-level ``schema``,
        or an empty list when no message was received.
    """
    consumer = KafkaConsumer(
        TOPIC_NAME,
        bootstrap_servers=KAFKA_BROKER,
        auto_offset_reset='earliest',
        group_id=None
    )

    last_message_time = time.time()
    timeout = 3  # Idle time in seconds after which polling stops
    messages_list = []  # Raw JSON payloads, one string per Kafka message

    try:
        while True:
            records = consumer.poll(timeout_ms=1000)  # Wait up to one second per poll
            if records:
                for messages in records.values():
                    for message in messages:
                        # Keep the payload as the JSON text it already is. Splitting it on
                        # ',' and ':' (as before) produced keys such as '{"Time"' and
                        # broke the JSON parsing below.
                        messages_list.append(message.value.decode('utf-8'))
                # Restart the idle clock so the timeout counts from the last message,
                # not from the start of the function.
                last_message_time = time.time()
            elif time.time() - last_message_time > timeout:
                break
    except Exception as e:
        # Best effort: report the problem and continue with whatever was collected.
        print(f"Error: {e}")
    finally:
        consumer.close()

    if messages_list:
        rdd = spark.sparkContext.parallelize(messages_list)
        # Parse the JSON strings into a DataFrame with the declared column types.
        raw_df = spark.read.json(rdd, schema=schema)

        return raw_df
    else:
        print("No data received from Kafka.")
        return []


In [None]:

# --- Settings for the consumer / scoring stage ---
KAFKA_BROKER = "localhost:9092"
TOPIC_NAME = "transactions"
CHECKPOINT_DIR = "/content/drive/MyDrive/credit-card-fraud-detection-pyspark/checkpoints"
RETRY_ATTEMPTS = 3

# Logging: INFO level on the root logger, plus a named logger for this stage.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("KafkaConsumerStream")

# Spark session for this stage.
spark = (
    SparkSession.builder
    .appName("KafkaConsumerFraudDetection")
    .getOrCreate()
)

# Load the previously saved random-forest classifier from Drive.
model_path = "/content/drive/MyDrive/credit-card-fraud-detection-pyspark/models/credit_card_fraud_detection_model"
loaded_model = RandomForestClassificationModel.load(model_path)

# Define a custom streaming query listener for monitoring
class PerformanceListener(StreamingQueryListener):
    """Streaming query listener that writes lifecycle and throughput figures to ``logger``."""

    def onQueryProgress(self, event):
        """Log batch id, batch duration and input/processing rates of a progress update."""
        progress = event.progress
        # StreamingQueryProgress exposes the batch time as `batchDuration` (milliseconds);
        # there is no `batchDurationMs` attribute, so the old lookup raised AttributeError.
        logger.info(f"Batch {progress.batchId} processed in {progress.batchDuration} ms.")
        logger.info(f"Input rows per second: {progress.inputRowsPerSecond}")
        logger.info(f"Processed rows per second: {progress.processedRowsPerSecond}")

    def onQueryStarted(self, event):
        """Log the id of a query that has just started."""
        logger.info(f"Query started: {event.id}")

    def onQueryTerminated(self, event):
        """Log the id of a query that has terminated."""
        logger.info(f"Query terminated: {event.id}")

# Register the listener on the session's streaming query manager.
spark.streams.addListener(PerformanceListener())


def process_streaming_data():
    """Consume the pending Kafka messages, score them with ``loaded_model`` and show the result.

    Raises:
        RuntimeError: if no messages were received from Kafka.
    """
    # Read data from Kafka
    transaction_data = consume_data_from_kafka()

    # consume_data_from_kafka() returns an empty list (not a DataFrame) when nothing
    # arrived. Fail with a clear message instead of passing the list to the model.
    if isinstance(transaction_data, list) and not transaction_data:
        raise RuntimeError("No data received from Kafka; nothing to score.")

    # Preprocess, as required
    # This is where data preprocessing can be added if necessary
    # NOTE(review): the recorded run output fails with "scaled_features does not exist",
    # so the model expects a `scaled_features` column that the raw Kafka columns do not
    # provide. The feature assembling/scaling stages used at training time need to be
    # applied here; they are not available in this notebook.

    # Predict, using the model
    predictions = loaded_model.transform(transaction_data)

    predictions.show()

def start_streaming():
    """Run ``process_streaming_data()``, retrying on failure.

    Up to ``RETRY_ATTEMPTS`` attempts are made. Any exception raised by an attempt is
    logged and followed by a short pause before the next attempt. If every attempt
    fails, a critical message is logged; the exception is not re-raised.
    """
    retry_delay_seconds = 5

    for attempt in range(RETRY_ATTEMPTS):
        try:
            logger.info("Starting streaming process...")
            process_streaming_data()  # Process the data
            break  # Exit loop if successful
        except Exception as e:
            logger.error(f"Attempt {attempt + 1} failed: {e}")
            # Only wait when another attempt follows; sleeping after the last one
            # just delays the final failure report.
            if attempt + 1 < RETRY_ATTEMPTS:
                time.sleep(retry_delay_seconds)
    else:
        # for/else: reached only when the loop finished without a successful `break`.
        logger.critical("All retry attempts failed. Exiting application.")


# Start streaming with retry logic
start_streaming()


ERROR:kafka.consumer.fetcher:Fetch to node 0 failed: Cancelled: <BrokerConnection node_id=0 host=3f67794cf15d:9092 <connected> [IPv4 ('172.28.0.2', 9092)]>
ERROR:KafkaConsumerStream:Attempt 1 failed: scaled_features does not exist. Available: Amount, Class, Time, V1, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V2, V20, V21, V22, V23, V24, V25, V26, V27, V28, V3, V4, V5, V6, V7, V8, V9
ERROR:kafka.consumer.fetcher:Fetch to node 0 failed: Cancelled: <BrokerConnection node_id=0 host=3f67794cf15d:9092 <connected> [IPv4 ('172.28.0.2', 9092)]>
ERROR:KafkaConsumerStream:Attempt 2 failed: scaled_features does not exist. Available: Amount, Class, Time, V1, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V2, V20, V21, V22, V23, V24, V25, V26, V27, V28, V3, V4, V5, V6, V7, V8, V9
ERROR:kafka.consumer.fetcher:Fetch to node 0 failed: Cancelled: <BrokerConnection node_id=0 host=3f67794cf15d:9092 <connected> [IPv4 ('172.28.0.2', 9092)]>
ERROR:KafkaConsumerStream:Attempt 3 failed: scaled_featu