[Reference](https://towardsdev.com/building-a-real-time-log-monitoring-system-with-kafka-and-spark-streaming-51b75ac8db04)

# Step 1: Building a Kafka Producer

In [1]:
import os
import time
from confluent_kafka import Producer
import json

# Setup configurations. Each value can be overridden through an environment
# variable so the cell runs unmodified on different machines; the defaults are
# the original hard-coded values.
# NOTE: the Spark consumer in Step 2 hard-codes "localhost:9092" and
# "log-topic", so any override of the broker/topic must be mirrored there.
data_dir = os.environ.get("LOG_DATA_DIR", "/path/to/log/files")  # directory path containing the log files
kafka_broker = os.environ.get("KAFKA_BROKER", "localhost:9092")  # Kafka broker
kafka_topic = os.environ.get("KAFKA_TOPIC", "log-topic")  # Kafka topic

# Create a producer to send data to Kafka
producer = Producer({
    'bootstrap.servers': kafka_broker,
    # Upper bound on the number of messages held in the client's local queue.
    'queue.buffering.max.messages': 10000000,
    'compression.type': 'zstd'  # Alternatives: 'gzip', 'snappy', 'lz4' or 'none'
})

def process_file(filepath):
    """Publish every line of a log file to Kafka as a JSON message.

    Each line is stripped of leading/trailing whitespace and sent to
    ``kafka_topic`` as ``{"log_line": <text>}``. The call returns only after
    the producer has flushed everything that was queued.

    Args:
        filepath: Path of the log file to read.

    Raises:
        OSError: If the file cannot be opened or read.
        BufferError: If the producer queue is still full after one flush-and-retry.
    """
    # errors='replace' keeps a single undecodable byte from aborting the file.
    with open(filepath, 'r', errors='replace') as file:
        for line in file:
            # Serialize the message to JSON
            message_json = json.dumps({'log_line': line.strip()})

            try:
                producer.produce(topic=kafka_topic, value=message_json)
            except BufferError:
                # The local queue is full: drain it, then retry this line once.
                producer.flush()
                producer.produce(topic=kafka_topic, value=message_json)

            # Service delivery events so the local queue can drain while we read.
            producer.poll(0)

    # Wait for the queued messages to be delivered (this does not close the producer).
    producer.flush()

# Names of the files that have already been published. The set lives only in
# memory, so restarting the kernel publishes every file again. A file is
# published once, the first time it is seen; lines appended later are not sent.
processed_files = set()

try:
    while True:
        # List the log files in the directory. Sorting makes the publish order
        # deterministic (os.listdir returns entries in arbitrary order) and the
        # isfile check skips directories whose name happens to end in '.log'.
        files = sorted(
            f for f in os.listdir(data_dir)
            if f.endswith('.log') and os.path.isfile(os.path.join(data_dir, f))
        )

        # Process any new files
        for file in files:
            if file not in processed_files:
                process_file(os.path.join(data_dir, file))
                processed_files.add(file)

        # Wait for a while before checking the directory again
        time.sleep(10)
except KeyboardInterrupt:
    # Interrupting the kernel is the only way to stop this cell, so treat it
    # as a normal shutdown rather than an error.
    pass
finally:
    # Push out anything still queued; the timeout (seconds) keeps an
    # unreachable broker from hanging the shutdown.
    producer.flush(10)

# Step 2: Building the Spark Streaming Application with Email Alerts

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType, StringType
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import json

# Obtain the Spark session for this application, named "Log Monitor".
spark = (
    SparkSession.builder
    .appName("Log Monitor")
    .getOrCreate()
)

# Define the UDF to search for 'ERROR' keyword
@udf(returnType=BooleanType())
def contains_error(line):
    """Tell whether a Kafka record's log line contains the text 'ERROR'.

    Args:
        line: JSON document (str, bytes or bytearray) expected to hold a
            'log_line' string field, as written by the producer cell.

    Returns:
        True when 'log_line' is a string containing 'ERROR'. False for
        everything else, including null values, payloads that are not valid
        JSON, and JSON without a string 'log_line' field.
    """
    # A record that cannot be interpreted is not an error line; returning
    # False keeps one bad message from failing the whole streaming query.
    if line is None:
        return False
    try:
        payload = json.loads(line)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return False
    if not isinstance(payload, dict):
        return False
    log_line = payload.get('log_line')
    return isinstance(log_line, str) and 'ERROR' in log_line

# Send email alert
@udf(returnType=StringType())
def send_email_alert(line):
    """Email an alert for one error record and report the outcome.

    The sender, recipient and password are read from the environment
    variables ALERT_EMAIL_FROM, ALERT_EMAIL_TO and ALERT_EMAIL_PASSWORD,
    falling back to the original placeholder values when unset.
    NOTE(review): the variables must be visible to the Python process that
    evaluates the UDF - confirm how the executors are configured.

    Args:
        line: JSON document (str, bytes or bytearray) with a 'log_line' field.

    Returns:
        'Email Alert sent.' on success, or 'Email Alert failed: <reason>'
        when the SMTP exchange could not be completed.
    """
    import os  # local import: this cell does not import os at the top

    error_line = json.loads(line)['log_line']

    sender = os.environ.get('ALERT_EMAIL_FROM', 'your-email@gmail.com')
    receiver = os.environ.get('ALERT_EMAIL_TO', 'receiver-email@gmail.com')
    password = os.environ.get('ALERT_EMAIL_PASSWORD', 'your-password')

    # Email content
    msg = MIMEMultipart()
    msg['From'] = sender
    msg['To'] = receiver
    msg['Subject'] = 'Log Monitor Alert'
    message = f'ERROR found: {error_line}'
    msg.attach(MIMEText(message))

    # Send the email. The context manager closes the connection even when a
    # step fails, and the timeout keeps a stalled server from blocking the
    # stream indefinitely.
    try:
        with smtplib.SMTP('smtp.gmail.com', 587, timeout=30) as mailserver:
            mailserver.ehlo()
            mailserver.starttls()
            mailserver.login(sender, password)
            mailserver.sendmail(sender, receiver, msg.as_string())
    except (smtplib.SMTPException, OSError) as exc:
        # Report the failure in the Alert column instead of failing the query.
        return f'Email Alert failed: {exc}'

    return 'Email Alert sent.'

# Open a streaming read on the Kafka topic written by the producer cell
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "log-topic")
    .load()
)

# Keep only the records whose payload contains 'ERROR'
errorLines = df.filter(contains_error(col("value")))

# Add an "Alert" column holding the status string returned by the email UDF
alert = errorLines.withColumn("Alert", send_email_alert(col("value")))

# Stream the 'ERROR' records and their alert status to the console
query = (
    alert.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

# Block this cell until the streaming query terminates
query.awaitTermination()