# 🔍 Log Analysis with PySpark
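This notebook loads application logs from HDFS with PySpark, parses each line into timestamp, level, component, and message fields, and reports log counts by level and by component, the average response time per component, and the distribution of log lines by hour of day.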

In [None]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, hour, regexp_extract, split, to_timestamp, when

# Build the notebook's Spark session under the application name "Log Analysis"
spark = (
    SparkSession.builder
    .appName("Log Analysis")
    .getOrCreate()
)


## 📥 Load Data from HDFS

In [None]:

# Location of the raw log files in HDFS (the glob matches every file in the day's directory)
LOG_PATH = "hdfs://localhost:9000/tmp/logGenED/25-07-19/*"

# spark.read.text returns a DataFrame with one string column named "value",
# one row per log line. Going through sparkContext.textFile(...).toDF(["value"])
# fails because Spark cannot infer a schema from an RDD of plain strings.
logs = spark.read.text(LOG_PATH)
logs.show(5, truncate=False)


## 🔄 Split and Structure Log Lines

In [None]:

# Split each log line into structured columns.
# The split is capped at 4 parts so that a message which itself contains " - "
# stays intact in the last part instead of being cut off at its first " - ".
# (The limit argument of split requires Spark 3.0 or later.)
parts = split(col("value"), " - ", 4)
logs_df = logs.select(
    "value",
    parts.getItem(0).alias("timestamp"),
    parts.getItem(1).alias("level"),
    parts.getItem(2).alias("component"),
    parts.getItem(3).alias("message"),
)
logs_df.show(5, truncate=False)


## 📊 Extract and Analyze Response Times

In [None]:

# Extract the response time in ms from the message: the digits directly before "ms".
# regexp_extract returns an empty string when the pattern does not match, so the
# cast is only applied when something was captured. Rows without a response time
# become NULL instead of failing the cast when ANSI SQL mode is enabled.
response_text = regexp_extract("message", r"(\d+)ms", 1)
logs_df = logs_df.withColumn(
    "response_ms",
    when(response_text != "", response_text.cast("int")),
)
# Keep the parsed frame in memory; the aggregations below all reuse it
logs_df.cache()


## 📈 Count Logs by Level

In [None]:

# Number of log lines for each value of the "level" column
level_counts = logs_df.groupBy("level").count()
level_counts.show()


## 📊 Count Logs by Component

In [None]:

# Number of log lines per component, largest count first
component_counts = logs_df.groupBy("component").count()
component_counts.orderBy(col("count").desc()).show()


## ⚡ Average Response Time per Component

In [None]:

# Mean of response_ms within each component, shown under the shorter name "avg_ms"
avg_by_component = logs_df.groupBy("component").agg({"response_ms": "avg"})
avg_by_component.withColumnRenamed("avg(response_ms)", "avg_ms").show()


## 🕒 Log Distribution by Hour

In [None]:

# Parse the textual timestamp column into a real timestamp ("ts"),
# then count log lines for each hour of the day
logs_df = logs_df.withColumn(
    "ts",
    to_timestamp(col("timestamp"), "yyyy-MM-dd HH:mm:ss"),
)
hourly_counts = logs_df.groupBy(hour(col("ts")).alias("hour")).count()
hourly_counts.orderBy("hour").show()
