In [None]:
2.How many fatal log entries that occurred on a Tuesday or Thursday resulted from a ”machine check interrupt”?

In [10]:
#!/usr/bin/env python

import datetime
from pyspark.sql import SparkSession

# Configuration variables
LOG_FILE_PATH = "file:///home/hduser/notebooks/BGL.log"  # Update the path to the log file as needed
APP_NAME = "FatalLogCount"  # Name of the Spark application
SPARK_MASTER = "local[*]"  # Spark master configuration
SPARK_DRIVER_HOST = "localhost"  # Spark driver host

# Initialize Spark session
spark = SparkSession.builder \
    .appName(APP_NAME) \
    .config("spark.driver.host", SPARK_DRIVER_HOST) \
    .master(SPARK_MASTER) \
    .getOrCreate()

# Load the log file into an RDD (Resilient Distributed Dataset)
log_rdd = spark.sparkContext.textFile(LOG_FILE_PATH)

# Function to check if the log entry is fatal and occurs on Tuesday or Thursday due to a machine check interrupt
def extract_fatal_errors(line):
    columns = line.split()  # Split the log line into columns
    if len(columns) > 5:  # Ensure there are enough columns to process
        date_str = columns[2]  # Extract the date string from the log entry
        message = " ".join(columns[6:])  # Extract the log message
        try:
            date = datetime.datetime.strptime(date_str, "%Y.%m.%d")  # Parse the date string
            day_of_week = date.weekday()  # Get the day of the week (0 = Monday, 6 = Sunday)
            if day_of_week in [1, 3]:  # Check if it's Tuesday (1) or Thursday (3)
                if "machine check interrupt" in message and "FATAL" in message:  # Check if the message contains specific keywords
                    return 1  # Return 1 for valid fatal errors
        except ValueError:
            return None  # Return None if there's a date parsing error
    return None  # Return None if conditions are not met

# Extract relevant data by mapping and filtering the log entries
fatal_error_rdd = log_rdd.map(extract_fatal_errors).filter(lambda x: x is not None)

# Count the number of fatal log entries that meet the criteria
fatal_error_count = fatal_error_rdd.count()

# Print the count of fatal log entries
print(f"Number of fatal log entries on Tuesday or Thursday due to a machine check interrupt: {fatal_error_count}")

# Stop the Spark session
spark.stop()


24/07/29 14:53:53 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

Number of fatal log entries on Tuesday or Thursday due to a machine check interrupt: 83


In [None]:
7.For each week, what is the average number of times ”ddr errors” were detected and corrected? Assume a week runs
from Monday to Sunday

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, weekofyear, count, avg, split

# Initialize Spark session
spark = SparkSession.builder \
    .appName("LogAnalysis") \
    .config("spark.driver.host", "localhost") \
    .master("local[*]") \
    .getOrCreate()

# Read the log file into a DataFrame
df_logs = spark.read.text("BGL.log")

# Define a function to parse the date and message from log lines
def parse_log_line(line):
    parts = line.split(" ")
    if len(parts) > 4:
        date_str = parts[2]  # Extract the date part from the line
        message = " ".join(parts[3:])  # Combine the rest as the message
        return (date_str, message)
    return None

# Register UDF (User Defined Function) to parse log lines
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, StructType, StructField, DateType

# Define the schema for the parsed data
schema = StructType([
    StructField("date", StringType(), True),
    StructField("message", StringType(), True)
])

# Create a UDF with the defined schema
parse_log_udf = udf(lambda line: parse_log_line(line), schema)

# Apply the UDF to create a structured DataFrame with 'date' and 'message' columns
df_parsed = df_logs.select(parse_log_udf(col("value")).alias("parsed")) \
    .select("parsed.*") \
    .filter(col("message").contains("ddr errors"))  # Filter rows containing 'ddr errors'

# Convert 'date' column to DateType and extract the week of the year
df_parsed = df_parsed.withColumn("date", col("date").cast(DateType()))

# Group by week of the year and count occurrences of DDR errors
df_weekly_ddr_errors = df_parsed.groupBy(weekofyear(col('date')).alias('week')) \
    .agg(count('*').alias('ddr_error_count'))

# Calculate the average number of DDR errors per week
df_weekly_avg = df_weekly_ddr_errors.agg(avg('ddr_error_count').alias('avg_ddr_errors_per_week'))

# Show the result
df_weekly_avg.show()

# Stop the Spark session
spark.stop()


                                                                                

+-----------------------+
|avg_ddr_errors_per_week|
+-----------------------+
|                33939.0|
+-----------------------+



In [None]:
10. What are the top 3 most frequently occurring days of the week in the log?

In [12]:
#!/usr/bin/env python

import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Configuration variables
LOG_FILE_PATH = "file:///home/hduser/notebooks/BGL.log"  # Path to the log file
APP_NAME = "TopDaysOfWeek"  
SPARK_MASTER = "local[*]"  
SPARK_DRIVER_HOST = "localhost"  # Spark driver host

# Initialize Spark session
spark = SparkSession.builder \
    .appName(APP_NAME) \
    .config("spark.driver.host", SPARK_DRIVER_HOST) \
    .master(SPARK_MASTER) \
    .getOrCreate()

# Read the log file into an RDD
log_rdd = spark.sparkContext.textFile(LOG_FILE_PATH)

# Function to extract day of the week from log entry
def extract_day_of_week(line):
    columns = line.split()  # Split the line into columns
    if len(columns) > 5:  # Check if the line has more than 5 columns
        date_str = columns[2]  # Extract the date string
        try:
            # Parse the date string into a datetime object
            date = datetime.datetime.strptime(date_str, "%Y.%m.%d")
            # Get the full weekday name (e.g., 'Monday')
            day_of_week = date.strftime('%A')
            return (day_of_week, 1)  # Return a tuple of the day and a count of 1
        except ValueError:
            return None  # Return None if date parsing fails
    return None  # Return None if the line does not have more than 5 columns

# Extract day of the week from each log entry and filter out None values
days_rdd = log_rdd.map(extract_day_of_week).filter(lambda x: x is not None)

# Count occurrences of each day of the week
day_counts_rdd = days_rdd.reduceByKey(lambda x, y: x + y)

# Get the top 3 most frequently occurring days
top_days = day_counts_rdd.takeOrdered(3, key=lambda x: -x[1])

# Convert the result to a DataFrame
top_days_df = spark.createDataFrame(top_days, ["day_of_week", "count"])

# Print the message
print("Top 3 most frequently occurring days of the week:")

# Show the result in the required format
top_days_df.orderBy(col("count").desc()).show()

# Stop the Spark session
spark.stop()


                                                                                

Top 3 most frequently occurring days of the week:
+-----------+------+
|day_of_week| count|
+-----------+------+
|   Saturday|932934|
|   Thursday|914707|
|    Tuesday|829180|
+-----------+------+



In [None]:
14. Which node generated the largest number of KERNRTSP events?

In [13]:
#!/usr/bin/env python

import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_extract

# Configuration variables
LOG_FILE_PATH = "file:///home/hduser/notebooks/BGL.log"  # Path to the log file
APP_NAME = "NodeWithMaxKERNRTSP"
SPARK_MASTER = "local[*]"
SPARK_DRIVER_HOST = "localhost"

# Initialize Spark session
spark = SparkSession.builder \
    .appName(APP_NAME) \
    .config("spark.driver.host", SPARK_DRIVER_HOST) \
    .master(SPARK_MASTER) \
    .getOrCreate()

# Read the log file into an RDD
log_rdd = spark.sparkContext.textFile(LOG_FILE_PATH)

# Function to extract node from KERNRTSP log entry
def extract_node_from_kernrtsp(line):
    if 'KERNRTSP' in line:  # Check if the line contains 'KERNRTSP'
        columns = line.split()  # this split the line into columns
        if len(columns) > 3:  # it ensure there are enough columns
            node = columns[3]  # Extract the node information
            return (node, 1)  # Return a tuple with node and count 1
    return None  # Return None if not a valid KERNRTSP entry

# Extract nodes from KERNRTSP log entries
nodes_rdd = log_rdd.map(extract_node_from_kernrtsp).filter(lambda x: x is not None)

# Count occurrences of KERNRTSP events per node
node_counts_rdd = nodes_rdd.reduceByKey(lambda x, y: x + y)

# Find the node with the maximum KERNRTSP events
node_with_max_kernrtsp = node_counts_rdd.takeOrdered(1, key=lambda x: -x[1])

# Convert the result to a DataFrame
node_with_max_kernrtsp_df = spark.createDataFrame(node_with_max_kernrtsp, ["node", "count"])

# Print the result
print("Node with the largest number of KERNRTSP events:")
node_with_max_kernrtsp_df.show()

# Stop the Spark session
spark.stop()


                                                                                

Node with the largest number of KERNRTSP events:
+-------------------+-----+
|               node|count|
+-------------------+-----+
|R63-M0-NE-C:J12-U01|   22|
+-------------------+-----+



In [None]:
18. On which date and time was the earliest fatal kernel error where the message contains ”Lustre mount FAILED”?

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_extract, to_timestamp

# Initialize SparkSession with specific configurations for memory and cores
spark = SparkSession.builder \
    .appName("BGLLogAnalysis") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .getOrCreate()

# Read the log data from the text file
log_data = spark.read.text("BGL.log")

# Define regex patterns for "Lustre mount FAILED" and "FATAL"
lustre_mount_failed_pattern = r'(?i)Lustre mount FAILED'
fatal_pattern = r'(?i)FATAL'

# Filter log entries that contain both "Lustre mount FAILED" and "FATAL"
fatal_lustre_errors = log_data.filter(
    col("value").rlike(lustre_mount_failed_pattern) & col("value").rlike(fatal_pattern)
)

# this section will extract datetime from the log entries and convert to timestamp
fatal_lustre_errors = fatal_lustre_errors.withColumn(
    "datetime", regexp_extract(col("value"), r'(\d{4}-\d{2}-\d{2}-\d{2}\.\d{2}\.\d{2})', 1)
).withColumn(
    "datetime", to_timestamp(col("datetime"), "yyyy-MM-dd-HH.mm.ss")
)

# Find the earliest datetime entry from the filtered log entries
earliest_error = fatal_lustre_errors.orderBy("datetime").first()

# Print the earliest fatal error datetime, if any
if earliest_error:
    print(f"Earliest fatal kernel error with 'Lustre mount FAILED': {earliest_error['datetime']}")
else:
    print("No fatal kernel errors with 'Lustre mount FAILED' found.")




Earliest fatal kernel error with 'Lustre mount FAILED': 2005-08-03 15:35:34


                                                                                