# Databricks Notebook for UK Railway Data Analysis

**Objective**: Analyze railway ticket transactions to uncover insights on:
- Journey performance (on-time, delayed, cancelled)
- Revenue patterns and pricing
- Station performance metrics
- Customer behavior analysis

**Dataset**: 31,653 railway transactions with 18 features

In [0]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Get the active Spark session for this notebook, or create one under the
# application name "Spark DataFrames" when none exists yet.
spark = (
    SparkSession.builder
    .appName("Spark DataFrames")
    .getOrCreate()
)
# Disabled alternative, kept for reference: read the Unity Catalog table and
# convert it into a pandas DataFrame.
#spark_df = spark.read.table("workspace.default.railway")
#df = spark_df.toPandas()


In [0]:
# Explicit schema for the railway dataset: 18 nullable columns, listed in order.
# Dates are typed as DateType, Price as IntegerType; every other column,
# including the time-of-day fields, is kept as a plain string.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Transaction_ID", StringType()),
        ("Date_of_Purchase", DateType()),
        ("Time_of_Purchase", StringType()),
        ("Purchase_Type", StringType()),
        ("Payment_Method", StringType()),
        ("Railcard", StringType()),
        ("Ticket_Class", StringType()),
        ("Ticket_Type", StringType()),
        ("Price", IntegerType()),
        ("Departure_Station", StringType()),
        ("Arrival_Destination", StringType()),
        ("Date_of_Journey", DateType()),
        ("Departure_Time", StringType()),
        ("Arrival_Time", StringType()),
        ("Actual_Arrival_Time", StringType()),
        ("Journey_Status", StringType()),
        ("Reason_for_Delay", StringType()),
        ("Refund_Request", StringType()),
    )
])

In [0]:
# The railway data is read from the managed Unity Catalog table rather than
# from a DBFS path, which avoids DBFS_DISABLED errors.
railway_table = "workspace.default.railway"

df_spark = spark.read.table(railway_table)

In [0]:
# Report the dataset's dimensions, then print its schema.
# If counting fails, the error is printed together with the column names so
# the cell still produces something useful before the schema is shown.
try:
    row_total = df_spark.count()
    column_total = len(df_spark.columns)
    print(f"Dataset Shape: {row_total} rows x {column_total} columns")
except Exception as count_error:
    print("Could not count rows:", count_error)
    print(f"Columns: {df_spark.columns}")
df_spark.printSchema()

## Data Quality Check

In [0]:
# Count the NULL entries in every column with one select over the table.
from pyspark.sql.functions import col, sum as spark_sum, when, count

null_counters = []
for column_name in df_spark.columns:
    # when() without otherwise() yields NULL for non-matching rows, and
    # count() skips NULLs, so only the rows where the column is NULL are counted.
    null_marker = when(col(column_name).isNull(), column_name)
    null_counters.append(count(null_marker).alias(column_name))

missing_values = df_spark.select(null_counters)
missing_values.show()

## Key Business Metrics

In [0]:
# Journey status breakdown: number of rows per status and each status's share
# of all rows as a percentage rounded to 2 decimals, largest group first.
# NOTE(review): the schema cell spells this column "Journey_Status"; confirm
# which spelling the table actually uses.
status_groups = df_spark.groupBy("Journey Status")
total_journeys = df_spark.count()

# `round` here is the Spark column function brought in by the star import.
journeys_per_status = count("*")
share_of_total = round(journeys_per_status / total_journeys * 100, 2)

journey_status = status_groups.agg(
    journeys_per_status.alias("Count"),
    share_of_total.alias("Percentage"),
).orderBy(col("Count").desc())

journey_status.show()


In [0]:
# Revenue summary over the whole table: total, mean, highest and lowest Price.
# `sum`, `avg`, `max` and `min` are the Spark column functions from the star
# import of pyspark.sql.functions, not the Python builtins.
total_revenue = sum("Price").alias("Total_Revenue")
average_price = avg("Price").alias("Avg_Price")
highest_price = max("Price").alias("Max_Price")
lowest_price = min("Price").alias("Min_Price")

revenue_metrics = df_spark.agg(
    total_revenue,
    average_price,
    highest_price,
    lowest_price,
)
revenue_metrics.show()

In [0]:
# Purchase Type Distribution: number of transactions per purchase type.
# The alias is attached to the aggregate column so the result column is named
# "count". Calling .alias() on the DataFrame returned by .agg() only aliases
# the DataFrame and leaves the column with Spark's auto-generated name.
# NOTE(review): the schema cell spells this column "Purchase_Type"; confirm
# which spelling the table actually uses.
purchase_type = df_spark.groupby("Purchase Type").agg(count("*").alias("count"))
display(purchase_type)


In [0]:
# Payment Method Distribution: number of transactions per payment method.
# The alias is attached to the aggregate column so the result column is named
# "count". Calling .alias() on the DataFrame returned by .agg() only aliases
# the DataFrame and leaves the column with Spark's auto-generated name.
# NOTE(review): the schema cell spells this column "Payment_Method"; confirm
# which spelling the table actually uses.
payment_method = df_spark.groupby("Payment Method").agg(count("*").alias("count"))
display(payment_method)