<a href="https://colab.research.google.com/github/ARUNAGIRINATHAN-K/Retail-Transaction-Analytics/blob/main/Retail_Transaction_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Environment bootstrap: download Spark 3.5.3, install the Python bindings and a
# JDK, then start a local SparkSession bound to the name `spark`.

# Remove leftovers from a previous run. The trailing glob matches both the
# extracted directory and the downloaded .tgz, so every run re-downloads.
!rm -rf spark-3.5.3-bin-hadoop3*

# Download the Spark 3.5.3 (Hadoop 3 build) tarball from the Apache archive
!wget -q https://archive.apache.org/dist/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz

# Extract into the current working directory
!tar -xzf spark-3.5.3-bin-hadoop3.tgz

# Install findspark + pyspark (pyspark pinned to the same version as the tarball)
!pip install -q findspark pyspark==3.5.3

# Install Java 11. Only stdout is sent to /dev/null, so apt warnings written to
# stderr still appear in the cell output.
!apt-get update -qq > /dev/null
!apt-get install -y -qq openjdk-11-jdk-headless > /dev/null

# Both paths are hardcoded. SPARK_HOME assumes the tarball above was extracted
# under /content, i.e. that /content is the working directory -- TODO confirm
# when running outside Colab.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.3-bin-hadoop3"

# Must run after the environment variables above are set; presumably findspark
# uses SPARK_HOME to locate the Spark installation -- verify against its docs.
import findspark
findspark.init()

# Single-machine session using every local core (local[*]) with adaptive query
# execution switched on.
# NOTE(review): the master is local[*]; whether spark.executor.memory has any
# effect in that mode should be confirmed against the Spark configuration docs.
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Retail150k") \
    .config("spark.driver.memory", "10g") \
    .config("spark.executor.memory", "10g") \
    .config("spark.sql.adaptive.enabled", "true") \
    .getOrCreate()

# Reduce Spark's console logging to warnings and above, then confirm the version
spark.sparkContext.setLogLevel("WARN")
print("Spark is ready! Version:", spark.version)

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Spark is ready! Version: 3.5.3


In [8]:
# Open Colab's file-upload widget and keep its result in `uploaded`.
# The loading cell further down reads the first key of `uploaded` as the CSV
# filename, so upload the dataset file here before running it.
from google.colab import files
uploaded = files.upload()  # Choose the retail transactions CSV from your machine

Saving retail_personalization_dataset.csv to retail_personalization_dataset (1).csv


In [13]:
import pyspark.sql.functions as F

# Only the first uploaded file is used. Fail with a clear message when the
# upload was cancelled instead of an opaque IndexError on an empty mapping.
if not uploaded:
    raise ValueError("No file was uploaded - re-run the upload cell and choose a CSV file.")
filename = next(iter(uploaded))

# header=True: take column names from the first row.
# inferSchema=True: let Spark scan the data to pick column types.
df = spark.read.csv(filename, header=True, inferSchema=True)

# Normalise a header that may arrive as "location price". withColumnRenamed is
# a no-op when the source column does not exist, so this is safe on files that
# already have clean headers. (The former "purchase" -> "purchase" rename did
# nothing and has been dropped.)
df = df.withColumnRenamed("location price", "location_price")

# Quick sanity check: row count, inferred schema and a small sample.
print("Rows:", df.count())
df.printSchema()
df.show(5, truncate=False)

Rows: 150000
root
 |-- user_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- session_id: string (nullable = true)
 |-- interaction_type: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- location: string (nullable = true)
 |-- price: double (nullable = true)
 |-- discount: integer (nullable = true)
 |-- product_category: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- user_age: integer (nullable = true)
 |-- user_gender: string (nullable = true)
 |-- loyalty_score: integer (nullable = true)
 |-- previous_purchase_count: integer (nullable = true)
 |-- avg_purchase_value: double (nullable = true)
 |-- search_keywords: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- purchase: integer (nullable = true)

+-------+----------+-------------------+----------+----------------+-----------+-----------+------+--------+----------------+-----+--------+-----------+------

In [14]:
from pyspark.ml.fpm import FPGrowth

# Completed purchases only, enriched with time columns and net revenue.
#
# `discount` is inferred as an integer column, so it is treated as a percentage
# (e.g. 20 means 20% off). The earlier formula `price * (1 - discount)` produced
# zero or negative revenue for every discounted row.
# NOTE(review): assumes discount is expressed in percent points (0-100) --
# confirm against the dataset description.
# NOTE(review): `month` is 1-12 without the year, so the same month of
# different years is pooled together -- confirm the data spans a single year.
clean_df = (
    df
    .filter(F.col("purchase") == 1)
    .withColumn("ts", F.to_timestamp("timestamp"))
    .withColumn("date", F.to_date("ts"))
    .withColumn("month", F.month("ts"))
    .withColumn("revenue", F.col("price") * (1 - F.col("discount") / 100))
    .cache()  # reused by all four aggregations below
)

print("Actual purchases:", clean_df.count())

# 1. Daily sales: revenue and number of purchase rows per calendar day
daily = (
    clean_df.groupBy("date")
    .agg(F.sum("revenue").alias("revenue"),
         F.count("*").alias("transactions"))
    .orderBy("date")
)

# 2. Category monthly: revenue per (month, product_category), best sellers first
cat_monthly = (
    clean_df.groupBy("month", "product_category")
    .agg(F.sum("revenue").alias("revenue"))
    .orderBy("month", F.desc("revenue"))
)

# 3. Market Basket (FP-Growth): one basket per session; collect_set
#    de-duplicates product ids within a session.
basket = (
    clean_df.groupBy("session_id")
    .agg(F.collect_set("product_id").alias("items"))
)

fp = FPGrowth(itemsCol="items", minSupport=0.01, minConfidence=0.2)
model = fp.fit(basket)
rules = model.associationRules.orderBy(F.desc("confidence"))

# 4. Customer metrics: spend, distinct sessions and recency per user.
#    Recency is measured against today's date (current_date), so the values
#    grow when the notebook is re-run at a later time.
customer = (
    clean_df.groupBy("user_id")
    .agg(F.sum("revenue").alias("total_spent"),
         F.countDistinct("session_id").alias("visits"),
         F.max("ts").alias("last_purchase"))
    .withColumn("recency_days", F.datediff(F.current_date(), "last_purchase"))
)

# Collect the aggregates to the driver as pandas frames for plotting.
# Each toPandas() call is what actually executes the corresponding Spark job.
daily_pd = daily.toPandas()
cat_pd = cat_monthly.toPandas()
rules_pd = rules.toPandas()
customer_pd = customer.toPandas()

Actual purchases: 7579


In [15]:
# Dashboard dependencies imported by the next cell: Gradio (UI) and Plotly (charts).
# NOTE(review): versions are not pinned, so a later re-run may install newer releases.
!pip install -q gradio plotly

In [16]:
import gradio as gr
import plotly.express as px

def trend():
    """Build the "Daily Revenue" figure from ``daily_pd``.

    Revenue is drawn as an area chart over ``date``; the daily transaction
    count is added on top as a semi-transparent bar trace. Both traces share
    the same y-axis.

    Returns:
        The Plotly figure holding both traces.
    """
    revenue_area = px.area(daily_pd, x="date", y="revenue", title="Daily Revenue")
    revenue_area.add_bar(
        x=daily_pd["date"],
        y=daily_pd["transactions"],
        name="Transactions",
        opacity=0.6,
    )
    return revenue_area

def categories(month=None):
    """Bar chart of revenue per product category, read from ``cat_pd``.

    Args:
        month: Month number to display. ``None`` (also ``""``, ``"All"`` or
            ``"None"``) shows every month combined.

    Returns:
        A Plotly bar figure with one bar per product category.
    """
    if month in (None, "", "All", "None"):
        label = "All"
        # cat_pd has one row per (month, category). Collapse it to a single
        # total per category so the "All" view shows one bar per category
        # instead of a stack of per-month fragments.
        data = (
            cat_pd.groupby("product_category", as_index=False)["revenue"]
            .sum()
            .sort_values("revenue", ascending=False)
        )
    else:
        # Coerce in case the UI hands the selection back as a string; a
        # str-vs-int comparison would silently match no rows.
        month = int(month)
        label = month
        data = cat_pd[cat_pd["month"] == month]

    fig = px.bar(data, x="product_category", y="revenue", color="product_category",
                 title=f"Revenue by Category – Month {label}")
    return fig

def rules_tab():
    """Return the first 30 rows of ``rules_pd`` for display in the rules table."""
    top_rules = rules_pd.iloc[:30]
    return top_rules

def customers():
    """Scatter plot of ``customer_pd``: recency (x) against total spend (y).

    Marker size follows the ``visits`` column and ``user_id`` is added to the
    hover tooltip.

    Returns:
        The Plotly scatter figure.
    """
    return px.scatter(
        customer_pd,
        x="recency_days",
        y="total_spent",
        size="visits",
        hover_data=["user_id"],
        title="Customer Segments",
    )

# Four-tab dashboard. Each tab's component is given a callable, so Gradio
# computes its content by calling that function.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Retail Transaction Analytics Dashboard")
    gr.Markdown("150,000 rows • PySpark + Gradio • Fully working Nov 2025")

    with gr.Tab("Daily Trend"):
        gr.Plot(trend)

    with gr.Tab("Category Revenue"):
        # None stands for "all months"; 1-12 select a single month.
        month_picker = gr.Dropdown([None, *range(1, 13)], label="Month")
        gr.Plot(categories, inputs=month_picker)

    with gr.Tab("Market Basket Rules"):
        gr.Dataframe(rules_tab)

    with gr.Tab("Customer Segments"):
        gr.Plot(customers)

# share=True asks Gradio for a public URL in addition to the local server.
demo.launch(share=True, debug=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://290fea6ccb9cb960f2.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


