<a href="https://colab.research.google.com/github/ARUNAGIRINATHAN-K/Retail-Transaction-Analytics/blob/main/Retail_Transaction_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment bootstrap: fetch a Spark 3.5.3 distribution, install the Python
# bindings and a JDK, then start a local SparkSession named `spark`.

# Remove any previously extracted Spark folder. The trailing glob also matches
# an earlier `spark-3.5.3-bin-hadoop3.tgz` download, so the tarball is fetched
# again on every run of this cell.
!rm -rf spark-3.5.3-bin-hadoop3*

# Download the Spark 3.5.3 / Hadoop 3 build from the Apache archive (-q: quiet)
!wget -q https://archive.apache.org/dist/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz

# Extract into the current working directory
!tar -xzf spark-3.5.3-bin-hadoop3.tgz

# Install findspark + pyspark (pyspark pinned to the same version as the tarball)
!pip install -q findspark pyspark==3.5.3

# Install Java 11 (output discarded to keep the cell quiet)
!apt-get update -qq > /dev/null
!apt-get install -y -qq openjdk-11-jdk-headless > /dev/null

import os
# Point Spark at the JDK installed above and at the extracted distribution.
# NOTE(review): the SPARK_HOME path assumes the tarball was extracted under
# /content (the usual Colab working directory) — confirm when running elsewhere.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.3-bin-hadoop3"

# Must run after SPARK_HOME is set and before pyspark is imported below.
import findspark
findspark.init()

from pyspark.sql import SparkSession
# Single-machine session using all local cores, with adaptive query execution on.
# NOTE(review): 10g of driver memory may be more than the runtime can provide,
# and with master "local[*]" the executor memory setting presumably has no
# effect — confirm both against the target environment.
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Retail150k") \
    .config("spark.driver.memory", "10g") \
    .config("spark.executor.memory", "10g") \
    .config("spark.sql.adaptive.enabled", "true") \
    .getOrCreate()

# Reduce Spark log output to warnings and above
spark.sparkContext.setLogLevel("WARN")
print("Spark is ready! Version:", spark.version)

In [None]:
# Open the Colab upload dialog. The keys of the returned `uploaded` object are
# used by the load cell as the name of the CSV file to read.
from google.colab import files
uploaded = files.upload()  # Upload your CSV file (e.g., retail_personalization_dataset.csv)

Saving retail_personalization_dataset.csv to retail_personalization_dataset (1).csv


In [None]:
import pyspark.sql.functions as F

# Load the uploaded CSV into a Spark DataFrame named `df`.

# Fail with a clear message when nothing was uploaded (e.g. the dialog was
# cancelled) instead of an opaque IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded - re-run the upload cell and select a CSV file.")

# Only the first uploaded file is read.
filename = next(iter(uploaded))

# header=True takes column names from the first row; inferSchema=True lets
# Spark guess column types from the data.
df = spark.read.csv(filename, header=True, inferSchema=True)

# Normalise a header that contains a space, should one be present.
# withColumnRenamed leaves the frame untouched when the source column does not
# exist, so this is safe for files that already have clean headers.
# (The previous self-rename of "purchase" to "purchase" did nothing and was removed.)
df = df.withColumnRenamed("location price", "location_price")

print("Rows:", df.count())
df.printSchema()
df.show(5, truncate=False)

Rows: 150000
root
 |-- user_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- session_id: string (nullable = true)
 |-- interaction_type: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- location: string (nullable = true)
 |-- price: double (nullable = true)
 |-- discount: integer (nullable = true)
 |-- product_category: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- user_age: integer (nullable = true)
 |-- user_gender: string (nullable = true)
 |-- loyalty_score: integer (nullable = true)
 |-- previous_purchase_count: integer (nullable = true)
 |-- avg_purchase_value: double (nullable = true)
 |-- search_keywords: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- purchase: integer (nullable = true)

+-------+----------+-------------------+----------+----------------+-----------+-----------+------+--------+----------------+-----+--------+-----------+------

In [None]:
# CELL 4 – ETL + analytics: purchase-level revenue, daily and category
# rollups, FP-Growth market-basket mining and per-customer metrics.
# Outputs consumed by the dashboard: daily_pd, cat_pd, rules_pd, freq_pd, customer_pd.

from pyspark.ml.fpm import FPGrowth
import pyspark.sql.functions as F

# --- ETL ---
# Keep purchase rows only, then derive the time parts and net revenue.
# Filtering first makes a conditional on `purchase` inside the revenue
# expression unnecessary.
#
# `discount` is inferred as an INTEGER column, so it is treated as a percentage
# (20 -> 20 % off). Multiplying by `1 - discount` directly would make revenue
# negative for every discount above 1.
# NOTE(review): confirm the unit of `discount` against the dataset description.
clean_df = (
    df.filter(F.col("purchase") == 1)
    .withColumn("ts", F.to_timestamp("timestamp"))
    .withColumn("date", F.to_date("ts"))
    .withColumn("month", F.month("ts"))
    .withColumn("revenue", F.col("price") * (1 - F.col("discount") / 100.0))
    .cache()  # reused by every aggregation below
)

print(f"Total purchases: {clean_df.count():,}")

# 1. Daily sales: revenue and number of purchase rows per calendar day
daily = (
    clean_df.groupBy("date")
    .agg(F.sum("revenue").alias("revenue"),
         F.count("*").alias("transactions"))
    .orderBy("date")
)

# 2. Category monthly: one row per (month, product_category)
cat_monthly = (
    clean_df.groupBy("month", "product_category")
    .agg(F.sum("revenue").alias("revenue"))
    .orderBy("month", F.desc("revenue"))
)

# 3. Market basket: one basket per session, keeping sessions with 2+ distinct products.
# Cached because it is evaluated twice (the count below and the model fit).
basket_raw = (
    clean_df.groupBy("session_id")
    .agg(F.collect_set("product_id").alias("items"))
    .filter(F.size("items") >= 2)
    .cache()
)

print(f"Sessions with 2+ different products: {basket_raw.count()} out of {clean_df.select('session_id').distinct().count()} total sessions")

fp = FPGrowth(itemsCol="items", minSupport=0.002, minConfidence=0.1)
model = fp.fit(basket_raw)

rules = model.associationRules.orderBy(F.desc("confidence"), F.desc("lift"))
freq = model.freqItemsets.filter(F.size("items") > 1).orderBy(F.desc("freq"))

# Collect each result once and reuse it for both the printed counts and the
# dashboard frames, instead of running count() twice plus toPandas().
_rules_all = rules.toPandas()
_freq_all = freq.toPandas()

print(f"Association rules found: {len(_rules_all)}")
print(f"Frequent itemsets ≥2 found: {len(_freq_all)}")

# 4. Customer metrics
# recency_days is measured against the date the cell is run (current_date),
# so its values change from one run to the next.
customer = (
    clean_df.groupBy("user_id")
    .agg(F.sum("revenue").alias("total_spent"),
         F.countDistinct("session_id").alias("visits"),
         F.max("ts").alias("last_purchase"))
    .withColumn("recency_days", F.datediff(F.current_date(), "last_purchase"))
)

# Convert to Pandas for plotting; empty rule / itemset results stay None as before.
daily_pd = daily.toPandas()
cat_pd = cat_monthly.toPandas()
rules_pd = _rules_all if len(_rules_all) > 0 else None
freq_pd = _freq_all if len(_freq_all) > 0 else None
customer_pd = customer.toPandas()

Total purchases: 7,579
Sessions with 2+ different products: 1773 out of 5327 total sessions
Association rules found: 0
Frequent itemsets ≥2 found: 0


In [None]:
!pip install -q gradio plotly

In [None]:
# CELL 6 – UPDATED DASHBOARD (handles empty basket results gracefully)

import gradio as gr
import plotly.express as px
import pandas as pd

def trend():
    """Return the daily revenue area chart with transaction counts overlaid as bars.

    Reads the module-level ``daily_pd`` frame (columns ``date``, ``revenue``,
    ``transactions``).
    """
    chart = px.area(daily_pd, x="date", y="revenue", title="Daily Revenue Trend")
    chart.add_bar(
        x=daily_pd["date"],
        y=daily_pd["transactions"],
        name="Transactions",
        opacity=0.7,
    )
    return chart

def categories(month=None):
    """Return a bar chart of revenue per product category.

    Args:
        month: Month number to show, or None for all months combined.

    Returns:
        A plotly figure with one bar per product category.

    Reads the module-level ``cat_pd`` frame, which holds one row per
    (month, product_category).
    """
    if month is None:
        # Collapse the per-month rows into one total per category so each bar
        # (and its hover value) is the category total rather than a stack of
        # per-month fragments.
        data = (
            cat_pd.groupby("product_category", as_index=False)["revenue"]
            .sum()
            .sort_values("revenue", ascending=False)
        )
        label = "All"
    else:
        data = cat_pd[cat_pd["month"] == month]
        label = month

    fig = px.bar(data, x="product_category", y="revenue", color="product_category",
                 title=f"Revenue by Category – Month {label}")
    return fig

def get_market_basket_rules_data(): # Renamed function to clarify it returns data
    """Return the table shown in the association-rules tab.

    Yields the first 30 rows of the module-level ``rules_pd`` (rule columns
    only) when rules exist; otherwise a one-column ``Message`` frame explaining
    that none were found.
    """
    has_rules = rules_pd is not None and len(rules_pd) > 0
    if has_rules:
        rule_columns = ["antecedent", "consequent", "confidence", "lift", "support"]
        return rules_pd.head(30)[rule_columns]

    notes = [
        "No strong association rules found",
        "This is normal! Most real sessions have only 1 item",
        "See the 'Co-purchased Pairs' tab instead",
    ]
    return pd.DataFrame({"Message": notes})

def get_frequent_pairs_data(): # Renamed function for consistency
    """Return the first 30 rows of the module-level ``freq_pd``.

    Falls back to a one-row ``Message`` frame when ``freq_pd`` is None or
    empty, so the caller always receives a DataFrame.
    """
    if freq_pd is not None and len(freq_pd) != 0:
        return freq_pd.head(30)
    return pd.DataFrame({"Message": ["No frequent pairs found"]})

# Co-purchase counts taken straight from the session baskets (no FP-Growth):
# build each session's distinct product set, expand it into every (p1, p2)
# combination, keep p1 < p2 so each unordered pair is counted once and
# self-pairs are dropped, then count the sessions per pair.
pairs_simple = (
    clean_df
    .groupBy("session_id")
    .agg(F.collect_set("product_id").alias("items"))
    .where(F.size("items") > 1)
    .select(F.explode("items").alias("p1"), "items")
    .select("p1", F.explode("items").alias("p2"))
    .where(F.col("p1") < F.col("p2"))
    .groupBy("p1", "p2")
    .count()
    .orderBy(F.col("count").desc())
)
# Only the 50 most frequent pairs are brought to the driver for display.
top_pairs_pd = pairs_simple.limit(50).toPandas()

# Dashboard layout: one tab per analysis. Components appear in the order they
# are created inside each `with` block, so statement order here is the UI order.
# NOTE(review): get_frequent_pairs_data() is defined in this cell but is not
# displayed in any tab below.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Retail Transaction Analytics Dashboard")

    with gr.Tab("Daily Trend"):
        # The function itself is passed (not its result); presumably Gradio
        # calls it to produce the figure — confirm against the Gradio docs.
        gr.Plot(trend)

    with gr.Tab("Category Revenue"):
        # None stands for "all months"; 1-12 select a single month.
        month = gr.Dropdown([None]+list(range(1,13)), label="Select Month")
        # The dropdown is wired in as the input of `categories`.
        gr.Plot(categories, inputs=month)

    with gr.Tab("Association Rules (FP-Growth)"):
        # Called once here, so the table content is fixed when the UI is built.
        gr.Dataframe(value=get_market_basket_rules_data())

    with gr.Tab("Top Co-Purchased Pairs (Always Works)"):
        gr.Markdown("Simple count of products bought together in the same session")
        gr.Dataframe(top_pairs_pd)

    with gr.Tab("Customer Segments"):
        # Figure is built once at layout time from customer_pd; marker size and
        # colour both encode the number of visits.
        fig = px.scatter(customer_pd, x="recency_days", y="total_spent",
                         size="visits", color="visits", hover_data=["user_id"],
                         title="Customers: Recency vs Total Spent")
        gr.Plot(fig)

# share=True publishes the app on a public gradio.live URL (see the cell
# output), so the per-user data in the dashboard is reachable by anyone who
# has the link.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f55a616298e4fabd5b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


