<a href="https://colab.research.google.com/github/ARUNAGIRINATHAN-K/Retail-Transaction-Analytics/blob/main/Retail_transaction_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# STEP 1: Install PySpark and ngrok in Colab
!pip install pyspark pyngrok -q

# STEP 2: Setup ngrok (get your authtoken from https://dashboard.ngrok.com/get-started/your-authtoken)
NGROK_AUTH_TOKEN = "36ezKIxPMhSIslCTqCaIAV8Od8M_5LfejcBq9MtTBb2ZZ7sGJ"  # ‚Üê CHANGE THIS
!ngrok authtoken $NGROK_AUTH_TOKEN

# STEP 3: Initialize Spark Session
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder \
    .appName("Retail Basket Analysis") \
    .master("local[*]") \
    .config("spark.ui.port", "4050") \
    .getOrCreate()

spark

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [2]:
# Option A: Upload CSV manually
from google.colab import files
uploaded = files.upload()  # upload your transactions.csv

# Option B: Use sample public dataset (Online Retail or Instacart-like)
!wget -q https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/master/data/retail-data/all/online-retail-dataset.csv

import pandas as pd
df_pandas = pd.read_csv("online-retail-dataset.csv")
# Or your uploaded file:
# df_pandas = pd.read_csv("your_retail_transactions.csv")

# Convert to Spark DataFrame
df = spark.createDataFrame(df_pandas)
df.show(5)

Saving retail_personalization_dataset.csv to retail_personalization_dataset.csv
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+---------

In [3]:
from pyspark.sql.functions import collect_set, col, explode, split, regexp_replace

# Clean and prepare basket format
basket_df = df \
    .withColumn("ItemsArray", split(regexp_replace(col("Description"), "[^a-zA-Z0-9 ]", ""), ",\\s*")) \
    .filter(col("ItemsArray").isNotNull()) \
    .groupBy("InvoiceNo") \
    .agg(collect_set("ItemsArray").alias("items"))

basket_df.show(5, truncate=False)

+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|InvoiceNo|items                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [4]:
from pyspark.ml.fpm import FPGrowth

fp = FPGrowth(itemsCol="items", minSupport=0.02, minConfidence=0.5)
model = fp.fit(basket_df)

# Most frequent itemsets
freq_itemsets = model.freqItemsets
freq_itemsets.orderBy(desc("freq")).show(20, truncate=False)

# Association Rules
rules = model.associationRules
rules.orderBy(desc("confidence")).show(20, truncate=False)

+-------------------------------------+----+
|items                                |freq|
+-------------------------------------+----+
|[[WHITE HANGING HEART TLIGHT HOLDER]]|2302|
|[[REGENCY CAKESTAND 3 TIER]]         |2169|
|[[JUMBO BAG RED RETROSPOT]]          |2135|
|[[PARTY BUNTING]]                    |1706|
|[[LUNCH BAG RED RETROSPOT]]          |1607|
|[[ASSORTED COLOUR BIRD ORNAMENT]]    |1467|
|[[SET OF 3 CAKE TINS PANTRY DESIGN ]]|1458|
|[[NaN]]                              |1454|
|[[PACK OF 72 RETROSPOT CAKE CASES]]  |1334|
|[[LUNCH BAG  BLACK SKULL]]           |1295|
|[[NATURAL SLATE HEART CHALKBOARD ]]  |1266|
|[[POSTAGE]]                          |1250|
|[[JUMBO BAG PINK POLKADOT]]          |1231|
|[[JAM MAKING SET WITH JARS]]         |1220|
|[[HEART OF WICKER SMALL]]            |1212|
|[[JUMBO STORAGE BAG SUKI]]           |1201|
|[[JUMBO SHOPPER VINTAGE RED PAISLEY]]|1187|
|[[JAM MAKING SET PRINTED]]           |1174|
|[[LUNCH BAG CARS BLUE]]              |1173|
|[[PAPER C

In [14]:
%%writefile app.py
import streamlit as st
import pandas as pd
from pyspark.sql import SparkSession

# Re-create Spark session inside Streamlit (or pass data)
spark = SparkSession.builder.appName("Dashboard").getOrCreate()

# Load pre-computed results (these files must exist from a previous step)
freq_df = spark.read.parquet("freq_itemsets.parquet").toPandas()
rules_df = spark.read.parquet("rules.parquet").toPandas()

st.title("Retail Transaction & Basket Analysis Dashboard")

tab1, tab2, tab3 = st.tabs(["Sales Overview", "Frequent Itemsets", "Association Rules"])

with tab1:
    st.header("Top Selling Products")
    # Add your sales charts here
    st.bar_chart(freq_df.head(20).set_index("items")["freq"])

with tab2:
    st.header("Top Frequent Itemsets")
    st.dataframe(freq_df.head(50), width='stretch')

with tab3:
    st.header("Strongest Association Rules")
    st.dataframe(rules_df.sort_values("confidence", ascending=False).head(50),
                 width='stretch')
    st.write("Example: If someone buys X ‚Üí likely to buy Y")

Overwriting app.py


In [8]:
# The saving logic has been moved to the Streamlit app cell (MNYNIT1KJVC7) to ensure correct execution order.

In [18]:
from pyngrok import ngrok
import subprocess
import time

# Kill any old tunnels
!killall ngrok 2>/dev/null

# Start Streamlit in background and redirect output to a log file
get_ipython().system_raw('streamlit run app.py --server.enableCORS=True --server.enableXsrfProtection=False > streamlit.log 2>&1 &')

# Wait a bit for Streamlit to start (increased time)
time.sleep(15)

# Create public tunnel, connecting to Streamlit's actual port (8504)
public_url = ngrok.connect(8504)
print("üöÄ Your Dashboard is LIVE here:")
print(public_url)

# Display Streamlit logs to help debug if connection still fails
print("\n--- Streamlit Log (for debugging) ---")
!cat streamlit.log
print("-----------------------------------")

üöÄ Your Dashboard is LIVE here:
NgrokTunnel: "https://barbara-rhotic-ignacia.ngrok-free.dev" -> "http://localhost:8503"

--- Streamlit Log (for debugging) ---

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8504
  Network URL: http://172.28.0.12:8504
  External URL: http://34.73.176.129:8504

-----------------------------------


In [15]:
# Save results so Streamlit can read them
model.freqItemsets.write.mode("overwrite").parquet("freq_itemsets.parquet")
model.associationRules.write.mode("overwrite").parquet("rules.parquet")