In [1]:
import happybase
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType


In [2]:
# Obtain the Spark session used by every later cell; getOrCreate() hands back
# an already-active session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("HBase to Spark SQL")
    .getOrCreate()
)


In [3]:
# Column layout for the DataFrame built from the HBase scan:
# a string product id and a double amount, both nullable.
schema = (
    StructType()
    .add("productId", StringType(), True)
    .add("totalAmount", DoubleType(), True)
)

In [6]:
def fetch_hbase_data(thrift_server, port, namespace, table_name):
    """Scan an HBase table through Thrift and return it as a Spark DataFrame.

    Every row of ``namespace:table_name`` is read in full. The row key is
    decoded as UTF-8 and used as ``productId``; the cell
    ``transaction_data:totalAmount`` is converted with ``float()`` and used as
    ``totalAmount`` (rows lacking that cell get ``0.0``).

    Relies on the notebook-level ``spark`` session and ``schema`` defined in
    earlier cells.

    Args:
        thrift_server: Host name of the HBase Thrift server.
        port: TCP port of the Thrift server.
        namespace: HBase namespace containing the table.
        table_name: Table name inside ``namespace``.

    Returns:
        A Spark DataFrame with columns ``productId`` and ``totalAmount``.

    Raises:
        ValueError: If a stored ``totalAmount`` cell cannot be parsed by
            ``float()``.
    """
    connection = happybase.Connection(thrift_server, port=port)
    try:
        table = connection.table(f"{namespace}:{table_name}")
        # Drain the scan completely while the connection is still open;
        # table.scan() is lazy and would fail once the socket is closed.
        rows = [
            (
                key.decode('utf-8'),
                float(data.get(b'transaction_data:totalAmount', 0)),
            )
            for key, data in table.scan()
        ]
    finally:
        # Always release the Thrift socket, even if the scan or parsing fails.
        connection.close()

    return spark.createDataFrame(rows, schema)

In [7]:
# Where to read from: Thrift gateway address, then the HBase namespace/table.
thrift_server, port = "hbase-thrift", 9090
namespace, table_name = "streaming", "events"

# Pull the table into a Spark DataFrame for the SQL cells below.
df = fetch_hbase_data(
    thrift_server=thrift_server,
    port=port,
    namespace=namespace,
    table_name=table_name,
)


In [8]:
# Expose the DataFrame to Spark SQL under the name "streaming_data";
# re-running this cell simply replaces the existing view.
df.createOrReplaceTempView("streaming_data")


In [9]:
# Top-5 products by revenue: sum totalAmount per productId over the
# "streaming_data" view, sort descending, and keep the first five rows.
result_df = spark.sql("""
SELECT 
    productId, 
    SUM(totalAmount) as total_sales 
FROM 
    streaming_data 
GROUP BY 
    productId 
ORDER BY 
    total_sales DESC 
LIMIT 5
""")


In [10]:
# Run the query and print the resulting rows as a text table.
result_df.show()


+---------+-----------+
|productId|total_sales|
+---------+-----------+
|    55839|     494.49|
|    34219|     316.83|
|    89559|     313.68|
|    61172|     292.36|
|    57502|      195.9|
+---------+-----------+

