In [1]:
import os, shutil, glob, time
import pandas as pd
# Point Hadoop/Spark at the local Hadoop installation directory.
HADOOP_HOME = "D:/hadoop"
os.environ["HADOOP_HOME"] = HADOOP_HOME

# Append Hadoop's bin directory to PATH only when it is not already listed,
# so re-running this cell does not keep growing PATH with duplicate entries.
_hadoop_bin = HADOOP_HOME + "/bin"
_current_path = os.environ.get("PATH", "")
if _hadoop_bin not in _current_path.split(os.pathsep):
    os.environ["PATH"] = _current_path + os.pathsep + _hadoop_bin

# Make sure the checkpoint directory exists up front.
os.makedirs(HADOOP_HOME + "/checkpoint", exist_ok=True)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, from_unixtime
from pyspark.sql.types import StructType, IntegerType, DoubleType, TimestampType
import threading

In [3]:
# Directory configuration
output_dir = "D:/sensor-data/output"
checkpoint_dir = "D:/hadoop/checkpoint"
backup_dir = "D:/sensor-data/backup"

# Create each directory if it does not exist yet
for _required_dir in (output_dir, checkpoint_dir, backup_dir):
    os.makedirs(_required_dir, exist_ok=True)


def merge_csv_files_clean_headers(input_dir, output_file):
    """Merge every ``part-*`` CSV in ``input_dir`` into a single clean CSV.

    Each part file is read without treating any row as a header. Rows whose
    first field is the literal ``'sensor_id'`` (repeated header rows) are
    dropped, the remaining rows are concatenated, and the result is written to
    ``output_file`` with one header row:
    ``sensor_id, temperature, humidity, timestamp``.

    Part files are processed in sorted name order so the merged output is
    reproducible, and the parent directory of ``output_file`` is created when
    it does not exist.

    Args:
        input_dir: Directory that contains the ``part-*`` files.
        output_file: Path of the merged CSV file to write.

    Returns:
        None. Progress and problems are reported with ``print``. A part file
        that cannot be read is reported and skipped.

    Raises:
        ValueError: Raised by pandas when the merged data does not have exactly
            four columns, because the column rename then cannot be applied.
    """
    # glob returns matches in arbitrary order; sort for deterministic output.
    csv_files = sorted(glob.glob(os.path.join(input_dir, "part-*")))
    if not csv_files:
        print("‚ö†Ô∏è Kh√¥ng c√≥ file ƒë·ªÉ g·ªôp.")
        return

    dfs = []
    for file in csv_files:
        try:
            # header=None: header rows arrive as ordinary data rows
            df = pd.read_csv(file, header=None)
            # Drop repeated header rows (first field equals 'sensor_id')
            df = df[df[0] != 'sensor_id']
            dfs.append(df)
        except Exception as e:
            # Best effort: report the unreadable file and keep going
            print(f"‚ùå L·ªói ƒë·ªçc file {file}: {e}")

    if not dfs:
        print("‚ö†Ô∏è Kh√¥ng c√≥ n·ªôi dung h·ª£p l·ªá ƒë·ªÉ g·ªôp.")
        return

    merged_df = pd.concat(dfs, ignore_index=True)
    # Assign the canonical column names
    merged_df.columns = ['sensor_id', 'temperature', 'humidity', 'timestamp']

    # Create the destination directory so the write cannot fail on a missing folder
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    merged_df.to_csv(output_file, index=False)
    print(f"‚úÖ ƒê√£ g·ªôp v√† l√†m s·∫°ch th√†nh {output_file}")


# Helper that deletes the old output and checkpoint directories
def cleanup_dirs():
    """Remove the output and checkpoint directory trees, then print a notice.

    Deletion is best-effort: ``ignore_errors=True`` means a missing or
    undeletable path does not raise.
    """
    for stale_dir in (output_dir, checkpoint_dir):
        shutil.rmtree(stale_dir, ignore_errors=True)
    print("üßπ ƒê√£ xo√° output v√† checkpoint.")


In [4]:
# Create (or reuse) the SparkSession, requesting the Kafka connector package
spark = (
    SparkSession.builder
    .appName("KafkaSensorConsumerAuto")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2")
    .config("spark.hadoop.home.dir", "D:/hadoop")
    .getOrCreate()
)

# Keep Spark's console logging at WARN level
spark.sparkContext.setLogLevel("WARN")

# Schema of the JSON payload carried in each Kafka message
schema = StructType()
for _field_name, _field_type in (
    ('sensor_id', IntegerType()),
    ('temperature', DoubleType()),
    ('humidity', DoubleType()),
    ('timestamp', DoubleType()),
):
    # StructType.add appends the field to this schema object in place
    schema.add(_field_name, _field_type)

# Read the 'sensor-data' Kafka topic as a stream, starting from the latest offsets
df_raw = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', 'localhost:9092')
    .option('subscribe', 'sensor-data')
    .option('startingOffsets', 'latest')
    .load()
)

# Parse the message value as JSON with the schema above, flatten the struct,
# and replace the numeric timestamp with its from_unixtime string form
_json_payload = from_json(col("value"), schema).alias("data")
_timestamp_text = from_unixtime(col("timestamp")).cast("string")
df_parsed = (
    df_raw.selectExpr("CAST(value AS STRING)")
    .select(_json_payload)
    .select("data.*")
    .withColumn("timestamp", _timestamp_text)
)

# Write the parsed stream as CSV part files (with header rows) into output_dir,
# tracking progress in checkpoint_dir
query = (
    df_parsed.writeStream
    .format("csv")
    .option("path", output_dir)
    .option("checkpointLocation", checkpoint_dir)
    .option("header", True)
    .start()
)

try:
    # Let the stream run for at most 120 seconds (2 minutes)
    query.awaitTermination(120)
finally:
    # Always stop the query, even when the wait is interrupted (e.g. kernel
    # interrupt) or the stream fails; otherwise it would keep running in the
    # background of the kernel.
    query.stop()
print("üî¥ ƒê√£ d·ª´ng stream sau 2 ph√∫t.")



üî¥ ƒê√£ d·ª´ng stream sau 2 ph√∫t.


In [5]:
# Merge the part files in output_dir into one timestamped CSV under backup_dir
timestamp_str = time.strftime("%Y%m%d_%H%M%S")
merged_file_path = "{}/merged_{}.csv".format(backup_dir, timestamp_str)
merge_csv_files_clean_headers(input_dir=output_dir, output_file=merged_file_path)


‚úÖ ƒê√£ g·ªôp v√† l√†m s·∫°ch th√†nh D:/sensor-data/backup/merged_20250615_125608.csv


In [6]:
# Delete the stream's output and checkpoint directories.
# NOTE(review): this runs unconditionally, so the raw part files are removed even
# if the preceding merge step wrote nothing — confirm that is intended.
cleanup_dirs()

üßπ ƒê√£ xo√° output v√† checkpoint.
