# In [None]:
# ============================================================
# Stock Price Big Data Analysis with PySpark
# Analysis of stock market data stored on HDFS
# ============================================================

# Cell 1: Import libraries and initialize the Spark session
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Create the Spark session connected to the standalone Spark master.
# Executor and shuffle settings are kept in one table and applied in order.
spark_settings = {
    "spark.executor.memory": "2g",
    "spark.executor.cores": "2",
    "spark.sql.shuffle.partitions": "8",
}
session_builder = SparkSession.builder.appName("Stock Price Big Data Analysis")
session_builder = session_builder.master("spark://spark-master:7077")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
# getOrCreate() reuses an already-running session if one exists.
spark = session_builder.getOrCreate()

# Report what we are connected to.
banner = "=" * 60
spark_context = spark.sparkContext
print(banner)
print("‚úÖ Spark Session kh·ªüi t·∫°o th√†nh c√¥ng!")
print(f"üìå Spark Version: {spark.version}")
print(f"üìå Spark Master: {spark_context.master}")
print(f"üìå Application Name: {spark_context.appName}")
print(banner)

# ============================================================
# Cell 2: Read the stock data from HDFS
print("\nüìÇ ƒêang ƒë·ªçc d·ªØ li·ªáu t·ª´ HDFS...")

# Explicit schema for the stock CSV files; every column is nullable.
price_fields = [
    StructField(price_name, DoubleType(), True)
    for price_name in ("Open", "High", "Low", "Close")
]
schema = StructType(
    [StructField("Date", DateType(), True)]
    + price_fields
    + [StructField("Volume", LongType(), True)]
)

# Read every CSV under hdfs://namenode:9000/datack/ with the schema above
# (schema inference is switched off because the schema is supplied).
# Note from the original author: this path matches the layout described in
# the README.
csv_reader = spark.read.options(header="true", inferSchema="false").schema(schema)
raw_prices = csv_reader.csv("hdfs://namenode:9000/datack/*.csv")

# Derive the ticker symbol from the source file name.
# NOTE(review): regexp_extract returns an empty string when the pattern does
# not match, so files named differently would all share Ticker == "" and
# would not be removed by a later dropna() — confirm the file naming.
ticker_from_path = regexp_extract(
    input_file_name(), r"stock_market_data-([A-Z]+)_", 1
)
df_stock = raw_prices.withColumn("Ticker", ticker_from_path)

print(f"‚úÖ ƒê√£ ƒë·ªçc d·ªØ li·ªáu t·ª´ HDFS th√†nh c√¥ng!")
# Each count below triggers its own Spark job over the CSV input.
record_total = df_stock.count()
print(f"üìä T·ªïng s·ªë records: {record_total:,}")
column_total = len(df_stock.columns)
print(f"üìä S·ªë l∆∞·ª£ng c·ªôt: {column_total}")
company_total = df_stock.select("Ticker").distinct().count()
print(f"üìä S·ªë l∆∞·ª£ng c√¥ng ty: {company_total}")

# Show the schema.
print("\nüìã Schema c·ªßa d·ªØ li·ªáu:")
df_stock.printSchema()

# Show the first 10 rows without truncating column values.
print("\nüìã 10 d√≤ng d·ªØ li·ªáu ƒë·∫ßu ti√™n:")
df_stock.show(10, truncate=False)

# ============================================================
# Cell 3: Clean the raw data and derive the analysis columns
print("\nüîß B·∫Øt ƒë·∫ßu l√†m s·∫°ch d·ªØ li·ªáu...")

# Drop every record that has a null in any column.
df_clean = df_stock.dropna()

# Derived columns used by the later cells.
df_clean = df_clean.withColumn("Price_Range", col("High") - col("Low"))
# Intraday return in percent. The division is guarded: with
# spark.sql.ansi.enabled=true an unguarded division by a zero Open raises
# DIVIDE_BY_ZERO and aborts the job. The guarded form yields NULL for those
# rows, which is also what non-ANSI mode produced, so results are unchanged
# where the original worked. These NULLs appear after dropna() and are
# ignored by avg().
df_clean = df_clean.withColumn(
    "Daily_Return",
    when(
        col("Open") != 0,
        (col("Close") - col("Open")) / col("Open") * 100,
    ),
)
df_clean = df_clean.withColumn("Year", year(col("Date")))
df_clean = df_clean.withColumn("Month", month(col("Date")))

# Cache the cleaned data; it is reused by every following cell.
df_clean.cache()

print(f"‚úÖ L√†m s·∫°ch d·ªØ li·ªáu ho√†n t·∫•t!")
# The first action on df_clean also materializes the cache.
print(f"üìä S·ªë records sau khi l√†m s·∫°ch: {df_clean.count():,}")

# Preview the processed data.
print("\nüìã D·ªØ li·ªáu sau khi x·ª≠ l√Ω:")
df_clean.select("Ticker", "Date", "Close", "Daily_Return", "Price_Range").show(10)

# ============================================================
# Cell 4: Basic statistical analysis
print("\nüìä PH√ÇN T√çCH TH·ªêNG K√ä C∆† B·∫¢N")
print("=" * 60)


def _top10_by_mean(value_column, mean_alias):
    """Return the 10 tickers in df_clean with the largest mean of value_column.

    The result has two columns, "Ticker" and mean_alias, sorted descending
    by mean_alias.
    """
    ticker_means = df_clean.groupBy("Ticker").agg(
        avg(value_column).alias(mean_alias)
    )
    return ticker_means.orderBy(desc(mean_alias)).limit(10)


# Descriptive statistics for the price and volume columns.
print("\n1Ô∏è‚É£ Th·ªëng k√™ m√¥ t·∫£ cho c√°c c·ªôt gi√°:")
described_columns = ["Open", "High", "Low", "Close", "Volume"]
df_clean.select(*described_columns).describe().show()

# Top 10 companies by average trading volume.
print("\n2Ô∏è‚É£ Top 10 c√¥ng ty c√≥ kh·ªëi l∆∞·ª£ng giao d·ªãch trung b√¨nh cao nh·∫•t:")
top_volume = _top10_by_mean("Volume", "Avg_Volume")
top_volume.show()

# Top 10 companies by average closing price.
print("\n3Ô∏è‚É£ Top 10 c√¥ng ty c√≥ gi√° ƒë√≥ng c·ª≠a trung b√¨nh cao nh·∫•t:")
top_price = _top10_by_mean("Close", "Avg_Close_Price")
top_price.show()

# Top 10 companies by average daily high-low range.
print("\n4Ô∏è‚É£ Top 10 c√¥ng ty c√≥ bi·∫øn ƒë·ªông gi√° cao nh·∫•t:")
top_volatility = _top10_by_mean("Price_Range", "Avg_Price_Range")
top_volatility.show()

# ============================================================
# Cell 5: Trend analysis over time
print("\nüìà PH√ÇN T√çCH XU H∆Ø·ªöNG THEO TH·ªúI GIAN")
print("=" * 60)

# Per-year statistics: mean close, mean volume and number of records.
print("\n1Ô∏è‚É£ Gi√° trung b√¨nh v√† kh·ªëi l∆∞·ª£ng giao d·ªãch theo nƒÉm:")
yearly_aggregates = [
    avg("Close").alias("Avg_Close"),
    avg("Volume").alias("Avg_Volume"),
    count("*").alias("Total_Records"),
]
records_by_year = df_clean.groupBy("Year")
yearly_stats = records_by_year.agg(*yearly_aggregates).orderBy("Year")
yearly_stats.show()

# Per-calendar-month statistics, pooled across all years.
print("\n2Ô∏è‚É£ Gi√° trung b√¨nh theo th√°ng (t·∫•t c·∫£ c√°c nƒÉm):")
monthly_aggregates = [
    avg("Close").alias("Avg_Close"),
    avg("Daily_Return").alias("Avg_Return"),
]
records_by_month = df_clean.groupBy("Month")
monthly_stats = records_by_month.agg(*monthly_aggregates).orderBy("Month")
monthly_stats.show()

# ============================================================
# Cell 6: Analysis of selected major stocks (AAPL, GOOGL, AMZN, MSFT, TSLA)
print("\nüéØ PH√ÇN T√çCH C√ÅC C·ªî PHI·∫æU QUAN TR·ªåNG")
print("=" * 60)

# Restrict the cleaned data to the major tickers.
major_stocks = ["AAPL", "GOOGL", "AMZN", "MSFT", "TSLA"]
df_major = df_clean.filter(col("Ticker").isin(major_stocks))

print(f"\nüìä Ph√¢n t√≠ch {len(major_stocks)} c·ªï phi·∫øu: {', '.join(major_stocks)}")


def _format_usd(value):
    """Format a price as "$x.xx", or "N/A" when the aggregate is NULL."""
    return "N/A" if value is None else f"${value:.2f}"


# One grouped aggregation computes all tickers in a single Spark job instead
# of one job per ticker. min/max/avg/stddev here are the Spark column
# functions from the star import, not the Python builtins.
stats_by_ticker = {
    row["Ticker"]: row
    for row in df_major.groupBy("Ticker")
    .agg(
        min("Close").alias("Min_Price"),
        max("Close").alias("Max_Price"),
        avg("Close").alias("Avg_Price"),
        stddev("Close").alias("Std_Price"),
    )
    .collect()
}

# Stand-in for tickers with no rows at all in df_clean.
_no_stats = {
    "Min_Price": None,
    "Max_Price": None,
    "Avg_Price": None,
    "Std_Price": None,
}

# Per-ticker report, printed in the order of major_stocks.
for ticker in major_stocks:
    # A ticker absent from the data, or with a single row (NULL stddev),
    # would previously raise TypeError when formatting None with ":.2f".
    stats = stats_by_ticker.get(ticker, _no_stats)

    print(f"\n{ticker}:")
    print(f"  - Gi√° th·∫•p nh·∫•t: {_format_usd(stats['Min_Price'])}")
    print(f"  - Gi√° cao nh·∫•t: {_format_usd(stats['Max_Price'])}")
    print(f"  - Gi√° trung b√¨nh: {_format_usd(stats['Avg_Price'])}")
    print(f"  - ƒê·ªô l·ªách chu·∫©n: {_format_usd(stats['Std_Price'])}")

# ============================================================
# Cell 7: Correlation analysis
print("\nüîó PH√ÇN T√çCH T∆Ø∆†NG QUAN")
print("=" * 60)

# Correlation between the price and volume columns.
print("\n1Ô∏è‚É£ Ma tr·∫≠n t∆∞∆°ng quan gi·ªØa c√°c bi·∫øn gi√°:")
correlation_cols = ["Open", "High", "Low", "Close", "Volume"]

# Fixed seed so the sampled rows, and therefore the printed matrix and the
# heatmap drawn from it later, are reproducible between runs rather than
# changing on every execution.
CORRELATION_SAMPLE_SEED = 42

# Pull a ~10% sample to the driver as a pandas DataFrame and compute the
# correlation there (fraction is a per-row probability, not an exact size).
df_sample = (
    df_clean.select(correlation_cols)
    .sample(fraction=0.1, seed=CORRELATION_SAMPLE_SEED)
    .toPandas()
)

correlation_matrix = df_sample.corr()
print("\n", correlation_matrix)

# ============================================================
# Cell 8: Visualization with Matplotlib
print("\nüìä T·∫†O VISUALIZATION")
print("=" * 60)

# Bring the subset needed for plotting to the driver as a pandas DataFrame:
# three tickers, rows from 2018 onwards, only the plotted columns.
compare_tickers = ["AAPL", "GOOGL", "MSFT"]
is_compared = col("Ticker").isin(compare_tickers)
is_recent = col("Year") >= 2018
df_viz = (
    df_clean.where(is_compared & is_recent)
    .select("Date", "Ticker", "Close")
    .toPandas()
)

# Order chronologically so each line is drawn left to right.
df_viz = df_viz.sort_values("Date")

# Plot 1: line chart comparing closing prices, one line per ticker.
plt.figure(figsize=(14, 6))
for ticker in compare_tickers:
    data = df_viz.loc[df_viz["Ticker"] == ticker]
    plt.plot(data["Date"], data["Close"], label=ticker, linewidth=2)

# NOTE(review): the title says 2018-2020 but the filter above has no upper
# bound on Year — confirm the data ends in 2020.
plt.title(
    "So s√°nh gi√° c·ªï phi·∫øu AAPL, GOOGL, MSFT (2018-2020)",
    fontsize=14,
    fontweight='bold',
)
plt.xlabel("Ng√†y", fontsize=12)
plt.ylabel("Gi√° ƒë√≥ng c·ª≠a ($)", fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig("/home/jovyan/work/stock_comparison.png", dpi=300)
print("‚úÖ ƒê√£ l∆∞u bi·ªÉu ƒë·ªì: stock_comparison.png")
plt.show()

# Plot 2: horizontal bar chart of the top-10 tickers by average volume.
# toPandas() collects the (at most 10-row) top_volume result to the driver.
df_volume = top_volume.toPandas()
plt.figure(figsize=(12, 6))
plt.barh(
    y=df_volume["Ticker"],
    width=df_volume["Avg_Volume"],
    color='steelblue',
)
plt.xlabel("Kh·ªëi l∆∞·ª£ng giao d·ªãch trung b√¨nh", fontsize=12)
plt.title(
    "Top 10 c√¥ng ty c√≥ kh·ªëi l∆∞·ª£ng giao d·ªãch cao nh·∫•t",
    fontsize=14,
    fontweight='bold',
)
plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig("/home/jovyan/work/top_volume.png", dpi=300)
print("‚úÖ ƒê√£ l∆∞u bi·ªÉu ƒë·ªì: top_volume.png")
plt.show()

# Plot 3: annotated heatmap of the correlation matrix computed earlier.
heatmap_options = {
    "annot": True,          # write each coefficient into its cell
    "cmap": 'coolwarm',
    "center": 0,            # center the diverging colormap on zero
    "square": True,
    "linewidths": 1,
    "cbar_kws": {"shrink": 0.8},
}
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title(
    "Ma tr·∫≠n t∆∞∆°ng quan gi·ªØa c√°c bi·∫øn",
    fontsize=14,
    fontweight='bold',
)
plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig("/home/jovyan/work/correlation_heatmap.png", dpi=300)
print("‚úÖ ƒê√£ l∆∞u bi·ªÉu ƒë·ªì: correlation_heatmap.png")
plt.show()

# ============================================================
# Cell 9: Save the analysis results to HDFS
print("\nüíæ L∆ØU K·∫æT QU·∫¢ PH√ÇN T√çCH V√ÄO HDFS")
print("=" * 60)

# (progress message, DataFrame to write, HDFS target directory), in write
# order: top companies by volume, yearly statistics, and the 2020 slice of
# the processed data.
parquet_exports = [
    (
        "\n1Ô∏è‚É£ L∆∞u top companies by volume...",
        top_volume,
        "hdfs://namenode:9000/output/top_volume",
    ),
    (
        "\n2Ô∏è‚É£ L∆∞u yearly statistics...",
        yearly_stats,
        "hdfs://namenode:9000/output/yearly_stats",
    ),
    (
        "\n3Ô∏è‚É£ L∆∞u processed data sample...",
        df_clean.filter(col("Year") == 2020),
        "hdfs://namenode:9000/output/processed_2020",
    ),
]

for step_message, frame, hdfs_target in parquet_exports:
    print(step_message)
    # "overwrite" replaces whatever already exists at the target path.
    frame.write.mode("overwrite").parquet(hdfs_target)
    print(f"‚úÖ ƒê√£ l∆∞u v√†o: {hdfs_target}")

# ============================================================
# Cell 10: Summary and cleanup
print("\n" + "=" * 60)
print("‚úÖ HO√ÄN TH√ÄNH PH√ÇN T√çCH D·ªÆ LI·ªÜU CH·ª®NG KHO√ÅN")
print("=" * 60)

print("\nüìä T√≥m t·∫Øt k·∫øt qu·∫£:")
# Collect all summary figures in one aggregation (one Spark job) instead of
# four separate actions. A global aggregate always returns exactly one row.
# countDistinct skips NULLs, which matches distinct().count() here because
# df_clean was built with dropna(). min/max are the Spark column functions
# from the star import, not the Python builtins.
summary = df_clean.agg(
    count("*").alias("total_records"),
    countDistinct("Ticker").alias("ticker_count"),
    min("Date").alias("first_date"),
    max("Date").alias("last_date"),
).first()
print(f"  ‚Ä¢ T·ªïng s·ªë records ƒë√£ x·ª≠ l√Ω: {summary['total_records']:,}")
print(f"  ‚Ä¢ S·ªë l∆∞·ª£ng c√¥ng ty: {summary['ticker_count']}")
print(f"  ‚Ä¢ Kho·∫£ng th·ªùi gian: {summary['first_date']} - {summary['last_date']}")
print(f"  ‚Ä¢ S·ªë bi·ªÉu ƒë·ªì ƒë√£ t·∫°o: 3")
print(f"  ‚Ä¢ S·ªë file output tr√™n HDFS: 3")

# Chart files written by the visualization cell.
print("\nüìÅ C√°c file ƒë√£ t·∫°o:")
print("  ‚Ä¢ /home/jovyan/work/stock_comparison.png")
print("  ‚Ä¢ /home/jovyan/work/top_volume.png")
print("  ‚Ä¢ /home/jovyan/work/correlation_heatmap.png")

# Parquet outputs written to HDFS by the previous cell.
print("\nüìÅ D·ªØ li·ªáu ƒë√£ l∆∞u tr√™n HDFS:")
print("  ‚Ä¢ hdfs://namenode:9000/output/top_volume")
print("  ‚Ä¢ hdfs://namenode:9000/output/yearly_stats")
print("  ‚Ä¢ hdfs://namenode:9000/output/processed_2020")

# The session is deliberately left running (see the final message), so the
# stop call stays disabled; enable it to release the cluster resources.
# spark.stop()
print("\nüéâ Ho√†n t·∫•t! Spark Session v·∫´n ƒëang ch·∫°y.")