In [None]:
!pip install --upgrade --force-reinstall numpy pandas scipy matplotlib seaborn

In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, desc, when, split, hour
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Configure display.
# Statement order is kept as-is on purpose: the style reset runs before the
# inline backend is activated.
# Reset matplotlib's rcParams to the library's default style.
plt.style.use('default')
# Use seaborn's "husl" palette as the default colour cycle for later plots.
sns.set_palette("husl")
# Render figures inline in the notebook output (IPython magic, not plain Python).
%matplotlib inline

# Simple progress marker so the reader can see the setup cell finished.
print("📚 Libraries imported successfully!")


In [None]:
# Create the Spark session for this notebook, or reuse one that already
# exists (getOrCreate). Adaptive query execution and its partition
# coalescing are both switched on.
_adaptive_settings = {
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
}

_session_builder = SparkSession.builder.appName("CloudClickAnalysis-Jupyter")
for _setting_key, _setting_value in _adaptive_settings.items():
    _session_builder = _session_builder.config(_setting_key, _setting_value)

spark = _session_builder.getOrCreate()

# Report the session details, including the link to the Spark UI.
print(f"✅ Spark version: {spark.version}")
print(f"🌐 Spark UI: {spark.sparkContext.uiWebUrl}")
print(f"📊 Spark Context: {spark.sparkContext}")


In [None]:
# Configure paths (update bucket name as needed)
BUCKET_NAME = "spark-click-analysis-20250629-231200-unique"  # Replace with your bucket
input_path = f"gs://{BUCKET_NAME}/input/data.txt"
output_path = f"gs://{BUCKET_NAME}/output/"

print(f"📂 Input: {input_path}")
print(f"💾 Output: {output_path}")

# Load and parse the click data
print("📊 Loading click data from GCS...")

# Each input line is expected to look like "10-Jan 11:10 1001"
# (date, HH:MM time, user id). Read every line as a single string column
# named "value".
df_raw = spark.read.text(input_path)

# Split on runs of whitespace rather than a single literal space, so that
# double spaces, tabs or a trailing "\r" (CRLF files) do not shift the fields
# or leak into user_id.
# Lines with fewer than three fields yield null columns and are dropped by the
# filter below.
# NOTE(review): this relies on out-of-range array indexing returning null,
# i.e. ANSI mode being off - confirm for this cluster.
df_parsed = df_raw.select(
    split(col("value"), r"\s+").alias("parts")
).select(
    col("parts")[0].alias("date_part"),
    col("parts")[1].alias("time_part"),
    col("parts")[2].alias("user_id")
).filter(
    col("date_part").isNotNull() &
    col("time_part").isNotNull() &
    col("user_id").isNotNull()
)

# Extract the hour from the "HH:MM" time. A non-numeric hour casts to null and
# is later bucketed as "Unknown" rather than dropped.
# The frame is cached because it is reused by several actions (count, show and
# the aggregations that follow); without it each action re-reads the file
# from GCS.
df_clicks = df_parsed.withColumn(
    "hour",
    split(col("time_part"), ":")[0].cast("int")
).cache()

total_records = df_raw.count()
valid_records = df_clicks.count()

print(f"📈 Total records: {total_records}")
print(f"✅ Valid records: {valid_records}")
# Guard against an empty input file, which would otherwise raise
# ZeroDivisionError.
if total_records:
    print(f"📊 Success rate: {(valid_records/total_records)*100:.1f}%")
else:
    print("⚠️ Input is empty - no success rate to report.")

# Display sample data
print("📋 Sample Data:")
df_clicks.select("date_part", "time_part", "user_id", "hour").show(10, False)


In [None]:
# Time-based analysis (6-hour intervals).
# Each click is assigned to one of four day parts based on its hour; rows
# whose hour is null or outside 0-23 fall through to "Unknown".
df_with_intervals = df_clicks.withColumn(
    "time_interval",
    when((col("hour") >= 0) & (col("hour") < 6), "00-06 Night")
    .when((col("hour") >= 6) & (col("hour") < 12), "06-12 Morning")
    .when((col("hour") >= 12) & (col("hour") < 18), "12-18 Afternoon")
    .when((col("hour") >= 18) & (col("hour") < 24), "18-24 Evening")
    .otherwise("Unknown")
)

# Clicks per interval, busiest first. The secondary sort key makes the order
# (and therefore the reported peak) deterministic when counts tie.
interval_analysis = df_with_intervals.groupBy("time_interval") \
    .agg(count("*").alias("click_count")) \
    .orderBy(desc("click_count"), "time_interval")

print("📊 Time Interval Analysis:")
interval_analysis.show()

# User activity analysis: clicks per user, unsorted. Kept separate so the
# unique-user count below does not have to run a global sort.
user_click_counts = df_clicks.groupBy("user_id") \
    .agg(count("*").alias("click_count"))

# Same aggregation, most active users first, with user_id as a tie-breaker.
user_analysis = user_click_counts.orderBy(desc("click_count"), "user_id")

print("👥 Top User Activity:")
user_analysis.show(10)

# Generate summary. There are at most five interval rows, so collecting them
# to the driver is cheap.
interval_results = interval_analysis.collect()
total_clicks = sum(row.click_count for row in interval_results)
# With no valid clicks there is no peak; None avoids the IndexError that
# indexing an empty result would raise.
peak_interval = interval_results[0] if interval_results else None

print("\n🎯 ANALYSIS SUMMARY:")
print(f"📊 Total clicks: {total_clicks}")
print(f"👥 Unique users: {user_click_counts.count()}")
if peak_interval is not None:
    # Every group holds at least one click, so total_clicks > 0 here.
    print(f"🏆 Peak activity: {peak_interval['time_interval']} ({peak_interval['click_count']} clicks)")
    print(f"📈 Peak percentage: {(peak_interval['click_count']/total_clicks)*100:.1f}%")
else:
    print("⚠️ No valid clicks found - peak activity cannot be determined.")

print("\n✅ Interactive analysis ready! Modify and run cells to explore the data further.")
