In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, round

In [2]:
# Create the notebook's SparkSession, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("CIC Collection Analysis")
    .getOrCreate()
)


In [3]:
# Dataset location.
# Set the CIC_COLLECTION_PATH environment variable to point the notebook at a
# different copy of the parquet data; when it is unset, the original local
# path below is used so existing runs behave exactly as before.
path = os.environ.get(
    "CIC_COLLECTION_PATH",
    r"C:\Users\sayed\Desktop\L&T-Project\Vigilix\data\raw\cic-collection.parquet\cic-collection.parquet",
)

print("🔹 Loading dataset...")
df = spark.read.parquet(path)
# df.count() is a Spark action, so this line runs a job over the data just to
# report the row count; the column count comes from the schema.
print(f"✅ Loaded dataset with {df.count():,} rows and {len(df.columns)} columns")

🔹 Loading dataset...
✅ Loaded dataset with 9,167,581 rows and 59 columns


In [4]:
# List every column name of the loaded DataFrame under a short header.
print("\n🔹 Columns:", df.columns, sep="\n")


🔹 Columns:
['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Fwd Packets Length Total', 'Bwd Packets Length Total', 'Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count', 'URG Flag Count', 'Avg Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init Fwd Win Bytes', 'Init Bwd Win Bytes', 'Fwd Act Data Packets', 'Fwd Seg Size Min'

In [5]:
# Per-label row counts together with each label's share of the whole dataset.
print("\n🔹 Label distribution (%):")

# Total number of rows; used as the denominator for the percentage column.
total_count = df.count()

# Column expressions for the aggregation: raw count, and count as a
# percentage of all rows rounded to 4 decimal places.
label_count = count("*").alias("count")
label_percentage = round((count("*") / total_count) * 100, 4).alias("percentage")

label_distribution = (
    df.groupBy("Label")
    .agg(label_count, label_percentage)
    .orderBy(col("count").desc())
)

# Print up to 50 labels, most frequent first, without truncating label text.
label_distribution.show(n=50, truncate=False)


🔹 Label distribution (%):
+--------------------+-------+----------+
|Label               |count  |percentage|
+--------------------+-------+----------+
|Benign              |7186189|78.387    |
|DDoS-LOIC-HTTP      |575364 |6.2761    |
|DoS-Hulk            |318740 |3.4768    |
|DDoS-HOIC           |198861 |2.1692    |
|Botnet              |145968 |1.5922    |
|DDoS                |128062 |1.3969    |
|DDoS-NTP            |121328 |1.3234    |
|DDoS-TFTP           |98833  |1.0781    |
|Bruteforce-SSH      |97260  |1.0609    |
|Infiltration        |94857  |1.0347    |
|DoS-Goldeneye       |52324  |0.5708    |
|DDoS-Syn            |47757  |0.5209    |
|DDoS-UDP            |28863  |0.3148    |
|DoS-Slowloris       |15243  |0.1663    |
|DDoS-MSSQL          |11784  |0.1285    |
|DDoS-UDPLag         |8452   |0.0922    |
|Bruteforce-FTP      |5984   |0.0653    |
|DoS-Slowhttptest    |5271   |0.0575    |
|DDoS-Ddossim        |5115   |0.0558    |
|DDoS-DNS            |3668   |0.04      |
|DoS-Sl