In [2]:
import pandas as pd
import os
import time
import psutil
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType, col
from pyspark.sql.types import StructType, StructField, StringType
import base64

# Create SparkSession
spark = SparkSession.builder.appName("DatasetEncryption").getOrCreate()
# Read the CSV file
df = spark.read.csv(r"C:\Users\91974\Documents\Bitcoin_tweets.csv", header=True, inferSchema=True)
# Choose the number of partitions
partitions = 4
df = df.repartition(partitions)

# Define the Blowfish encryption function
@pandas_udf(returnType=StructType([
    StructField("key", StringType()),
    StructField("iv", StringType())
] + [
    StructField("encrypted_" + c, StringType()) for c in df.columns
]), functionType=PandasUDFType.GROUPED_MAP)
def blowfish_encrypt(df: pd.DataFrame) -> pd.DataFrame:
    from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
    from cryptography.hazmat.primitives import padding
    import base64

    key = os.urandom(32)
    iv = os.urandom(8)
    cipher = Cipher(algorithms.Blowfish(key), modes.CBC(iv))

    def pad_and_encrypt(value):
        if value is None:
            return None
        padder = padding.PKCS7(64).padder()
        padded_data = padder.update(value.encode()) + padder.finalize()
        encryptor = cipher.encryptor()
        return base64.b64encode(encryptor.update(padded_data) + encryptor.finalize()).decode()

    # Encrypt all original columns
    start_time = time.time()
    for col in df.columns:
        if not col.startswith("encrypted_") and col not in ["key", "iv"]:
            encrypted_col = "encrypted_" + col
            df[encrypted_col] = df[col].apply(lambda x: pad_and_encrypt(x))
    end_time = time.time()

    df["key"] = base64.b64encode(key).decode()
    df["iv"] = base64.b64encode(iv).decode()

    cpu_usage = psutil.cpu_percent()
    memory_usage = psutil.virtual_memory().percent

    print(f"Encryption time: {end_time - start_time:.4f} seconds")
    print(f"CPU usage: {cpu_usage:.2f}%")
    print(f"Memory usage: {memory_usage:.2f}%")

    # Return only the encrypted columns, key, and IV
    encrypted_df = df[["key", "iv"] + ["encrypted_" + c for c in df.columns if not c.startswith("encrypted_") and c not in ["key", "iv"]]]
    return encrypted_df

# Define the Blowfish decryption function to decrypt all columns
@pandas_udf(returnType=StructType([
    StructField("decrypted_" + c, StringType()) for c in df.columns
]), functionType=PandasUDFType.GROUPED_MAP)
def blowfish_decrypt(df: pd.DataFrame) -> pd.DataFrame:
    from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
    from cryptography.hazmat.primitives import padding

    # Get the Blowfish key and IV from the first row of the DataFrame
    key = base64.b64decode(df.iloc[0]["key"])  # Decode the key
    iv = base64.b64decode(df.iloc[0]["iv"])    # Decode the IV

    # Create a Blowfish cipher object with the key
    cipher = Cipher(algorithms.Blowfish(key), modes.CBC(iv))
    def unpad_and_decrypt(value):
        if value is None:
            return None
        decryptor = cipher.decryptor()
        decrypted_data = decryptor.update(base64.b64decode(value.encode())) + decryptor.finalize()
        unpadder = padding.PKCS7(64).unpadder()
        return unpadder.update(decrypted_data) + unpadder.finalize()

    for c in df.columns:
        if c.startswith("encrypted_"):
            df["decrypted_" + c.replace("encrypted_", "")] = df[c].apply(lambda x: unpad_and_decrypt(x))
    end_time = time.time()

    cpu_usage = psutil.cpu_percent()
    memory_usage = psutil.virtual_memory().percent

    print(f"Decryption time: {end_time - start_time:.4f} seconds")
    print(f"CPU usage: {cpu_usage:.2f}%")
    print(f"Memory usage: {memory_usage:.2f}%")

    # Return only the decrypted columns
    decrypted_df = df[[c for c in df.columns if c.startswith("decrypted_")]]
    return decrypted_df

# Encrypt the dataset using Blowfish
start_time = time.time()
df_encrypted = df.groupBy().apply(blowfish_encrypt)
end_time = time.time()
encryption_time = end_time - start_time
print(f"\nAverage encryption time: {encryption_time / partitions:.4f} seconds")
print(f"Overall encryption time: {encryption_time:.4f} seconds\n")
df_encrypted.show()

# Decrypt the dataset using Blowfish
start_time = time.time()
df_decrypted = df_encrypted.groupBy().apply(blowfish_decrypt)
end_time = time.time()
decryption_time = end_time - start_time
print(f"\nAverage decryption time: {decryption_time / partitions:.4f} seconds")
print(f"Overall decryption time: {decryption_time:.4f} seconds\n")
df_decrypted.show()



Average encryption time: 0.0147 seconds
Overall encryption time: 0.0590 seconds

+--------------------+------------+--------------------+----------------------+--------------------+--------------------------+----------------------+-----------------------------+----------------+----------------------+-----------------------+--------------------+--------------------+--------------------+----------------------+------------------------+--------------------+----------------------+
|                 key|          iv|encrypted_product_id|encrypted_product_name|  encrypted_category|encrypted_discounted_price|encrypted_actual_price|encrypted_discount_percentage|encrypted_rating|encrypted_rating_count|encrypted_about_product|   encrypted_user_id| encrypted_user_name| encrypted_review_id|encrypted_review_title|encrypted_review_content|  encrypted_img_link|encrypted_product_link|
+--------------------+------------+--------------------+----------------------+--------------------+------------------

+--------------------+----------------------+--------------------+--------------------------+----------------------+-----------------------------+----------------+----------------------+-----------------------+--------------------+--------------------+--------------------+----------------------+------------------------+--------------------+----------------------+
|decrypted_product_id|decrypted_product_name|  decrypted_category|decrypted_discounted_price|decrypted_actual_price|decrypted_discount_percentage|decrypted_rating|decrypted_rating_count|decrypted_about_product|   decrypted_user_id| decrypted_user_name| decrypted_review_id|decrypted_review_title|decrypted_review_content|  decrypted_img_link|decrypted_product_link|
+--------------------+----------------------+--------------------+--------------------------+----------------------+-----------------------------+----------------+----------------------+-----------------------+--------------------+--------------------+----------------