In [1]:
import pandas as pd
import os
import time
import psutil
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType, col
from pyspark.sql.types import StructType, StructField, StringType
import base64

# Create SparkSession
spark = SparkSession.builder.appName("DatasetEncryption").getOrCreate()
# Read the CSV file
df = spark.read.csv(r"C:\Users\91974\Documents\Bitcoin_tweets.csv", header=True, inferSchema=True)
# Choose the number of partitions
partitions = 4
df = df.repartition(partitions)

# Import ChaCha20-Poly1305 related libraries
from cryptography.hazmat.primitives.ciphers.aead import ChaCha20Poly1305

# Define the ChaCha20-Poly1305 encryption function
@pandas_udf(returnType=StructType([
    StructField("key", StringType())
] + [
    StructField(c, StringType()) for c in df.columns
] + [
    StructField("nonce_" + c, StringType()) for c in df.columns
] + [
    StructField("encrypted_" + c, StringType()) for c in df.columns
]), functionType=PandasUDFType.GROUPED_MAP)
def chacha20_poly1305_encrypt(df: pd.DataFrame) -> pd.DataFrame:
    # Generate a ChaCha20-Poly1305 key
    key = os.urandom(32)
    # Generate a nonce
    nonce = os.urandom(12)
    # Create a ChaCha20-Poly1305 object with the key
    cipher = ChaCha20Poly1305(key)
    # Encrypt all columns
    start_time = time.time()
    for col in df.columns:
        nonce = os.urandom(12)
        df["encrypted_" + col] = df[col].apply(lambda x: base64.b64encode(cipher.encrypt(nonce, x.encode(), None)).decode() if x is not None else None)
        df["nonce_" + col] = nonce.hex()
    end_time = time.time()

    df["key"] = base64.b64encode(key).decode()

    
    
    cpu_usage = psutil.cpu_percent()
    memory_usage = psutil.virtual_memory().percent
    
    print(f"Encryption time: {end_time - start_time:.4f} seconds")
    print(f"CPU usage: {cpu_usage:.2f}%")
    print(f"Memory usage: {memory_usage:.2f}%")
    
    return df

# Define the ChaCha20-Poly1305 decryption function to decrypt all columns
@pandas_udf(returnType=StructType([
    StructField("key", StringType())
] + [
    StructField("nonce_" + c, StringType()) for c in df.columns
] + [
    StructField("decrypted_" + c, StringType()) for c in df.columns
]), functionType=PandasUDFType.GROUPED_MAP)
def chacha20_poly1305_decrypt(df: pd.DataFrame) -> pd.DataFrame:
    # Get the ChaCha20-Poly1305 key from the first row of the DataFrame
    key = base64.b64decode(df.iloc[0]["key"].encode())  # Decode the key


    # Create a ChaCha20-Poly1305 object with the key
    cipher = ChaCha20Poly1305(key)
    # Decrypt each value in the DataFrame using the ChaCha20-Poly1305 object
    start_time = time.time()
    for c in df.columns:
        if c.startswith("encrypted_"):
            nonce = bytes.fromhex(df["nonce_" + c.replace("encrypted_", "")].iloc[0])  # Decode the nonce
            df["decrypted_" + c.replace("encrypted_", "")] = df[c].apply(lambda v: cipher.decrypt(nonce, base64.b64decode(v), None).decode() if v is not None else None)
    end_time = time.time()

    decrypted_df = df[[c for c in df.columns if c.startswith("decrypted_") or c == "key" or c.startswith("nonce_")]]

    cpu_usage = psutil.cpu_percent()
    memory_usage = psutil.virtual_memory().percent

    print(f"Decryption time: {end_time - start_time:.4f} seconds")
    print(f"CPU usage: {cpu_usage:.2f}%")
    print(f"Memory usage: {memory_usage:.2f}%")

    return decrypted_df


# Encrypt the dataset using ChaCha20-Poly1305
start_time = time.time()
df_encrypted = df.groupBy().apply(chacha20_poly1305_encrypt)
end_time = time.time()
encryption_time = end_time - start_time
print(f"\nAverage encryption time: {encryption_time / partitions:.4f} seconds")
print(f"Overall encryption time: {encryption_time:.4f} seconds\n")
df_encrypted.show()

# Decrypt the dataset using ChaCha20-Poly1305
start_time = time.time()
df_decrypted = df_encrypted.groupBy().apply(chacha20_poly1305_decrypt)
end_time = time.time()
decryption_time = end_time - start_time
print(f"\nAverage decryption time: {decryption_time / partitions:.4f} seconds")
print(f"Overall decryption time: {decryption_time:.4f} seconds\n")
df_decrypted.show()






Average encryption time: 0.0133 seconds
Overall encryption time: 0.0533 seconds

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+
|                 key|           user_name|       user_location|     nonce_user_name| nonce_user_location| encrypted_user_name|encrypted_user_location|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+
|d/n823jMVV1CpJlFj...|                 Omz|     Los Angeles, CA|d04949b9cef86b79b...|3ba5e2de7a60982d7...|3la31/Z+UqnCY8bed...|   Doi5MJmv678SHeB1O...|
|d/n823jMVV1CpJlFj...|                 Omz|     Los Angeles, CA|d04949b9cef86b79b...|3ba5e2de7a60982d7...|3la31/Z+UqnCY8bed...|   Doi5MJmv678SHeB1O...|
|d/n823jMVV1CpJlFj...|           123TRADER|                  NL|d04949b9cef86b79b...|3ba5e2de7a60982d7...|oAn+H80kqi4Tev+Nx...|   DKuIx/4ep/FBycrrF...|
|d/n82