In [1]:
import pandas as pd
import os
import time
import psutil
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType, col
from pyspark.sql.types import StructType, StructField, StringType
import base64

# Create SparkSession
spark = SparkSession.builder.appName("DatasetEncryption").getOrCreate()
# Read the CSV file
df = spark.read.csv(r"C:\Users\91974\Documents\Bitcoin_tweets.csv", header=True, inferSchema=True)
# Choose the number of partitions
partitions = 4
df = df.repartition(partitions)

# Import ChaCha20-Poly1305 related libraries
from cryptography.hazmat.primitives.ciphers.aead import ChaCha20Poly1305

# Define the ChaCha20-Poly1305 encryption function
@pandas_udf(returnType=StructType([
    StructField("key", StringType())
] + [
    StructField(c, StringType()) for c in df.columns
] + [
    StructField("nonce_" + c, StringType()) for c in df.columns
] + [
    StructField("encrypted_" + c, StringType()) for c in df.columns
]), functionType=PandasUDFType.GROUPED_MAP)
def chacha20_poly1305_encrypt(df: pd.DataFrame) -> pd.DataFrame:
    # Generate a ChaCha20-Poly1305 key
    key = os.urandom(32)
    # Generate a nonce
    nonce = os.urandom(12)
    # Create a ChaCha20-Poly1305 object with the key
    cipher = ChaCha20Poly1305(key)
    # Encrypt all columns
    start_time = time.time()
    for col in df.columns:
        nonce = os.urandom(12)
        df["encrypted_" + col] = df[col].apply(lambda x: base64.b64encode(cipher.encrypt(nonce, x.encode(), None)).decode() if x is not None else None)
        df["nonce_" + col] = nonce.hex()
    end_time = time.time()

    df["key"] = base64.b64encode(key).decode()

    
    
    cpu_usage = psutil.cpu_percent()
    memory_usage = psutil.virtual_memory().percent
    
    print(f"Encryption time: {end_time - start_time:.4f} seconds")
    print(f"CPU usage: {cpu_usage:.2f}%")
    print(f"Memory usage: {memory_usage:.2f}%")
    
    return df

# Define the ChaCha20-Poly1305 decryption function to decrypt all columns
@pandas_udf(returnType=StructType([
    StructField("key", StringType())
] + [
    StructField("nonce_" + c, StringType()) for c in df.columns
] + [
    StructField("decrypted_" + c, StringType()) for c in df.columns
]), functionType=PandasUDFType.GROUPED_MAP)
def chacha20_poly1305_decrypt(df: pd.DataFrame) -> pd.DataFrame:
    # Get the ChaCha20-Poly1305 key from the first row of the DataFrame
    key = base64.b64decode(df.iloc[0]["key"].encode())  # Decode the key


    # Create a ChaCha20-Poly1305 object with the key
    cipher = ChaCha20Poly1305(key)
    # Decrypt each value in the DataFrame using the ChaCha20-Poly1305 object
    start_time = time.time()
    for c in df.columns:
        if c.startswith("encrypted_"):
            nonce = bytes.fromhex(df["nonce_" + c.replace("encrypted_", "")].iloc[0])  # Decode the nonce
            df["decrypted_" + c.replace("encrypted_", "")] = df[c].apply(lambda v: cipher.decrypt(nonce, base64.b64decode(v), None).decode() if v is not None else None)
    end_time = time.time()

    decrypted_df = df[[c for c in df.columns if c.startswith("decrypted_") or c == "key" or c.startswith("nonce_")]]

    cpu_usage = psutil.cpu_percent()
    memory_usage = psutil.virtual_memory().percent

    print(f"Decryption time: {end_time - start_time:.4f} seconds")
    print(f"CPU usage: {cpu_usage:.2f}%")
    print(f"Memory usage: {memory_usage:.2f}%")

    return decrypted_df


# Encrypt the dataset using ChaCha20-Poly1305
start_time = time.time()
df_encrypted = df.groupBy().apply(chacha20_poly1305_encrypt)
end_time = time.time()
encryption_time = end_time - start_time
print(f"\nAverage encryption time: {encryption_time / partitions:.4f} seconds")
print(f"Overall encryption time: {encryption_time:.4f} seconds\n")
df_encrypted.show()

# Decrypt the dataset using ChaCha20-Poly1305
start_time = time.time()
df_decrypted = df_encrypted.groupBy().apply(chacha20_poly1305_decrypt)
end_time = time.time()
decryption_time = end_time - start_time
print(f"\nAverage decryption time: {decryption_time / partitions:.4f} seconds")
print(f"Overall decryption time: {decryption_time:.4f} seconds\n")
df_decrypted.show()






Average encryption time: 0.0443 seconds
Overall encryption time: 0.1771 seconds

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+-----------------------+--------------------------+
|                 key|           user_name|       user_location|    user_description|     nonce_user_name| nonce_user_location|nonce_user_description| encrypted_user_name|encrypted_user_location|encrypted_user_description|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+-----------------------+--------------------------+
|hWgch0tS2a0kVfVby...|         Muhib Rabby|                null|                null|c793fa5ca33411206...|a0d8e8228f2326b57...|  5482852ef099d61d2...|B+2en2t3CIrf2/do2...|                   null|                      null|
|hWgch0tS2a0kVfVby...|I th

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 50329)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "C:\Users\91974\anaconda3\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "C:\Users\91974\anaconda3\lib\socket.py", line 705, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\91974\anaconda3\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "C:\Users\91974\anaconda3\lib\site-packages\py4j\clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receivi

ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it