In [None]:
import pandas as pd
import os
import time
import psutil
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType, col
from pyspark.sql.types import StructType, StructField, StringType

import base64

# Start (or reuse) the Spark session that runs the encryption job.
spark = (
    SparkSession.builder
    .appName("DatasetEncryption")
    .getOrCreate()
)

# Number of partitions the input rows are redistributed into.
partitions = 4

# Load the tweets CSV (first row is the header, column types are inferred)
# and spread the rows across the chosen number of partitions.
df = (
    spark.read
    .csv(r"C:\Users\91974\Documents\Bitcoin_tweets.csv", header=True, inferSchema=True)
    .repartition(partitions)
)

# Define the DES encryption function
@pandas_udf(returnType=StructType([
    StructField("key", StringType()),
    StructField("iv", StringType())
] + [
    StructField(c, StringType()) for c in df.columns
] + [
    StructField("encrypted_" + c, StringType()) for c in df.columns
]), functionType=PandasUDFType.GROUPED_MAP)
def des_encrypt(df: pd.DataFrame) -> pd.DataFrame:
    """Encrypt every column of one group with DES in CBC mode.

    A fresh random 8-byte key and IV are generated per call. Each non-missing
    cell is converted to text, PKCS#7-padded to the 8-byte DES block size,
    encrypted and base64-encoded into a new ``encrypted_<column>`` column.

    Args:
        df: The rows of one group, as handed over by Spark.

    Returns:
        The same frame with the original columns rendered as text (missing
        values stay ``None``), one ``encrypted_<column>`` per original column,
        and ``key`` / ``iv`` columns holding the base64-encoded key and IV.

    Side effects:
        Prints the encryption time plus CPU and memory usage to stdout.
    """
    from Crypto.Cipher import DES
    from Crypto.Util.Padding import pad
    import base64

    # NOTE(review): DES (56-bit key) is not considered secure, and the key/IV
    # are emitted next to the ciphertext in every row. Fine for benchmarking;
    # do not use this to protect real data.
    key = os.urandom(8)
    iv = os.urandom(8)
    cipher = DES.new(key, DES.MODE_CBC, iv)

    def to_text(value):
        # The return schema declares every column as StringType, and
        # inferSchema=True can yield numeric/boolean/timestamp columns, so
        # normalise to str. pd.isna also catches NaN/NaT, not just None.
        if pd.isna(value):
            return None
        return str(value)

    def pad_and_encrypt(value):
        if value is None:
            return None
        padded_data = pad(value.encode(), 8)
        # The single CBC cipher object is stateful: every call continues the
        # chain from the previous ciphertext block. Decryption must therefore
        # process the values in exactly this order (column by column, row by
        # row) with one cipher object.
        return base64.b64encode(cipher.encrypt(padded_data)).decode()

    # Snapshot the column names before new columns are appended.
    source_columns = list(df.columns)
    for name in source_columns:
        df[name] = df[name].map(to_text)

    # Prime psutil: with no interval, cpu_percent() reports usage since the
    # previous call, so the reading taken below covers the encryption loop.
    psutil.cpu_percent()

    # Encrypt all columns
    start_time = time.time()
    for name in source_columns:
        df["encrypted_" + name] = df[name].apply(pad_and_encrypt)
    end_time = time.time()

    df["key"] = base64.b64encode(key).decode()
    df["iv"] = base64.b64encode(iv).decode()

    cpu_usage = psutil.cpu_percent()
    memory_usage = psutil.virtual_memory().percent

    print(f"Encryption time: {end_time - start_time:.4f} seconds")
    print(f"CPU usage: {cpu_usage:.2f}%")
    print(f"Memory usage: {memory_usage:.2f}%")

    return df

# Define the DES decryption function to decrypt all columns
@pandas_udf(returnType=StructType([
    StructField("key", StringType()),
    StructField("iv", StringType())
] + [StructField("decrypted_" + c, StringType()) for c in df.columns
]), functionType=PandasUDFType.GROUPED_MAP)
def des_decrypt(df: pd.DataFrame) -> pd.DataFrame:
    """Decrypt every ``encrypted_<column>`` of one group with DES in CBC mode.

    The base64-encoded key and IV are read from the first row's ``key`` and
    ``iv`` columns. Each non-missing ciphertext cell is base64-decoded,
    decrypted, unpadded and decoded back to text.

    Args:
        df: The rows of one group, containing ``key``, ``iv`` and the
            ``encrypted_<column>`` columns.

    Returns:
        A frame holding ``key``, ``iv`` and one ``decrypted_<column>`` per
        encrypted column.

    Side effects:
        Prints the decryption time plus CPU and memory usage to stdout.
    """
    from Crypto.Cipher import DES
    from Crypto.Util.Padding import unpad

    encrypted_prefix = "encrypted_"
    decrypted_prefix = "decrypted_"
    encrypted_columns = [c for c in df.columns if c.startswith(encrypted_prefix)]

    if df.empty:
        # No row to read the key/IV from; return an empty frame that still
        # carries the expected output columns instead of raising IndexError.
        return pd.DataFrame(columns=["key", "iv"] + [
            decrypted_prefix + c[len(encrypted_prefix):] for c in encrypted_columns
        ])

    # Get the DES key and IV from the first row of the DataFrame
    first_row = df.iloc[0]
    key = base64.b64decode(first_row["key"])  # Decode the key
    iv = base64.b64decode(first_row["iv"])    # Decode the IV

    # Create a DES object with the key
    cipher = DES.new(key, DES.MODE_CBC, iv)

    def decrypt_and_unpad(value):
        # pd.isna covers NaN as well as None for missing ciphertext cells.
        if pd.isna(value):
            return None
        # The cipher object is stateful (CBC chaining carries over between
        # calls), so values must be visited in the same order in which they
        # were encrypted: column by column, row by row.
        return unpad(cipher.decrypt(base64.b64decode(value)), 8).decode()

    # Prime psutil: with no interval, cpu_percent() reports usage since the
    # previous call, so the reading taken below covers the decryption loop.
    psutil.cpu_percent()

    # Decrypt each value in the DataFrame using the DES object
    start_time = time.time()
    for c in encrypted_columns:
        # Strip only the leading prefix; str.replace would also remove any
        # later occurrence of "encrypted_" inside the column name.
        df[decrypted_prefix + c[len(encrypted_prefix):]] = df[c].apply(decrypt_and_unpad)
    end_time = time.time()

    # Return the decrypted DataFrame
    decrypted_df = df[[
        c for c in df.columns
        if c.startswith(decrypted_prefix) or c == "key" or c == "iv"
    ]]

    cpu_usage = psutil.cpu_percent()
    memory_usage = psutil.virtual_memory().percent

    print(f"Decryption time: {end_time - start_time:.4f} seconds")
    print(f"CPU usage: {cpu_usage:.2f}%")
    print(f"Memory usage: {memory_usage:.2f}%")
    return decrypted_df


# Encrypt the dataset using DES
# Spark transformations are lazy: groupBy().apply(...) only builds a plan.
# The result is cached and forced with count() inside the timed region so the
# timer measures the actual encryption. Caching also keeps show() and the
# decryption stage from re-running the UDF, which would draw a new random key.
start_time = time.time()
df_encrypted = df.groupBy().apply(des_encrypt).cache()
df_encrypted.count()
end_time = time.time()
encryption_time = end_time - start_time
# NOTE(review): groupBy() without keys appears to hand all rows to the UDF as
# a single group, so this per-partition "average" is nominal - confirm.
print(f"\nAverage encryption time: {encryption_time / partitions:.4f} seconds")
print(f"Overall encryption time: {encryption_time:.4f} seconds\n")
df_encrypted.show()

# Decrypt the dataset using DES
# Same pattern: force evaluation inside the timed region.
start_time = time.time()
df_decrypted = df_encrypted.groupBy().apply(des_decrypt).cache()
df_decrypted.count()
end_time = time.time()
decryption_time = end_time - start_time
print(f"\nAverage decryption time: {decryption_time / partitions:.4f} seconds")
print(f"Overall decryption time: {decryption_time:.4f} seconds\n")
df_decrypted.show()


Average encryption time: 0.0060 seconds
Overall encryption time: 0.0240 seconds

+------------+------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------------+
|         key|          iv|           user_name|       user_location|    user_description| encrypted_user_name|encrypted_user_location|encrypted_user_description|
+------------+------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------------+
|UoBFH+0+5Zk=|aRHnDO7QmPA=|         Muhib Rabby|                null|                null|ihQpowIR92Cymw2bN...|                   null|                      null|
|UoBFH+0+5Zk=|aRHnDO7QmPA=|I think im 18 yea...|                null|                null|NQPGhDhihzQPmvNCj...|                   null|                      null|
|UoBFH+0+5Zk=|aRHnDO7QmPA=|        Masir Ahamed|                null|                null|ShiXTQcxt9Ifj