In [2]:
import findspark
findspark.init()
import pandas as pd
import time
import psutil
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType, col
from pyspark.sql.types import StructType, StructField, StringType
from cryptography.fernet import Fernet
# Create SparkSession
spark = SparkSession.builder.appName("DatasetEncryption").getOrCreate()
# Read the CSV file
df = spark.read.csv(r"C:\Users\91974\Documents\Bitcoin_tweets.csv", header=True, inferSchema=True)
# Choose the number of partitions
partitions = 4
df = df.repartition(partitions)
# Define the AES encryption function
@pandas_udf(returnType=StructType([
    StructField("key", StringType())
] + [StructField(c, StringType()) for c in df.columns] + [
    StructField("encrypted_" + c, StringType()) for c in df.columns
]), functionType=PandasUDFType.GROUPED_MAP)
def aes_encrypt(df: pd.DataFrame) -> pd.DataFrame:
    # Generate a Fernet key
    key = Fernet.generate_key()
    # Create a Fernet object with the key
    f = Fernet(key)
    # Encrypt each value in the DataFrame using the Fernet object
    start_time = time.time()
    encrypted_cols = {}
    for c in df.columns:
        if c != "key":
            encrypted_cols["encrypted_" + c] = df[c].apply(lambda v: f.encrypt(v.encode()).decode() if v is not None else None)
    end_time = time.time()
    encrypted_df = pd.concat([df, pd.DataFrame(encrypted_cols)], axis=1)
    encrypted_df["key"] = key.decode()
    # Calculate CPU usage and memory usage
    cpu_usage = psutil.cpu_percent()
    memory_usage = psutil.virtual_memory().percent
    # Return the encrypted DataFrame
    print(f"Encryption time: {end_time - start_time:.4f} seconds")
    print(f"CPU usage: {cpu_usage:.2f}%")
    print(f"Memory usage: {memory_usage:.2f}%")
    return encrypted_df
# Define the AES decryption function to decrypt all columns
@pandas_udf(returnType=StructType([
    StructField("key", StringType())
] + [StructField("decrypted_" + c, StringType()) for c in df.columns
]), functionType=PandasUDFType.GROUPED_MAP)
def aes_decrypt(df: pd.DataFrame) -> pd.DataFrame:
    # Get the Fernet key from the first row of the DataFrame
    key = df.iloc[0]["key"].encode()
    # Create a Fernet object with the key
    f = Fernet(key)
    # Decrypt each value in the DataFrame using the Fernet object
    start_time = time.time()
    for c in df.columns:
        if c.startswith("encrypted_"):
            df["decrypted_" + c.replace("encrypted_", "")] = df[c].apply(lambda v: f.decrypt(v.encode()).decode() if v is not None else None)
    end_time = time.time()
    # Calculate CPU usage and memory usage
    cpu_usage = psutil.cpu_percent()
    memory_usage = psutil.virtual_memory().percent
    print(f"Decryption time: {end_time - start_time:.4f} seconds")
    print(f"CPU usage: {cpu_usage:.2f}%")
    print(f"Memory usage: {memory_usage:.2f}%")
    # Return the decrypted DataFrame
    decrypted_df = df[[c for c in df.columns if c.startswith("decrypted_") or c == "key"]]
    return decrypted_df

# Encrypt the dataset using AES
start_time = time.time()
df_encrypted = df.groupBy().apply(aes_encrypt)
end_time = time.time()
encryption_time = end_time - start_time
print(f"\nAverage encryption time: {encryption_time/partitions:.4f} seconds")
print(f"Overall encryption time: {encryption_time:.4f} seconds\n")
df_encrypted.show()

# Decrypt the dataset using AES
start_time = time.time()
df_decrypted = df_encrypted.groupBy().apply(aes_decrypt)
end_time = time.time()
decryption_time = end_time - start_time
print(f"\nAverage decryption time: {decryption_time/partitions:.4f} seconds")
print(f"Overall decryption time: {decryption_time:.4f} seconds\n")
df_decrypted.show()





Average encryption time: 0.0215 seconds
Overall encryption time: 0.0861 seconds

+--------------------+--------------------+--------------------+--------------------+-----------------------+
|                 key|           user_name|       user_location| encrypted_user_name|encrypted_user_location|
+--------------------+--------------------+--------------------+--------------------+-----------------------+
|gKVu1XamUOvdL8UfW...|                 Omz|     Los Angeles, CA|gAAAAABkSpL0TAVYZ...|   gAAAAABkSpNDb5ZRl...|
|gKVu1XamUOvdL8UfW...|                 Omz|     Los Angeles, CA|gAAAAABkSpL0dCL25...|   gAAAAABkSpNDlueOo...|
|gKVu1XamUOvdL8UfW...|           123TRADER|                  NL|gAAAAABkSpL0R2h68...|   gAAAAABkSpND5dcIW...|
|gKVu1XamUOvdL8UfW...|        CryptoCloaks|Printer Bed, Minn...|gAAAAABkSpL0dkmK2...|   gAAAAABkSpNDQL0_3...|
|gKVu1XamUOvdL8UfW...|          Mark Hendy|              Online|gAAAAABkSpL0winJi...|   gAAAAABkSpNDmC258...|
|gKVu1XamUOvdL8UfW...|Orlando Alzugara