In [None]:
import os
import time
from io import StringIO

import gnupg
import kagglehub
import numpy as np
import pandas as pd

# Fetch the Enron email dataset via kagglehub and keep the directory path it
# returns, which later cells use to locate the CSV file
dataset_handle = "wcukierski/enron-email-dataset"
enron = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", enron)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/seed/.cache/kagglehub/datasets/wcukierski/enron-email-dataset/versions/2


In [None]:
# Initialize GPG
gpg = gnupg.GPG()

# Passphrase protecting the generated private key. It can be supplied through
# the GPG_PASSPHRASE environment variable; the fallback keeps the notebook
# runnable unattended and is only suitable for this throwaway benchmark key.
password = os.environ.get("GPG_PASSPHRASE", "password")

# Generate public and private keys
# (named key_input so the built-in input() is not shadowed in later cells)
key_input = gpg.gen_key_input(
    name_email="pgp@utep.com",
    passphrase=password,
)

key = gpg.gen_key(key_input)

# Stop here if no key was produced, rather than exporting and saving empty
# key files that would only fail later during encryption.
if not key.fingerprint:
    raise RuntimeError("GPG key generation failed")

public = gpg.export_keys(keyids=key.fingerprint, passphrase=password)
private = gpg.export_keys(keyids=key.fingerprint, secret=True, passphrase=password)

# Save keys to files
with open("public_key.asc", "w") as public_file:
    public_file.write(public)

with open("private_key.asc", "w") as private_file:
    private_file.write(private)

# The private key is sensitive: restrict the file to the current user
os.chmod("private_key.asc", 0o600)

In [None]:
# Load the Enron emails; reading is capped at 50k rows because the VM cannot
# hold the whole file in memory
emails_csv = f"{enron}/emails.csv"
data = pd.read_csv(emails_csv, nrows=50000)

In [None]:
# Preview the loaded emails (pandas abbreviates long frames in the display)
data

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...
...,...,...
49995,cash-m/sent_items/346.,Message-ID: <1815718.1075855358574.JavaMail.ev...
49996,cash-m/sent_items/347.,Message-ID: <15801367.1075855358596.JavaMail.e...
49997,cash-m/sent_items/348.,Message-ID: <23815150.1075855358619.JavaMail.e...
49998,cash-m/sent_items/349.,Message-ID: <23974470.1075855358641.JavaMail.e...


In [None]:
# GPG can't handle some characters in the Enron dataset, so the CSV text is
# encoded to UTF-8 bytes before being encrypted with GPG
serialized_data = data.to_csv(index=False).encode('utf-8')

# Encrypt the data (perf_counter is a monotonic clock meant for measuring intervals)
start = time.perf_counter()

encrypted_data = gpg.encrypt(serialized_data, recipients=["pgp@utep.com"])

encryption_time = time.perf_counter() - start

# gpg.encrypt reports failure on the result object instead of raising; without
# this check an empty file would be written and the timing would be meaningless
if not encrypted_data.ok:
    raise RuntimeError(f"GPG encryption failed: {encrypted_data.status}")

# Save .asc file with encrypted data (str() yields the ASCII-armored ciphertext)
with open('encrypted_data.asc', 'w') as f:
    f.write(str(encrypted_data))

# Total encryption time
print(f"Encryption completed in {round(encryption_time, 2)} seconds")

Encryption completed in 3.91 seconds


In [None]:
# Load encrypted data file
with open("encrypted_data.asc", "r") as f:
    encrypted_data = f.read()

# Decrypt data (perf_counter is a monotonic clock meant for measuring intervals)
start = time.perf_counter()

decrypted_data = gpg.decrypt(encrypted_data, passphrase=password)

decryption_time = time.perf_counter() - start

# gpg.decrypt reports failure on the result object instead of raising
if not decrypted_data.ok:
    raise RuntimeError(f"GPG decryption failed: {decrypted_data.status}")

# Create pandas dataframe from decrypted data.
# The plaintext was UTF-8 encoded before encryption, so decode the raw bytes as
# UTF-8 explicitly. str(decrypted_data) would decode with the GPG object's
# configured encoding (latin-1 unless changed) and corrupt non-ASCII characters.
df_decrypted = pd.read_csv(StringIO(decrypted_data.data.decode("utf-8")))

# Total decryption time
print(f"Decryption completed in {round(decryption_time, 2)} seconds")

Decryption completed in 2.81 seconds


In [None]:
# Preview the frame rebuilt from the decrypted CSV, for visual comparison with the original
df_decrypted

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...
...,...,...
49995,cash-m/sent_items/346.,Message-ID: <1815718.1075855358574.JavaMail.ev...
49996,cash-m/sent_items/347.,Message-ID: <15801367.1075855358596.JavaMail.e...
49997,cash-m/sent_items/348.,Message-ID: <23815150.1075855358619.JavaMail.e...
49998,cash-m/sent_items/349.,Message-ID: <23974470.1075855358641.JavaMail.e...


In [None]:
# Test for data loss

# A cell counts as preserved when both values are equal or both are missing.
# NaN never compares equal to itself, so missing pairs are matched explicitly
# instead of being counted as lost.
comparison = (data == df_decrypted) | (data.isna() & df_decrypted.isna())

# DataFrame.size is rows * columns (np.product was removed in NumPy 2.0)
total_cells = data.size
mismatched_cells = total_cells - comparison.values.sum()
percentage = (mismatched_cells / total_cells) * 100

print(f"Cell loss percentage in dataframe: {round(percentage, 3)}%")


Cell loss percentage in dataframe: 0.018%
