In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.datasets import fetch_20newsgroups
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.primitives import padding as crypto_padding
import base64
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import string

# Fetch the NLTK resources used by the cleaning step below.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on NLTK >= 3.9,
# while older releases read 'punkt'. Requesting both keeps the notebook
# runnable on either; nltk.download() reports an unknown id and returns False
# rather than raising (confirm against the installed NLTK version).
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)


ModuleNotFoundError: No module named 'sklearn'

In [3]:
# Pull the complete 20 Newsgroups corpus, asking scikit-learn to strip the
# header, footer and quoted-reply sections from every post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# One row per post: the post body and its integer target id.
data = dict(text=newsgroups.data, label=newsgroups.target)
df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,\n\nI am sure some bashers of Pens fans are pr...,10
1,My brother is in the market for a high-perform...,3
2,\n\n\n\n\tFinally you said what you dream abou...,17
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3
4,1) I have an old Jasmine drive which I cann...,4
...,...,...
18841,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,13
18842,\nNot in isolated ground recepticles (usually ...,12
18843,I just installed a DX2-66 CPU in a clone mothe...,3
18844,\nWouldn't this require a hyper-sphere. In 3-...,1


In [4]:
# Stop-word lists keyed by language, filled on first use so the corpus reader
# is consulted once instead of once per document.
_STOP_WORD_CACHE = {}


def clean_text(text):
    """Normalise one document for embedding.

    Steps, in order: drop URL-like tokens, strip ASCII punctuation, lowercase,
    remove digit runs, collapse whitespace, tokenize, and drop English stop
    words.

    Args:
        text: Raw document string.

    Returns:
        The surviving tokens joined by single spaces.

    NOTE(review): punctuation is stripped *before* stop-word filtering, so
    contractions such as "wouldn't" become "wouldnt" and are not removed
    (visible in the sample output of this notebook). Left as-is to keep the
    published results comparable.
    """
    # Remove URLs. 'http\S+' already covers 'https...', so no separate
    # alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove punctuation and convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()

    # Remove digits and extra spaces
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text)

    # Remove stopwords (list loaded once, then reused for every document)
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    stop_words = _STOP_WORD_CACHE['english']

    word_tokens = word_tokenize(text)
    return ' '.join(word for word in word_tokens if word not in stop_words)

# Run the cleaner over every post and keep the result next to the original.
df['clean_text'] = df['text'].map(clean_text)
df

Unnamed: 0,text,label,clean_text
0,\n\nI am sure some bashers of Pens fans are pr...,10,sure bashers pens fans pretty confused lack ki...
1,My brother is in the market for a high-perform...,3,brother market highperformance video card supp...
2,\n\n\n\n\tFinally you said what you dream abou...,17,finally said dream mediterranean new area grea...
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,think scsi card dma transfers disks scsi card ...
4,1) I have an old Jasmine drive which I cann...,4,old jasmine drive use new system understanding...
...,...,...,...
18841,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,13,dn nyedacnsvaxuwecedu david nye dn neurology d...
18842,\nNot in isolated ground recepticles (usually ...,12,isolated ground recepticles usually unusual co...
18843,I just installed a DX2-66 CPU in a clone mothe...,3,installed dx cpu clone motherboard tried mount...
18844,\nWouldn't this require a hyper-sphere. In 3-...,1,wouldnt require hypersphere space points speci...


In [5]:
def generate_aes_cipher(key, iv=None):
    """Build an AES-CBC ``Cipher`` object for ``key``.

    Args:
        key: AES key bytes, handed to ``algorithms.AES``.
        iv: Optional CBC initialisation vector (16 bytes for AES). When
            omitted, a random one is drawn with ``os.urandom(16)``, which is
            the original behaviour. Pass an explicit value when the same
            cipher must be rebuilt later, e.g. to decrypt in another session.

    Returns:
        The ``Cipher`` instance; callers obtain encryptors/decryptors from it.
    """
    if iv is None:
        iv = os.urandom(16)
    return Cipher(algorithms.AES(key), modes.CBC(iv))

def encrypt_text_aes(text, cipher):
    """Encrypt ``text`` one whitespace-separated word at a time.

    Every word is PKCS7-padded to a 128-bit block, run through a fresh
    encryptor obtained from ``cipher``, and base64-encoded. The encoded words
    are returned joined by single spaces.
    """
    def _seal(token):
        # Pad first, then encrypt, then make the bytes printable.
        pkcs7 = crypto_padding.PKCS7(128).padder()
        block = pkcs7.update(token.encode()) + pkcs7.finalize()
        enc = cipher.encryptor()
        ciphertext = enc.update(block) + enc.finalize()
        return base64.b64encode(ciphertext).decode()

    return ' '.join(_seal(token) for token in text.split())

def decrypt_text_aes(encrypted_text, cipher):
    """Reverse ``encrypt_text_aes`` for a space-separated string of tokens.

    Every token is base64-decoded, run through a fresh decryptor obtained from
    ``cipher``, stripped of its PKCS7 padding and decoded to ``str``. The
    recovered words are returned joined by single spaces.
    """
    def _open(token):
        dec = cipher.decryptor()
        raw = base64.b64decode(token.encode())
        padded = dec.update(raw) + dec.finalize()
        strip = crypto_padding.PKCS7(128).unpadder()
        return (strip.update(padded) + strip.finalize()).decode()

    return ' '.join(_open(token) for token in encrypted_text.split())


# Draw a random 256-bit AES key for this kernel session and build the cipher
# from it. Nothing visible in this notebook persists the key or the IV chosen
# inside generate_aes_cipher, so decryption relies on these in-memory objects.
key = os.urandom(32)
aes_cipher = generate_aes_cipher(key)


def _encrypt_with_session_cipher(clean_text_value):
    """Encrypt one cleaned document word-by-word with the session cipher."""
    return encrypt_text_aes(clean_text_value, aes_cipher)


# Encrypt the cleaned text column in the dataframe
df['encrypted_clean_text'] = df['clean_text'].apply(_encrypt_with_session_cipher)


In [6]:
# Expose the encrypted tokens under the column name 'encrypted_text'.
df['encrypted_text'] = df['encrypted_clean_text']
# NOTE: this overwrites the raw posts in 'text' with their cleaned form; the
# original bodies are no longer available in df once this cell has run.
df['text'] = df['clean_text'] 

In [7]:
# Spot-check the row labelled 1: its (now cleaned) text, then its label.
print(df.loc[1, 'text'])
print(df.loc[1, 'label'])

brother market highperformance video card supports vesa local bus mb ram anyone suggestionsideas diamond stealth pro local bus orchid farenheit ati graphics ultra pro highperformance vlb card please post email thank matt
3


In [8]:
# Show the ciphertext of the row labelled 3; repeated words yield repeated tokens.
df.loc[3, 'encrypted_clean_text']

'+iOjzQHLnz6KBCsFpfemmw== 0cROijn0kD96hQ4RYokuKA== 5GpU3glMeTUv8uwxs6pC0w== mqxCtH366kIs2YCopB7nZw== oW9LCdhVYVEcuW4ntylJYw== 0doYhEEhOORr1bYy/rpi8Q== 0cROijn0kD96hQ4RYokuKA== 5GpU3glMeTUv8uwxs6pC0w== mqxCtH366kIs2YCopB7nZw== oW9LCdhVYVEcuW4ntylJYw== 54D6EazbkP3nNwg8jiRuMA== x0J2aB81OhGPovv4j0o54g== 0cROijn0kD96hQ4RYokuKA== sp5kjo3ZoTJVYErmjWNRkQ== AMOdcpC4PpT82WoWS32Vmw== z5EshoRyPsywgbgOF5FfGg== WCj8IYhlp+p0GgLh8tBdng== UlZFKqeWmJCFeaGm+WiLWQ== 0cROijn0kD96hQ4RYokuKA== +guw3hLNAcombsBKEofEQg== UlnbizqWwftgssL4ETSQZg== uYPxTJBIZvnKWasOeGVlWw== HKx/uJVb4yrG7Zr076rs/A== 0cROijn0kD96hQ4RYokuKA== 2T8Fyk2jXtTG5aHs00hl/A== sp5kjo3ZoTJVYErmjWNRkQ== fS78W8eL5d9JMWEDyNz6fQ== 70oirLmC3YWyURqWULU1QQ== qtzMtWDxRm1i2ygunmRguQ== LZ2YEfkeuJyRWpeDmpY/6w== 3WR78RuF1oVNWMbBi0/r6g== oW9LCdhVYVEcuW4ntylJYw== biw6uPYYGKFSUwysXUhe4A== sp5kjo3ZoTJVYErmjWNRkQ== uYPxTJBIZvnKWasOeGVlWw== /vK/PUCxEECEoR2Yd0G09g== x0J2aB81OhGPovv4j0o54g== 2T8Fyk2jXtTG5aHs00hl/A== 3TSL8BBOedZB+9UW4IHK+w== e3dNR5qTrxR3dkLWELRmhQ==

In [9]:
from gensim.models.doc2vec import TaggedDocument
import gensim

def create_tagged_documents(texts):
    """Wrap each text in a ``TaggedDocument`` for Doc2Vec training.

    The text is split with ``word_tokenize`` and tagged with a one-element
    list holding its position in ``texts``. Returns the documents as a list,
    in input order.
    """
    tagged = []
    for position, text in enumerate(texts):
        tagged.append(TaggedDocument(word_tokenize(text), [position]))
    return tagged

# Tag every document, plain and encrypted, with its position in the column.
normal_documents = create_tagged_documents(df['text'])
encrypted_documents = create_tagged_documents(df['encrypted_text'])

# Both corpora are embedded with the same Doc2Vec hyper-parameters.
doc2vec_params = dict(vector_size=100, window=5, min_count=1, workers=4, epochs=10)
normal_doc2vec_model = gensim.models.Doc2Vec(normal_documents, **doc2vec_params)
encrypted_doc2vec_model = gensim.models.Doc2Vec(encrypted_documents, **doc2vec_params)


In [10]:
def _infer_corpus_vectors(model, documents):
    """Infer one vector per tagged document with ``model`` and return them as an array."""
    return np.array([model.infer_vector(tagged.words) for tagged in documents])


normal_doc_vectors = _infer_corpus_vectors(normal_doc2vec_model, normal_documents)
encrypted_doc_vectors = _infer_corpus_vectors(encrypted_doc2vec_model, encrypted_documents)


In [11]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

def train_and_evaluate_xgboost(X, y):
    """Fit an XGBoost classifier on an 80/20 split and print its test report.

    Args:
        X: Feature matrix, one row per sample.
        y: Class labels aligned with ``X``.

    The split uses ``random_state=42``. Nothing is returned; the
    classification report for the held-out 20% is printed.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # NOTE(review): use_label_encoder is reported as deprecated by recent
    # xgboost releases — confirm against the pinned version.
    booster = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    booster.fit(features_train, labels_train)

    report = classification_report(labels_test, booster.predict(features_test))
    print(report)

# Train and evaluate one XGBoost classifier per representation, using the
# same label column for both so the two reports can be read side by side.
# (The trailing dump of the raw vector array and label Series was leftover
# debugging output and has been removed.)
print("Results for normal dataset:")
train_and_evaluate_xgboost(normal_doc_vectors, df['label'])

print("\nResults for encrypted dataset:")
train_and_evaluate_xgboost(encrypted_doc_vectors, df['label'])

Results for normal dataset:




              precision    recall  f1-score   support

           0       0.37      0.38      0.37       151
           1       0.43      0.47      0.45       202
           2       0.45      0.46      0.45       195
           3       0.34      0.39      0.36       183
           4       0.34      0.31      0.32       205
           5       0.69      0.65      0.67       215
           6       0.51      0.54      0.53       193
           7       0.45      0.44      0.45       196
           8       0.35      0.55      0.43       168
           9       0.61      0.58      0.60       211
          10       0.65      0.61      0.63       198
          11       0.69      0.62      0.65       201
          12       0.47      0.46      0.46       202
          13       0.68      0.70      0.69       194
          14       0.62      0.58      0.60       189
          15       0.59      0.62      0.61       202
          16       0.53      0.59      0.56       188
          17       0.67    



              precision    recall  f1-score   support

           0       0.30      0.34      0.32       151
           1       0.49      0.51      0.50       202
           2       0.44      0.50      0.47       195
           3       0.36      0.46      0.41       183
           4       0.36      0.32      0.34       205
           5       0.70      0.66      0.68       215
           6       0.55      0.47      0.50       193
           7       0.53      0.51      0.52       196
           8       0.38      0.54      0.44       168
           9       0.58      0.58      0.58       211
          10       0.66      0.61      0.63       198
          11       0.68      0.59      0.63       201
          12       0.41      0.43      0.42       202
          13       0.68      0.68      0.68       194
          14       0.64      0.67      0.65       189
          15       0.62      0.67      0.64       202
          16       0.54      0.55      0.54       188
          17       0.66    

LSTM

In [12]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Reshape
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [13]:
# Shorter aliases for the two trained Doc2Vec models, plus the label column.
model_clean, model_encrypted = normal_doc2vec_model, encrypted_doc2vec_model
labels = df['label']

In [18]:
import numpy as np

def get_doc2vec_embeddings(model, texts):
    """Represent each text by the mean of its known word vectors.

    Args:
        model: Object exposing ``vector_size`` and a ``wv`` mapping that
            supports ``in`` and item lookup by word.
        texts: Iterable of whitespace-tokenisable strings.

    Returns:
        ``np.ndarray`` with one row per text. A text with no word present in
        ``model.wv`` gets an all-zero row.
    """
    def _mean_vector(text):
        # Accumulate in a zero-initialised array, one word at a time.
        total = np.zeros(model.vector_size)
        hits = 0
        for token in text.split():
            if token in model.wv:
                total += model.wv[token]
                hits += 1
        if hits > 0:
            total /= hits
        return total

    return np.array([_mean_vector(text) for text in texts])

# Build one feature row per document for the plain and the encrypted text.
doc2vec_embeddings_clean = get_doc2vec_embeddings(model_clean, df['clean_text'])
doc2vec_embeddings_encrypted = get_doc2vec_embeddings(model_encrypted, df['encrypted_clean_text'])

# One-hot encode the integer labels.
labels = to_categorical(df['label'])

# Split both feature sets 80/20 with the same seed.
split_settings = dict(test_size=0.2, random_state=42)
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    doc2vec_embeddings_clean, labels, **split_settings
)
X_train_encrypted, X_test_encrypted, y_train_encrypted, y_test_encrypted = train_test_split(
    doc2vec_embeddings_encrypted, labels, **split_settings
)

X_train_clean


array([[-0.8898607 ,  0.80914363,  0.35705892, ..., -0.39982017,
         0.20058671,  0.49158667],
       [-0.78020693,  0.97336184,  0.36389062, ..., -0.4591665 ,
         0.44781539,  0.49363994],
       [-0.8788677 ,  0.71920821,  0.27195729, ..., -0.31140769,
         0.24486899,  0.56826073],
       ...,
       [-1.32699013,  0.3547475 ,  0.17285054, ..., -0.74390408,
         0.82111735,  0.59875816],
       [-0.88956872,  0.69085605,  0.34060248, ..., -0.40555981,
         0.03921853,  0.56433386],
       [-0.92768939,  0.46025967,  0.58665491, ..., -0.34025101,
         0.11807105,  0.59344615]])

In [15]:
def create_lstm_model(input_shape, num_classes=20):
    """Build and compile the stacked-LSTM classifier.

    Args:
        input_shape: One-element tuple ``(vector_size,)`` describing a flat
            feature vector per sample.
        num_classes: Width of the softmax output layer. Defaults to 20, the
            value previously hard-coded, so existing callers are unaffected.

    Returns:
        A compiled ``Sequential`` model (Adam optimiser, categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential()
    model.add(tf.keras.Input(shape=input_shape))
    # Present each flat vector as a sequence of length one before the LSTMs.
    model.add(Reshape((1, input_shape[0])))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def _fit_and_report(features_train, targets_train, features_test, targets_test, vector_size, banner):
    """Train a fresh LSTM classifier and print its test-set classification report.

    Returns a tuple ``(input_shape, model, predicted_ids, true_ids)`` where the
    id arrays are the argmax over the last axis of the predictions and of the
    one-hot targets respectively.
    """
    shape = (vector_size,)
    network = create_lstm_model(shape)
    network.fit(features_train, targets_train, epochs=10, batch_size=64, validation_split=0.1)

    predicted = np.argmax(network.predict(features_test), axis=-1)
    actual = np.argmax(targets_test, axis=-1)
    print(banner)
    print(classification_report(actual, predicted))
    return shape, network, predicted, actual


# Cleaned-text run.
input_shape, lstm_model_clean, y_pred_clean, y_true_clean = _fit_and_report(
    X_train_clean, y_train_clean, X_test_clean, y_test_clean,
    model_clean.vector_size, "Results for cleaned text data:",
)

# Encrypted-text run, same architecture and training settings.
input_shape, lstm_model_encrypted, y_pred_encrypted, y_true_encrypted = _fit_and_report(
    X_train_encrypted, y_train_encrypted, X_test_encrypted, y_test_encrypted,
    model_encrypted.vector_size, "Results for encrypted text data:",
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Results for cleaned text data:
              precision    recall  f1-score   support

           0       0.36      0.40      0.38       151
           1       0.47      0.46      0.47       202
           2       0.51      0.59      0.55       195
           3       0.41      0.50      0.45       183
           4       0.47      0.21      0.30       205
           5       0.71      0.75      0.73       215
           6       0.68      0.67      0.67       193
           7       0.63      0.50      0.56       196
           8       0.29      0.62      0.40       168
           9       0.65      0.61      0.63       211
          10       0.74      0.65      0.69       198
          11       0.75      0.66      0.70       201
          12       0.48      0.47      0.47       202
          13       0.65      0.74      0.69       194
          14       0.64      0.70      0.67    

Encryption Test with Normal Input

In [16]:
# Feed the clean-text test embeddings to the model trained on encrypted text.
clean_test_scores = lstm_model_encrypted.predict(X_test_clean)
y_pred_encrypted_model_on_clean = np.argmax(clean_test_scores, axis=-1)
clean_test_targets = np.argmax(y_test_clean, axis=-1)

# Accuracy = fraction of test rows whose predicted class equals the true class.
encrypted_model_clean_test_accuracy = np.mean(y_pred_encrypted_model_on_clean == clean_test_targets)

print("Accuracy of the encrypted model on clean test set: {:.2f}%".format(encrypted_model_clean_test_accuracy * 100))
print("\nClassification Report:")
print(classification_report(clean_test_targets, y_pred_encrypted_model_on_clean))


Accuracy of the encrypted model on clean test set: 44.77%

Classification Report:
              precision    recall  f1-score   support

           0       0.16      0.52      0.24       151
           1       0.36      0.50      0.42       202
           2       0.47      0.47      0.47       195
           3       0.53      0.14      0.22       183
           4       0.42      0.41      0.42       205
           5       0.79      0.29      0.42       215
           6       0.73      0.59      0.65       193
           7       0.57      0.27      0.36       196
           8       0.26      0.46      0.33       168
           9       0.36      0.25      0.29       211
          10       0.46      0.87      0.60       198
          11       0.81      0.58      0.67       201
          12       0.51      0.38      0.44       202
          13       0.51      0.64      0.57       194
          14       0.49      0.71      0.58       189
          15       0.47      0.79      0.59       202