In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('cryptography_dataset_processed.csv')

y = df['Algorithm']

X = df.drop(['Algorithm'], axis=1)

In [4]:
X

Unnamed: 0,Key,Ciphertext,Key Length (bits),Ciphertext Length (bytes)
0,ef7d8f79011629e953bf4f7738317bb5,5bff23c6f03e37c1,0.036866,0.000000
1,3453d3b1750432cf,b1366a30a0c3b8f68093c601b061180d,0.000000,0.032258
2,df3445340794dcce3d07bcc1ad375ece524ab5d001073b08,c0917f1c9df3d9b3965ac26779314db5,0.073733,0.032258
3,-----BEGIN PUBLIC KEY-----\nMIIBIjANBgkqhkiG9w...,c20b03112d92b17d2daaac7e868fd6b697f436a03f8dbb...,1.000000,1.000000
4,1db0711ab8922217c4bf74c613073d8c,428aba00a39ee506e87c1b2924e8e219,0.036866,0.032258
...,...,...,...,...
119995,7d6efe610da52f8e4bc51e7269cef6ae,6578616d706c6539393939,0.036866,0.012097
119996,7daaa64b9a1afa119c012f5b0b0df7be,6578616d706c6539393939,0.036866,0.012097
119997,d59f776d74766370c2d68776b60112ce,6578616d706c6539393939,0.036866,0.012097
119998,-----BEGIN PUBLIC KEY-----\nMFkwEwYHKoZIzj0CAQ...,6578616d706c6539393939,0.373272,0.012097


In [5]:
y

0                AES
1                DES
2         Triple DES
3                RSA
4           Blowfish
             ...    
119995      Camellia
119996       Serpent
119997       ElGamal
119998           ECC
119999          GOST
Name: Algorithm, Length: 120000, dtype: object

In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y = le.fit_transform(y)

In [7]:
y

array([ 0,  4, 11, ...,  6,  5,  7])

In [7]:
df['Algorithm'].value_counts()

Algorithm
AES           10000
DES           10000
Triple DES    10000
RSA           10000
Blowfish      10000
ChaCha20      10000
RC4           10000
Camellia      10000
Serpent       10000
ElGamal       10000
ECC           10000
GOST          10000
Name: count, dtype: int64

In [None]:
df.isnull().sum()

Algorithm                    0
Key                          0
Ciphertext                   0
Key Length (bits)            0
Ciphertext Length (bytes)    0
dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((96000, 4), (24000, 4), (96000,), (24000,))

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=1500)

X_key = tfidf_vectorizer.fit_transform(df['Key'])
X_ciphertext = tfidf_vectorizer.fit_transform(df['Ciphertext'])


In [9]:
X_df = [X_key, X_ciphertext, X['Key Length (bits)'], X['Ciphertext Length (bytes)']]

In [None]:
import scipy.sparse as sp
import numpy as np

X_numeric = np.array(X[['Key Length (bits)', 'Ciphertext Length (bytes)']])
X_numeric_sparse = sp.csr_matrix(X_numeric)

X_combined = sp.hstack([X_key, X_ciphertext, X_numeric_sparse])


In [30]:
X_combined

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 336604 stored elements and shape (120000, 3002)>

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train the model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {accuracy:.2f}')


Random Forest Accuracy: 0.58


With DeepLearning

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Hyperparameters
MAX_WORDS = 1000
MAX_LENGTH = 100

# Tokenizer for text
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(df['Key'].astype(str) + " " + df['Ciphertext'].astype(str))

# Convert text to sequences
X_key_seq = tokenizer.texts_to_sequences(df['Key'].astype(str))
X_ciphertext_seq = tokenizer.texts_to_sequences(df['Ciphertext'].astype(str))

# Padding to ensure uniform input shape
X_key_padded = pad_sequences(X_key_seq, maxlen=MAX_LENGTH)
X_ciphertext_padded = pad_sequences(X_ciphertext_seq, maxlen=MAX_LENGTH)

# Convert numerical columns to numpy array
X_numeric = np.array(df[['Key Length (bits)', 'Ciphertext Length (bytes)']])

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Algorithm'])

# Convert to PyTorch tensors
X_key_tensor = torch.tensor(X_key_padded, dtype=torch.long)
X_ciphertext_tensor = torch.tensor(X_ciphertext_padded, dtype=torch.long)
X_numeric_tensor = torch.tensor(X_numeric, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Train-test split
X_key_train, X_key_test, X_cipher_train, X_cipher_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    X_key_tensor, X_ciphertext_tensor, X_numeric_tensor, y_tensor, test_size=0.2, random_state=42
)

# Create PyTorch Dataset and DataLoader
train_dataset = TensorDataset(X_key_train, X_cipher_train, X_num_train, y_train)
test_dataset = TensorDataset(X_key_test, X_cipher_test, X_num_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [11]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(LSTMModel, self).__init__()

        # LSTM for 'Key'
        self.embedding_key = nn.Embedding(vocab_size, embed_dim)
        self.lstm_key = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        
        # LSTM for 'Ciphertext'
        self.embedding_cipher = nn.Embedding(vocab_size, embed_dim)
        self.lstm_cipher = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        
        # Fully connected layer for numerical features
        self.fc_numeric = nn.Linear(2, 32)  # 2 numerical features

        # Final classification layer
        self.fc_final = nn.Linear(hidden_dim * 2 + 32, num_classes)  # Combine LSTMs + numerical features
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, key, cipher, numeric):
        # Key sequence through LSTM
        key_embedded = self.embedding_key(key)
        _, (key_hidden, _) = self.lstm_key(key_embedded)
        
        # Ciphertext sequence through LSTM
        cipher_embedded = self.embedding_cipher(cipher)
        _, (cipher_hidden, _) = self.lstm_cipher(cipher_embedded)
        
        # Flatten LSTM outputs
        key_hidden = key_hidden[-1]
        cipher_hidden = cipher_hidden[-1]

        # Process numerical features
        numeric_features = torch.relu(self.fc_numeric(numeric))

        # Concatenate all features
        combined = torch.cat((key_hidden, cipher_hidden, numeric_features), dim=1)
        output = self.fc_final(self.dropout(combined))

        return output


In [12]:
# Hyperparameters
EMBED_DIM = 64
HIDDEN_DIM = 32
NUM_CLASSES = len(label_encoder.classes_)
EPOCHS = 5
LEARNING_RATE = 0.001

# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(vocab_size=MAX_WORDS, embed_dim=EMBED_DIM, hidden_dim=HIDDEN_DIM, num_classes=NUM_CLASSES)
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    correct, total = 0, 0
    
    for X_key_batch, X_cipher_batch, X_num_batch, y_batch in train_loader:
        X_key_batch, X_cipher_batch, X_num_batch, y_batch = X_key_batch.to(device), X_cipher_batch.to(device), X_num_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_key_batch, X_cipher_batch, X_num_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        correct += (outputs.argmax(dim=1) == y_batch).sum().item()
        total += y_batch.size(0)
    
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(train_loader):.4f}, Accuracy: {correct/total:.4f}")


Epoch 1/5, Loss: 1.8132, Accuracy: 0.3342


KeyboardInterrupt: 

In [None]:
# Evaluation
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for X_key_batch, X_cipher_batch, X_num_batch, y_batch in test_loader:
        X_key_batch, X_cipher_batch, X_num_batch, y_batch = X_key_batch.to(device), X_cipher_batch.to(device), X_num_batch.to(device), y_batch.to(device)

        outputs = model(X_key_batch, X_cipher_batch, X_num_batch)
        correct += (outputs.argmax(dim=1) == y_batch).sum().item()
        total += y_batch.size(0)

print(f"Test Accuracy: {correct/total:.4f}")


Another dataset

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("cryptography_dataset_enhanced.csv")  # Ensure this contains plaintext, ciphertext, and algorithm

# Encode labels
label_encoder = LabelEncoder()
df['algorithm_label'] = label_encoder.fit_transform(df['Algorithm'])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(df['Ciphertext'], df['algorithm_label'], test_size=0.2, random_state=42)


In [28]:
df["algorithm_label"].value_counts()

algorithm_label
1    2558
2    2533
4    2514
5    2510
6    2508
3    2479
0    2472
7    2426
Name: count, dtype: int64

In [22]:
import hashlib
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature extraction using character frequency and hashing
def extract_features(cipher_texts):
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 1))
    return vectorizer.fit_transform(cipher_texts)

X_train_features = extract_features(X_train)
X_test_features = extract_features(X_test)


In [23]:
X_train_features.shape, X_test_features.shape, y_train.shape, y_test.shape

((80000, 39), (20000, 39), (80000,), (20000,))

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Train model
model2 = XGBClassifier(n_estimators=100, random_state=42)
model2.fit(X_train_features, y_train)


# model = KNeighborsClassifier(n_neighbors=100)
# model.fit(X_train_features, y_train)

# Predict
y_pred = model2.predict(X_test_features)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")


Model Accuracy: 0.5851


In [18]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from collections import Counter

# Load dataset
df = pd.read_csv("crypto_dataset_large2.csv")  # Ensure it contains cipher_text & algorithm

# Encode labels (algorithm -> numeric)
label_encoder = LabelEncoder()
df['algorithm_label'] = label_encoder.fit_transform(df['Algorithm'])
print(df['algorithm_label'])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(df['Ciphertext'], df['algorithm_label'], test_size=0.2, random_state=42)


0        2
1        2
2        2
3        2
4        2
        ..
99995    4
99996    4
99997    4
99998    4
99999    4
Name: algorithm_label, Length: 100000, dtype: int32


In [None]:
# Create a character vocabulary from ciphertext
all_chars = "".join(df['Ciphertext'])
char_counts = Counter(all_chars)
char_vocab = sorted(char_counts.keys())  # Unique characters
char_to_idx = {char: idx for idx, char in enumerate(char_vocab, start=1)}

# Function to convert ciphertext to sequence of numbers
def text_to_sequence(text, max_len=100):
    seq = [char_to_idx.get(char, 0) for char in text]
    seq = seq[:max_len] + [0] * (max_len - len(seq))  # Pad/truncate to max_len
    return seq

# Convert all ciphertexts
X_train_seq = torch.tensor([text_to_sequence(text) for text in X_train], dtype=torch.long)
X_test_seq = torch.tensor([text_to_sequence(text) for text in X_test], dtype=torch.long)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)


In [4]:
class EncryptionDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create DataLoaders
batch_size = 32
train_dataset = EncryptionDataset(X_train_seq, y_train_tensor)
test_dataset = EncryptionDataset(X_test_seq, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
import torch.nn as nn

class CipherLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, dropout_prob=0.5):
        super(CipherLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        hidden = self.dropout(hidden[-1])  # Apply dropout before the fully connected layer
        out = self.fc(hidden)
        return out

# Model parameters
vocab_size = len(char_vocab) + 1  # Extra 1 for padding index
embed_dim = 128
hidden_dim = 256
num_classes = len(label_encoder.classes_)

# Initialize model
model = CipherLSTM(vocab_size, embed_dim, hidden_dim, num_classes, dropout_prob=0.5)


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        # batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Print loss every epoch
    # if (epoch + 1) % 1 == 0:
    #     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")


Epoch [1/3], Loss: 0.6307
Epoch [2/3], Loss: 0.6224
Epoch [3/3], Loss: 0.6253


In [None]:
from sklearn.metrics import accuracy_score

model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        # batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(batch_y.cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.6550


In [20]:
import pickle

# Save the model
torch.save(model.state_dict(), "cipher_lstm_model.pth")

In [13]:
import pandas as pd

df2 = pd.read_csv("cryptography_dataset_enhanced.csv")
df2

Unnamed: 0,Plaintext,Ciphertext,Algorithm,Key
0,DdClLbN6qfFRtpVyP033zCjp0FndJ53O,9HtvDQw6d0hnjkkokNpaAa7aO4zxbvPEfqpKr3PwnnMo6x...,Blowfish,1a1ea0429141dc2bf9041d41f825553f
1,SJktYGKNee2MKBz1dsoyntoC0LrpmZOX,BDu9JxclhcofdpFkBRLTYVLD21dg2tqY/NFlN1CRpls=,RC4,ee1e67381c4dfbdef477a2b58726b3eb
2,32VKfI0uqZgPUmWgHgPFFcwDWG1ONHWM,603b2ef6a5f42bb1924fea4d22f1af6b49cf8abfaf385f...,SHA-256,
3,qKIpLiJrQ81sTRK2U1OSH0i6OpiBYDul,7cJcDg85U4BuuCEreK+C7LouVenzQHJ8FFII3Ei2yp3Ud8...,3DES,4d78801cb57f077cff9bfaddc0342a3dd9acde28d2decb80
4,7yOAGHJ0f2KfEqcyVce6tJ5FJmuhXkLC,LMhm3BjIcUb/1eqdlvRyI/rcGWpF/2xvkdNsCosd+hIKdi...,AES,3f3beaa14cfc3987dd370495a4956625
...,...,...,...,...
19995,onHpfzKuaNcCbjEIjqvJgBnAEx4kFncp,d9d76048703d7bf0bb06e565ecffe5ac316a2eb2c2d090...,SHA-256,
19996,OB7YwgGANOdsttBvIvXlVD2eBVudDUOp,yBys9z6ay2KyzPa7LZjzbtXAJiYqWbRoV7LJhZJIniDdV+...,DES,d48ca6012cc70ee6
19997,k4fkCoU1tcvXYuhoQERLRL6sfCRmojhh,1f4603851b0148a93e0b4fd020ace87314af5838ab959d...,SHA-256,
19998,5PiG6IMYkVCwNeU0TmdGIB6EZSE191Dq,jm8uRnVsW8cDTHIQM/hr/P3UsJjBrdxoFaLe6Aw8Zi4knn...,RSA,-----BEGIN PUBLIC KEY-----\nMIIBIjANBgkqhkiG9w...


In [14]:
df2 = df2[~df2['Algorithm'].isin(['AES', 'RSA', 'SHA-256', '3DES'])]

df2 = df2.dropna(subset=['Algorithm'])

print(df2['Algorithm'].value_counts())


Algorithm
Blowfish    2533
DES         2514
RC4         2510
ChaCha20    2479
Name: count, dtype: int64


In [15]:
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)


{'Blowfish': 0, 'ChaCha20': 1, 'DES': 2, 'ECC': 3, 'RC4': 4}


In [16]:
# Load dataset
X_test_features2 = df2["Ciphertext"]
y_test2 = df2["Algorithm"]

y_test2 = y_test2.map(label_mapping)

# Check for NaN values
if y_test2.isna().sum() > 0:
    print("Warning: Found NaN values in y_test2 after mapping!")
    print(y_test2[y_test2.isna()])  # Print missing values
    y_test2 = y_test2.dropna()  # Drop rows with missing values

# Ensure X_test_features2 has matching indices
X_test_features2 = X_test_features2.loc[y_test2.index]

X_test_seq2 = torch.tensor([text_to_sequence(text) for text in X_test_features2], dtype=torch.long)
y_test_tensor2 = torch.tensor(y_test2.astype(int).values, dtype=torch.long)

test_dataset2 = EncryptionDataset(X_test_seq2, y_test_tensor2)
test_loader2 = DataLoader(test_dataset2, batch_size=batch_size, shuffle=False)

model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch_X, batch_y in test_loader2:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(batch_y.cpu().numpy())

# Compute accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.2501
