## Using XGBoost

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
# Load the labelled ciphertext dataset; the path is relative to the notebook's working directory.
DATASET_PATH = "datasets/crypto_dataset_large2.csv"
df = pd.read_csv(DATASET_PATH)

In [5]:
# Encode the algorithm names as integer class ids. The fitted encoder is kept
# so predictions can later be mapped back with label_encoder.inverse_transform.
label_encoder = LabelEncoder()
label_encoder.fit(df['Algorithm'])
df['algorithm_label'] = label_encoder.transform(df['Algorithm'])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
ciphertexts = df['Ciphertext']
labels = df['algorithm_label']
X_train, X_test, y_train, y_test = train_test_split(
    ciphertexts,
    labels,
    test_size=0.2,
    random_state=42,
)


In [7]:
# Number of samples per algorithm, to check class balance.
algorithm_counts = df["Algorithm"].value_counts()
algorithm_counts

Algorithm
DES         20000
ChaCha20    20000
ECC         20000
Blowfish    20000
RC4         20000
Name: count, dtype: int64

In [8]:
import hashlib
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature extraction using TF-IDF-weighted single-character frequencies.
def extract_features(cipher_texts, vectorizer=None, fit=True):
    """Convert ciphertext strings into a sparse character-level TF-IDF matrix.

    Parameters
    ----------
    cipher_texts : iterable of str
        The ciphertexts to vectorize.
    vectorizer : TfidfVectorizer, optional
        Vectorizer to use. When omitted, a fresh character-unigram
        ``TfidfVectorizer`` is created (the original behaviour).
    fit : bool, default True
        If True, learn the vocabulary and IDF weights from ``cipher_texts``
        and transform them (``fit_transform``). If False, only transform
        with the already-fitted ``vectorizer``; use this for validation and
        test data so their columns and weights match the training matrix.

    Returns
    -------
    scipy sparse matrix
        One row per ciphertext, one column per character in the vocabulary.

    Raises
    ------
    ValueError
        If ``fit=False`` is requested without supplying a vectorizer.
    """
    if vectorizer is None:
        if not fit:
            raise ValueError("fit=False requires an already-fitted vectorizer")
        # NOTE(review): TfidfVectorizer lowercases its input by default. If the
        # ciphertexts use a case-sensitive encoding, consider lowercase=False
        # -- confirm against the dataset before changing.
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 1))

    if fit:
        return vectorizer.fit_transform(cipher_texts)
    return vectorizer.transform(cipher_texts)


# Fit the vocabulary and IDF weights on the training split only, then reuse the
# same fitted vectorizer for the test split. Fitting a second vectorizer on the
# test data would give it its own vocabulary and weights, so its columns would
# not be guaranteed to line up with what the model was trained on.
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 1))
X_train_features = extract_features(X_train, vectorizer=char_vectorizer, fit=True)
X_test_features = extract_features(X_test, vectorizer=char_vectorizer, fit=False)


In [9]:
# Check that feature matrices and label vectors agree on row counts
# (and that train and test share the same number of columns).
(
    X_train_features.shape,
    X_test_features.shape,
    y_train.shape,
    y_test.shape,
)

((80000, 39), (20000, 39), (80000,), (20000,))

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
# Fit a gradient-boosted tree classifier on the training features.
xgb_params = {"n_estimators": 100, "random_state": 42}
model = XGBClassifier(**xgb_params)
model.fit(X_train_features, y_train)

# Alternative baseline, left disabled:
# model = KNeighborsClassifier(n_neighbors=100)
# model.fit(X_train_features, y_train)

# Evaluate on the held-out split: predict class ids, then compare to the true labels.
y_pred = model.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")


Model Accuracy: 0.5851
