In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical
import pickle

2024-06-07 10:35:07.474396: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the dataset
data = pd.read_csv('../dataset/processed_data/new_dataset.csv')

# Extract addresses and labels
addresses = data['address'].values
labels = data['type'].values

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Tokenize the addresses
tokenizer = Tokenizer(char_level=True)  # Tokenize at the character level
tokenizer.fit_on_texts(addresses)
sequences = tokenizer.texts_to_sequences(addresses)
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Convert labels to categorical format
categorical_labels = to_categorical(encoded_labels)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, categorical_labels, test_size=0.2, random_state=42)

# Get the number of unique classes
num_classes = len(label_encoder.classes_)

In [3]:
# Convert labels to categorical format
categorical_labels = to_categorical(encoded_labels, num_classes=num_classes)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Conv1D(64, 3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))  # Adjust num_classes to the number of classes in your task

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



In [4]:
# Train the model
history = model.fit(X_train, y_train, epochs=1, batch_size=32, validation_split=0.2)

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1578s[0m 157ms/step - accuracy: 0.9812 - loss: 0.0441 - val_accuracy: 1.0000 - val_loss: 5.1142e-07


In [5]:
# Save the model
model.save('../models/crypto_address_classifier_epoch1_newv2.h5')



In [6]:
# Save the tokenizer
with open('../models/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the label encoder
with open('../models/label_encoder.pickle', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save max_length to a text file
with open('../models/max_length.txt', 'w') as f:
    f.write(str(max_length))


In [7]:
print(max_length)

95


In [8]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.4f}')
print(f'Test Loss: {test_loss:.4f}')

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 45ms/step - accuracy: 1.0000 - loss: 4.8779e-07
Test Accuracy: 1.0000
Test Loss: 0.0000


In [9]:
# Load the trained model
model = load_model('../models/crypto_address_classifier_epoch1_newv2.h5')

# Sample new addresses for inference
new_addresses = ["1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa", 
                 "0x9cc6c2b37428c258a0783d29c3cc05c1cc8348a9", 
                 "XkMSxZA88p3k6JN3XMcgZh53HjVzboSUuh",
                 "TTqtSdSUfNMAxQzaCzNf7ULZEGLQXMZx0H",
                 "85kay29yxusnlc8ebh71bbsl8mpvnlc42fbyza1e90b0zwbf68g67efw4m7gjbitec8yrgc23w7lifqogub1hnsoppp4161",
                ]

# Preprocess the new addresses
new_sequences = tokenizer.texts_to_sequences(new_addresses)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_length, padding='post')

# Predict the classes for the new addresses
predictions = model.predict(new_padded_sequences)
predicted_classes = predictions.argmax(axis=-1)

# Convert indices to class labels
predicted_labels = label_encoder.inverse_transform(predicted_classes)

# Print the results
for address, label in zip(new_addresses, predicted_labels):
    print(f"Address: {address}, Predicted Class: {label}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 520ms/step
Address: 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa, Predicted Class: bitcoin
Address: 0x9cc6c2b37428c258a0783d29c3cc05c1cc8348a9, Predicted Class: ethereum
Address: XkMSxZA88p3k6JN3XMcgZh53HjVzboSUuh, Predicted Class: dash
Address: TTqtSdSUfNMAxQzaCzNf7ULZEGLQXMZx0H, Predicted Class: Tron
Address: 85kay29yxusnlc8ebh71bbsl8mpvnlc42fbyza1e90b0zwbf68g67efw4m7gjbitec8yrgc23w7lifqogub1hnsoppp4161, Predicted Class: Monero


In [11]:
import re

# Define regex patterns for each cryptocurrency, including the updated Bitcoin, Tron, and Monero patterns
regex_patterns = {
    'bitcoin': r'^[13][a-km-zA-HJ-NP-Z1-9]{25,34}$|^bc1[ac-hj-np-z02-9]{38,59}$',
    'ethereum': r'^0x[a-fA-F0-9]{40}$',
    'dash': r'^X[1-9A-HJ-NP-Za-km-z]{33}$',
    'tron': r'^T[a-zA-Z0-9]{33}$',
    'monero': r'^[48][a-z0-9]{94}$'
}

# Function to perform inference and compare with regex
def predict_and_validate(addresses, model, tokenizer, label_encoder, max_length):
    # Preprocess the addresses
    sequences = tokenizer.texts_to_sequences(addresses)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

    # Predict the classes for the addresses
    predictions = model.predict(padded_sequences)
    predicted_classes = predictions.argmax(axis=-1)
    predicted_labels = label_encoder.inverse_transform(predicted_classes)

    results = []

    for address, predicted_label in zip(addresses, predicted_labels):
        # Convert the predicted label to lowercase to match the regex dictionary
        predicted_label_lower = predicted_label.lower()

        # Check the address with regex patterns
        matched_label = None
        for label, pattern in regex_patterns.items():
            if re.match(pattern, address):
                matched_label = label.lower()
                break

        # Compare model prediction with regex match
        if matched_label == predicted_label_lower:
            results.append((address, predicted_label))
        else:
            results.append((address, 'unknown'))
        print(predicted_label)

    return results

# Example usage
new_addresses = [
    "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa",
    "bc1qw508d6qejxtdg4y5r3zarvary0c5xw7kygt080",
    "0x9cc6c2b37428c258a0783d29c3cc05c1cc8348a9",
    "T9K9nB5uMdX8dW5eYfH2Q5s7gE6rV1eZ2b",
    "4e8eymr4yxgbl889el2hfbvpwpvl301r40wy3e98ofp61y4mabn57vikg6okxwwk0ics16ya9ycpzwp6cs3zuz4fhpc9fwz",
    "85kay29yxusnlc8ebh71bbsl8mpvnlc42fbyza1e90b0zwbf68g67efw4m7gjbitec8yrgc23w7lifqogub1hnsoppp4161"
]

# Load the trained model
model = load_model('../models/crypto_address_classifier_epoch1_newv2.h5')

# Predict and validate
results = predict_and_validate(new_addresses, model, tokenizer, label_encoder, max_length)

# Print the results
for address, label in results:
    print(f"Address: {address}, Predicted Class: {label}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 794ms/step
bitcoin
bitcoin
ethereum
Tron
Monero
Monero
Monero
Address: 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa, Predicted Class: bitcoin
Address: bc1qw508d6qejxtdg4y5r3zarvary0c5xw7kygt080, Predicted Class: bitcoin
Address: 0x9cc6c2b37428c258a0783d29c3cc05c1cc8348a9, Predicted Class: ethereum
Address: T9K9nB5uMdX8dW5eYfH2Q5s7gE6rV1eZ2b, Predicted Class: Tron
Address: 4e8eymr4yxgbl889el2hfbvpwpvl301r40wy3e98ofp61y4mabn57vikg6okxwwk0ics16ya9ycpzwp6cs3zuz4fhpc9fwz, Predicted Class: Monero
Address: 85kay29yxusnlc8ebh71bbsl8mpvnlc42fbyza1e90b0zwbf68g67efw4m7gjbitec8yrgc23w7lifqogub1hnsoppp4161, Predicted Class: Monero
Address: 84EgZVjXKF4d1JkEhZSxm4LQQEx64AvqQEwkvWPtHEb5JMrB1Y86y1vCPSCiXsKzbfS9x8vCpx3gVgPaHCpobPYqQzANTnC, Predicted Class: unknown
