In [1]:
pip install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.16.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow)
  Using cached flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Using cached gast-0.5.4-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow)
  Using cached h5py-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.5 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadat

In [6]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
import hashlib
from collections import Counter

# Dictionary to store device hashes
device_hashes = {}

# Function to generate a SHA-256 hash for the given device label
def generate_hash(device_label):
    if device_label not in device_hashes:
        hashed_value = hashlib.sha256(str(device_label).encode()).hexdigest()
        device_hashes[device_label] = hashed_value
    return device_hashes[device_label]

# Function to read and preprocess data from a file
def read_and_preprocess_data(file_path, has_labels=True):
    X = []
    y = []
    encoding = 'utf-8-sig' if not has_labels else 'utf-8'  # Use 'utf-8-sig' for new_data if necessary
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            parts = line.strip().split(' ')
            features = np.array([int(bit) for bit in parts[0]], dtype=int)
            X.append(features)
            if has_labels and len(parts) > 1:
                y.append(int(parts[1]))
    X = np.array(X)
    if has_labels:
        y = np.array(y)
        num_classes = np.unique(y).size
        y_categorical = keras.utils.to_categorical(y - 1, num_classes=num_classes)
        return X, y_categorical, num_classes
    return X, None, None

# Function to create the model
def create_model(input_shape, num_classes, learning_rate, dropout_rate):
    model = keras.Sequential([
        keras.Input(shape=(input_shape,)),
        layers.Reshape((input_shape, 1)),  # Needed for Conv1D
        layers.Conv1D(32, kernel_size=3, activation='relu', padding='same'),
        layers.Dropout(dropout_rate),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_classes, activation='softmax')
    ])
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Training the model
print("Training the model...")
X_train, y_train_categorical, num_classes = read_and_preprocess_data('DoraHack/training_data.txt', has_labels=True)
input_shape = X_train.shape[1]
best_accuracy = 0
best_params = {}


learning_rates = [0.01, 0.001, 0.0001]  # Different learning rates to try
dropout_rates = [0.3, 0.5, 0.7]  # Different dropout rates to try

for lr in [0.01, 0.001, 0.0001]:
    for dr in [0.3, 0.5, 0.7] :
        model = create_model(input_shape, num_classes, lr, dr)
        history = model.fit(X_train, y_train_categorical, epochs=10, validation_split=0.1, verbose=0)
        val_accuracy = max(history.history['val_accuracy'])
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_params = {'learning_rate': lr, 'dropout_rate': dr}
            model.save('best_model.h5')

print("Best validation accuracy:", best_accuracy)
print("Best parameters:", best_params)

Training the model...




Best validation accuracy: 0.8633333444595337
Best parameters: {'learning_rate': 0.001, 'dropout_rate': 0.5}


In [9]:
# Predicting new data
print("Predicting new data...")
model = keras.models.load_model('DoraHack/bestmodel.h5')
X_new, _, _ = read_and_preprocess_data('DoraHack/test_1_simulator.txt', has_labels=False)
predictions = model.predict(X_new)
predicted_labels = np.argmax(predictions, axis=1) + 1  # Labels are 1-based

# Count the occurrences of each label and calculate percentages
total_samples = len(predicted_labels)
label_counts = Counter(predicted_labels)
print("Label counts and percentages:")
for label, count in label_counts.items():
    percentage = (count / total_samples) * 100
    hash_address = generate_hash(str(label))
    print(f"Label {label} appears {count} times ({percentage:.2f}%), Blockchain Address: {hash_address}")



Predicting new data...
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Label counts and percentages:
Label 3 appears 4338 times (54.23%), Blockchain Address: 4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce
Label 1 appears 3627 times (45.34%), Blockchain Address: 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b
Label 2 appears 35 times (0.44%), Blockchain Address: d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35
