In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

2024-11-11 11:18:47.156494: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-11 11:18:47.398005: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-11 11:18:47.484513: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-11 11:18:47.509901: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-11 11:18:47.713334: I tensorflow/core/platform/cpu_feature_guar

In [2]:
df = pd.read_csv('data_for_models.csv', index_col=0)
pd.set_option('display.max_columns', None)

In [4]:
# Convert feature columns to numeric, handling non-numeric values and filling NaNs with 0
X = df.drop(columns=['display_name']).apply(pd.to_numeric, errors='coerce').fillna(0).values

# Extract target labels
y = df['display_name'].values

# Encode target labels into integers for classification
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [5]:
# Split the data into training and testing sets with stratified sampling to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [None]:
# Apply SMOTE to balance the training dataset by oversampling the minority classes
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Standardize the features in the training and test sets using StandardScaler
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Convert the target labels to one-hot encoding for use in neural network models
y_train_onehot = tf.keras.utils.to_categorical(y_train_resampled)
y_test_onehot = tf.keras.utils.to_categorical(y_test)

In [None]:
# Define and train a deep neural network model on the GPU
with tf.device('/GPU:0'):
    model = Sequential([
        Dense(1024, activation='swish', input_shape=(X_train_resampled.shape[1],)),
        BatchNormalization(),
        Dropout(0.5),
        Dense(512, activation='swish'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(256, activation='swish'),
        BatchNormalization(),
        Dropout(0.1),
        Dense(132, activation='swish'),
        BatchNormalization(),
        Dropout(0.1),
        Dense(64, activation='swish'),
        BatchNormalization(),
        Dropout(0.1),
        Dense(32, activation='swish'),
        BatchNormalization(),
        Dropout(0.1),
        Dense(16, activation='swish'),
        BatchNormalization(),
        Dropout(0.1),
        Dense(y_train_onehot.shape[1], activation='softmax')  # Output layer with softmax activation for multi-class classification
    ])

# Compile the model with Adam optimizer, categorical crossentropy loss, and accuracy metric
model.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting, with patience for 10 epochs and restoring best weights
early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

# Train the model using the resampled training data and validate on the test set
history = model.fit(
    X_train_resampled, y_train_onehot,
    validation_data=(X_test, y_test_onehot),
    epochs=100,
    batch_size=512,
    verbose=1,
    callbacks=[early_stopping]
)


In [None]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

In [None]:
conf_matrix = confusion_matrix(np.argmax(y_test_onehot, axis=1), y_pred_classes)
print("Confusion Matrix:")
print(conf_matrix)

In [9]:
classification_report_text = classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_)

In [10]:
print(classification_report_text)

              precision    recall  f1-score   support

        BS 1       0.63      0.67      0.65      7976
        BS 2       0.46      0.30      0.36      4135
        BS 3       0.52      0.62      0.56      3004
        BS 4       0.46      0.43      0.44      2214
        BS 5       0.11      0.40      0.17        45
        BS 6       0.58      0.69      0.63       414
        BS 8       0.63      0.55      0.58      2043
     BS Mini       0.75      0.63      0.69      2615
        Groß       0.58      0.57      0.57      4869
       Klein       0.62      0.80      0.70      3316
    Original       0.33      0.40      0.36         5

    accuracy                           0.59     30636
   macro avg       0.51      0.55      0.52     30636
weighted avg       0.58      0.59      0.58     30636

