In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GlobalAveragePooling1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

# Read the training and evaluation splits from CSV files.
train_file_path = 'train.csv'  # Replace with your actual file path
test_file_path = 'test.csv'    # Replace with your actual file path

train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

# Print the column names so the text/label columns used below can be verified.
print(train_data.columns)

# Preprocess data
# The model reads the article text in 'Description' and predicts the category
# stored in 'Class Index'. 'Title' is free text that is (almost) unique per
# row; using it as the target creates roughly one class per article, which is
# why training previously sat at ~0% accuracy.
# Text is coerced to str so a missing description cannot crash the tokenizer.
X_train = train_data['Description'].fillna('').astype(str).values
X_test = test_data['Description'].fillna('').astype(str).values
# Labels are kept as strings so label_encoder.classes_ can be used directly as
# display names (e.g. in a classification report).
y_train = train_data['Class Index'].astype(str).values
y_test = test_data['Class Index'].astype(str).values

# Tokenize the descriptions
# Only the 20000 most frequent words get their own index; everything else is
# mapped to the "<OOV>" token. The vocabulary is learned from training text only.
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences
# Every sequence is cut or zero-padded at the end to exactly max_len tokens.
max_len = 100
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Encode labels
# Fit on training labels only: a label that appears solely in the test split
# makes transform() raise ValueError instead of silently adding a class the
# model was never trained on.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
num_classes = len(label_encoder.classes_)

y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Create the initial word embeddings.
# NOTE: these are random vectors, not pre-trained word2vec weights; they are
# learned from scratch because the Embedding layer below is trainable.
embedding_dim = 100
# The tokenizer never emits an index >= num_words (rarer words become the OOV
# index), so the embedding table only needs that many rows. Index 0 is the
# padding value, hence the "+ 1" when no cap is configured.
vocab_size = len(word_index) + 1
if tokenizer.num_words is not None:
    vocab_size = min(vocab_size, tokenizer.num_words)
embedding_matrix = np.random.uniform(-1, 1, (vocab_size, embedding_dim))

# Build the model
# Embedding -> BiLSTM over the whole sequence -> mean over time steps ->
# dense classifier with one softmax output per class.
# The deprecated `input_length` argument is omitted; the input length is
# inferred from the data on the first call.
model = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')  # Use num_classes instead of a fixed number
])

# Compile the model
# Labels are integer class ids (not one-hot), hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split is also used as validation data here, so the
# reported test accuracy is not a fully held-out estimate if these validation
# curves are used to tune the model. Consider carving a validation set out of
# the training data instead.
history = model.fit(
    X_train_padded,
    y_train_encoded,
    validation_data=(X_test_padded, y_test_encoded),
    epochs=10,
    batch_size=32,
)

# Evaluate the model
train_loss, train_acc = model.evaluate(X_train_padded, y_train_encoded, verbose=0)
test_loss, test_acc = model.evaluate(X_test_padded, y_test_encoded, verbose=0)

print(f"Training Accuracy: {train_acc}")
print(f"Test Accuracy: {test_acc}")

# Generate a classification report
y_pred = np.argmax(model.predict(X_test_padded), axis=-1)
# Pass the full list of encoded class ids explicitly: without `labels`, the
# report infers classes from y_test/y_pred and raises ValueError when that set
# is smaller than `target_names`. Names are stringified because the report
# measures their length for column alignment.
class_ids = np.arange(len(label_encoder.classes_))
class_names = [str(name) for name in label_encoder.classes_]
print(classification_report(
    y_test_encoded,
    y_pred,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,
))


Index(['Class Index', 'Title', 'Description'], dtype='object')




Epoch 1/10
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1379s[0m 367ms/step - accuracy: 1.5836e-04 - loss: 11.8830 - val_accuracy: 0.0011 - val_loss: 12.7754
Epoch 2/10
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1413s[0m 370ms/step - accuracy: 8.9820e-04 - loss: 11.7671 - val_accuracy: 6.5789e-04 - val_loss: 14.2474
Epoch 3/10
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1380s[0m 364ms/step - accuracy: 9.9229e-04 - loss: 11.5053 - val_accuracy: 0.0013 - val_loss: 15.5045
Epoch 4/10
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1360s[0m 363ms/step - accuracy: 0.0024 - loss: 10.5495 - val_accuracy: 0.0016 - val_loss: 16.0634
Epoch 5/10
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1473s[0m 393ms/step - accuracy: 0.0146 - loss: 8.4196 - val_accuracy: 0.0041 - val_loss: 17.9263
Epoch 6/10
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1425s[0m 367ms/step - accuracy: 0.1645 - loss: 5.2417 - v