# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

# Load the training table; the code below reads its "text" and "class" columns.
data = pd.read_csv("data/train_dl.csv")

# Encoder used further down to turn the raw "class" labels into integer ids.
label_encoder = LabelEncoder()

def preprocess_text(text):
    """Return *text* as a string that is safe to hand to the tokenizer.

    Strings are returned unchanged. Missing values (``None`` / ``NaN``, which
    ``pd.read_csv`` produces for empty cells) become the empty string, and any
    other non-string scalar is converted with ``str``. Without this, a single
    empty cell reaches the tokenizer as a float and raises there.

    Args:
        text: One cell of the ``text`` column.

    Returns:
        str: The document text; never a missing-value marker.
    """
    if isinstance(text, str):
        return text
    if pd.isna(text):
        return ""
    return str(text)

# Apply the text preprocessing hook, then map class labels to integer ids.
data["text"] = data["text"].apply(preprocess_text)
data["class"] = label_encoder.fit_transform(data["class"])

# Size the classifier from the labels actually present rather than assuming 3,
# so the same code works when the dataset has a different number of classes.
num_classes = len(label_encoder.classes_)

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"], data["class"], test_size=0.2, random_state=42
)

# The vocabulary is learned from the training text only, so words that appear
# only in the test set are dropped when the test text is converted.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Force every sequence to the same length: shorter ones get zeros appended
# (padding='post'); longer ones are cut down by pad_sequences' default
# truncation, which removes tokens from the front.
max_sequence_length = 100
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

model = tf.keras.Sequential([
    # +1 because tokenizer indices start at 1; index 0 is the padding value.
    # mask_zero=True makes the LSTM skip the trailing padding, so its final
    # state comes from the last real token instead of a run of zeros.
    # `input_length` is intentionally not passed: it is deprecated in Keras 3
    # and the layer infers the length from the data.
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, mask_zero=True),
    LSTM(64),
    # One softmax output per encoded class.
    Dense(num_classes, activation='softmax')
])

# categorical_crossentropy expects one-hot targets, hence to_categorical below.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
# validation_split carves a further 10% off the training data for per-epoch
# monitoring; the held-out test set is not touched until evaluate().
model.fit(X_train_padded, y_train_encoded, epochs=10, batch_size=32, validation_split=0.1)

y_test_encoded = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)
# evaluate() returns [loss, accuracy] for the metrics configured in compile().
accuracy = model.evaluate(X_test_padded, y_test_encoded)[1]
print(f"Model accuracy: {accuracy * 100:.2f}%")