In [1]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# Relative locations of the training and test CSV splits.
train_path, test_path = '../data/train.csv', '../data/test.csv'

In [9]:
# Read the training split into a DataFrame, then display its summary
# statistics as the cell output.
train_data = pd.read_csv(filepath_or_buffer=train_path)
train_data.describe()

Unnamed: 0,text,label
count,34823,34823
unique,34772,5
top,FEARLESS FRIDAYS MEGA THREAD. Here we discuss ...,self.depression
freq,13,11940


In [10]:
# Read the test split into a DataFrame, then display its summary
# statistics as the cell output.
test_data = pd.read_csv(filepath_or_buffer=test_path)
test_data.describe()

Unnamed: 0,text,label
count,10883,10883
unique,10881,5
top,Reminiscing the past Life used to be so great ...,self.depression
freq,2,3774


In [ ]:
def clean_text(text):
    """Normalise one raw post into lowercase, whitespace-separated words.

    NOTE(review): no definition of ``clean_text`` exists in the visible
    notebook cells, so this cell raised ``NameError`` on a fresh kernel.
    This is a conservative reconstruction -- confirm it matches the
    preprocessing that was originally intended.

    Args:
        text: The raw text value of a row. Non-string values (e.g. NaN
            from missing cells) are treated as empty text.

    Returns:
        The cleaned string: lowercased, URLs removed, characters other than
        ASCII letters, digits and apostrophes replaced by spaces, and runs
        of whitespace collapsed to a single space.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Drop URLs first so their fragments do not survive as pseudo-words.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Keep apostrophes so contractions such as "don't" stay one token.
    text = re.sub(r"[^a-z0-9'\s]", ' ', text)
    return re.sub(r'\s+', ' ', text).strip()


# Clean both splits with the same function so train and test text share
# one normalisation. clean_text is idempotent, so re-running this cell
# leaves the columns unchanged.
train_data['text'] = train_data['text'].apply(clean_text)
test_data['text'] = test_data['text'].apply(clean_text)

In [ ]:
# Encode labels
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share one class -> integer mapping.
# NOTE: this overwrites the 'label' columns in place; re-running the cell
# re-fits the encoder on the already-encoded integers.
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])
test_data['label'] = label_encoder.transform(test_data['label'])

# Tokenize text
# The vocabulary is built from the training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# Convert text to integer sequences
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])

# Derive the padded length from the tokenized sequences themselves rather
# than from str.split() word counts: the Tokenizer also splits on the
# punctuation it filters, so a whitespace-based count can be smaller than
# the real sequence length and pad_sequences would then truncate tokens.
max_length = max(len(sequence) for sequence in train_sequences)

# Pad (or truncate, for longer test sequences) to a common length
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

In [ ]:
# The target is multi-class (the label column has several distinct classes),
# so the output layer needs one unit per class with a softmax, not a single
# sigmoid unit. The class count is read from the fitted label encoder.
num_classes = len(label_encoder.classes_)

model = Sequential([
    # mask_zero=True marks index 0 (the padding value) as masked so the LSTM
    # skips the trailing zeros added by post-padding.
    Embedding(vocab_size, 128, input_length=max_length, mask_zero=True),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')
])

# Labels are integer class ids (LabelEncoder output), which is what the
# sparse categorical cross-entropy loss expects.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [ ]:
# Train for 10 epochs, holding out 20% of the training rows for validation.
model.fit(
    x=train_padded,
    y=train_data['label'],
    epochs=10,
    validation_split=0.2,
)

In [ ]:
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the compiled metric (accuracy).
evaluation = model.evaluate(test_padded, test_data['label'])
loss, accuracy = evaluation
print(f'Test Loss: {loss}', f'Test Accuracy: {accuracy}', sep='\n')