<a href="https://colab.research.google.com/github/2303A51856/NLP/blob/main/LAB_7_1856.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK stop-word corpus and keep the English list as a set so the
# membership test in the preprocessing step is a hash lookup.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


# Load the CSV from the Colab session path.
df = pd.read_csv("/content/depression_dataset_reddit_cleaned.csv")

print(df.head())
print(df.columns)


# Fill missing posts with an empty string *before* casting: astype(str) on its
# own converts NaN into the literal text "nan", which the later tokenizers
# would treat as a real word.
texts = df['clean_text'].fillna('').astype(str)
labels = df['is_depression']

                                          clean_text  is_depression
0  we understand that most people who reply immed...              1
1  welcome to r depression s check in post a plac...              1
2  anyone else instead of sleeping more when depr...              1
3  i ve kind of stuffed around a lot in my life d...              1
4  sleep is my greatest and most comforting escap...              1
Index(['clean_text', 'is_depression'], dtype='object')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def preprocess_text(text, stopword_set=None):
    """Normalize one post: lowercase, strip URLs and punctuation, drop stop words.

    Args:
        text: The post to clean (a ``str``).
        stopword_set: Optional collection of words to discard. When ``None``
            the notebook-level ``stop_words`` set is used, so existing
            one-argument calls behave as before.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    text = text.lower()
    # The dot after "www" is escaped and anchored on a word boundary so only
    # genuine "www." prefixes are removed. The unescaped pattern also deleted
    # ordinary words that merely contain "www" (e.g. "awwwsome").
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    # Delete every ASCII punctuation character (no replacement character).
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = [word for word in text.split() if word not in stopword_set]
    return " ".join(tokens)

# Run the cleaner over every post and write the result back onto the frame.
df['clean_text'] = texts.map(preprocess_text)

In [7]:
# 80/20 hold-out split, stratified on the label, with a fixed seed.
_split_options = dict(test_size=0.2, random_state=42, stratify=labels)
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], labels, **_split_options
)


In [8]:

def _tfidf_features(max_ngram):
    """Fit a TF-IDF vectorizer on ``X_train`` for n-gram range (1, max_ngram).

    The vocabulary is capped at 5000 features. Returns the fitted vectorizer
    followed by the transformed train and test matrices.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, max_ngram), max_features=5000)
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)
    return vectorizer, train_matrix, test_matrix


# Unigrams only, then up to bigrams, then up to trigrams.
tfidf_uni, X_train_uni, X_test_uni = _tfidf_features(1)
tfidf_bi, X_train_bi, X_test_bi = _tfidf_features(2)
tfidf_tri, X_train_tri, X_test_tri = _tfidf_features(3)


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def build_ann(input_dim):
    """Build and compile a feed-forward binary classifier.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled ``Sequential`` model: Dense(256, relu) -> Dropout(0.3) ->
        Dense(128, relu) -> Dropout(0.3) -> Dense(1, sigmoid), using the Adam
        optimizer, binary cross-entropy loss and the accuracy metric.
    """
    # Imported locally because the cell's import line only brings in
    # Dense and Dropout.
    from tensorflow.keras.layers import Input

    model = Sequential()
    # Declare the input shape with an explicit Input layer instead of passing
    # `input_dim` to the first Dense layer, which Keras reports with a
    # UserWarning ("Do not pass an `input_shape`/`input_dim` argument to a
    # layer") when used inside a Sequential model.
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


def _fit_ann(train_matrix, test_matrix):
    """Build an ANN sized to ``train_matrix`` and train it for 5 epochs.

    Returns the trained model together with the value returned by ``fit``.
    """
    network = build_ann(train_matrix.shape[1])
    # NOTE(review): the held-out test split doubles as validation data here.
    training_log = network.fit(
        train_matrix,
        y_train,
        validation_data=(test_matrix, y_test),
        epochs=5,
        batch_size=64,
        verbose=1,
    )
    return network, training_log


# One model per TF-IDF feature set, trained in the order uni -> bi -> tri.
ann_uni, history_uni = _fit_ann(X_train_uni, X_test_uni)
ann_bi, history_bi = _fit_ann(X_train_bi, X_test_bi)
ann_tri, history_tri = _fit_ann(X_train_tri, X_test_tri)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step - accuracy: 0.8153 - loss: 0.5122 - val_accuracy: 0.9657 - val_loss: 0.1056
Epoch 2/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9752 - loss: 0.0749 - val_accuracy: 0.9599 - val_loss: 0.1109
Epoch 3/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9896 - loss: 0.0334 - val_accuracy: 0.9606 - val_loss: 0.1194
Epoch 4/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.9970 - loss: 0.0174 - val_accuracy: 0.9560 - val_loss: 0.1367
Epoch 5/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.9975 - loss: 0.0122 - val_accuracy: 0.9528 - val_loss: 0.1567
Epoch 1/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.7241 - loss: 0.5168 - val_accuracy: 0.9651 - val_loss: 0.1054
Epoch 2/5
[1m97/97[0m [32m━━━━━━━━━━━

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM

# Vocabulary cap for the tokenizer and fixed sequence length for padding.
max_words, max_len = 10000, 100

# The word index is fitted on the training posts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Encode both splits as integer sequences, then pad each to max_len.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(posts) for posts in (X_train, X_test)
)
X_train_pad, X_test_pad = (
    pad_sequences(encoded, maxlen=max_len) for encoded in (X_train_seq, X_test_seq)
)

def build_lstm(vocab_size=None, seq_len=None):
    """Build and compile an Embedding -> LSTM -> sigmoid binary classifier.

    Args:
        vocab_size: Size of the embedding vocabulary. Defaults to the
            notebook-level ``max_words`` when ``None``.
        seq_len: Length of each padded input sequence. Defaults to the
            notebook-level ``max_len`` when ``None``.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer, binary
        cross-entropy loss and the accuracy metric.
    """
    # Imported locally because the cell's import line only brings in
    # Embedding and LSTM.
    from tensorflow.keras.layers import Input

    if vocab_size is None:
        vocab_size = max_words
    if seq_len is None:
        seq_len = max_len

    model = Sequential()
    # The sequence length is declared on an Input layer; the `input_length`
    # argument of Embedding is deprecated in Keras 3 and only emits a warning.
    model.add(Input(shape=(seq_len,)))
    model.add(Embedding(vocab_size, 128))
    # NOTE(review): per the Keras LSTM docs a non-zero recurrent_dropout rules
    # out the cuDNN fast path — confirm whether training speed matters here.
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train the LSTM on the padded sequences, validating against the test split.
lstm_model = build_lstm()
_lstm_fit_options = dict(
    validation_data=(X_test_pad, y_test),
    epochs=5,
    batch_size=64,
    verbose=1,
)
history_lstm = lstm_model.fit(X_train_pad, y_train, **_lstm_fit_options)


Epoch 1/5




[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 314ms/step - accuracy: 0.8505 - loss: 0.4302 - val_accuracy: 0.9683 - val_loss: 0.0940
Epoch 2/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 308ms/step - accuracy: 0.9740 - loss: 0.0764 - val_accuracy: 0.9735 - val_loss: 0.0794
Epoch 3/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 335ms/step - accuracy: 0.9841 - loss: 0.0479 - val_accuracy: 0.9670 - val_loss: 0.0902
Epoch 4/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 333ms/step - accuracy: 0.9879 - loss: 0.0344 - val_accuracy: 0.9638 - val_loss: 0.0911
Epoch 5/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 327ms/step - accuracy: 0.9924 - loss: 0.0210 - val_accuracy: 0.9722 - val_loss: 0.0930


In [11]:

# Each entry pairs a printed label with a trained model and the test features
# it expects; evaluate() is printed as returned (loss first, then accuracy).
_evaluation_runs = (
    ("ANN Unigram:", ann_uni, X_test_uni),
    ("ANN Bigram :", ann_bi, X_test_bi),
    ("ANN Trigram:", ann_tri, X_test_tri),
    ("LSTM:", lstm_model, X_test_pad),
)
for _label, _model, _features in _evaluation_runs:
    print(_label, _model.evaluate(_features, y_test, verbose=0))


ANN Unigram: [0.15670284628868103, 0.9528118968009949]
ANN Bigram : [0.1579189896583557, 0.9521654844284058]
ANN Trigram: [0.16744530200958252, 0.9469941854476929]
LSTM: [0.09299895912408829, 0.9722042679786682]
