In [None]:
!pip install spacy langdetect wordcloud --no-deps


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import nltk
import re
import string
import wordcloud
from langdetect import detect
from collections import Counter
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import words

## EDA

### Read the Data

In [None]:
# Path of the dataset CSV (located under /kaggle/input).
data_path = '/kaggle/input/suicide-watch/Suicide_Detection.csv'
# Read the CSV into `df`, the DataFrame every later cell reads and rewrites.
df = pd.read_csv(data_path)

In [None]:
df.head(20)

### Data Summary

In [None]:
# Print a heading followed by the descriptive statistics of `df`
# (one print call; `sep` supplies the line break between the two parts).
print("\nSummary Statistics:", df.describe(), sep="\n")

### Check for missing values

In [None]:
# Print a heading followed by the per-column count of null entries in `df`.
print("\nMissing Values:", df.isnull().sum(), sep="\n")

### Count the class distribution

In [None]:
df['class'].value_counts()

In [None]:
# Bar chart of how many rows carry each label; skipped when `df` has no
# 'class' column.
if 'class' in df.columns:
    label_ax = sns.countplot(data=df, x='class')
    label_ax.set_title('Distribution of Labels')
    plt.show()

### Check for duplicate rows

In [None]:
# Count rows that are exact duplicates of an earlier row, then report it.
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print("Number of duplicate rows: {}".format(duplicate_count))


### Display word cloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Concatenate every non-null post into one string for the word cloud.
all_text = ' '.join(df['text'].dropna().astype(str))

# Named `text_cloud` rather than `wordcloud`: the notebook's import cell does
# `import wordcloud`, and rebinding that name to a WordCloud instance would
# shadow the module for every later cell.
text_cloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis',
).generate(all_text)

# Render the generated cloud as an image without axes.
plt.figure(figsize=(15, 7))
plt.imshow(text_cloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Text Data')
plt.show()


## Data Preprocessing

### Lowercase all the text

In [None]:
# Lower-case every post in place; the contraction table used later has
# lower-case keys only, so this must run before contractions are expanded.
df['text'] = df['text'].str.lower()

In [None]:
df.head(20)

### Replace "’" with "'"

In [None]:
# Normalise the typographic apostrophe (’) to the ASCII one (') so that the
# contraction keys, which use the ASCII form, can match.
# regex=False makes this a literal replacement regardless of the pandas
# version's default for the `regex` argument.
df['text'] = df['text'].str.replace("’", "'", regex=False)

In [None]:
df.head(20)

### Replace the abbreviations

In [None]:
# Lookup table mapping lower-case contractions and chat abbreviations to their
# expanded form. Entry order is kept exactly as before because the pattern
# built from `abb.keys()` tries alternatives in this order.
# Fixed: "you'll" / "you'll've" previously expanded to "you you will ...".
abb = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "dont": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "idk": "i do not know",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "i'd": "i would",
  "i'd've": "i would have",
  "i'll": "i will",
  "i'll've": "i will have",
  "i'm": "i am",
  "im": "i am",
  "i've": "i have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you will",
  "you'll've": "you will have",
  "you're": "you are",
  "you've": "you have"
}



In [None]:
# Build one alternation over all contraction keys.
#  * keys are sorted longest-first so "can't've" is tried before its prefix
#    "can't" (regex alternation takes the first alternative that matches);
#  * re.escape keeps every key literal;
#  * the (?<!\w) / (?!\w) lookarounds require the key to be a whole token, so
#    short keys such as "im" no longer fire inside "time" or "him".
#    Lookarounds are used instead of \b because the key "'cause" starts with a
#    non-word character.
abb_re = re.compile(
    r"(?<!\w)(%s)(?!\w)"
    % '|'.join(re.escape(key) for key in sorted(abb, key=len, reverse=True))
)

def expandContractions(text, abb_re=abb_re):
    """Expand every contraction/abbreviation found in ``text``.

    Args:
        text: The (already lower-cased) string to process.
        abb_re: Compiled pattern whose matches are keys of ``abb``.

    Returns:
        ``text`` with each match replaced by its ``abb`` expansion.
    """
    def replace(match):
        # The lookarounds are zero-width, so group(0) is exactly the key.
        return abb[match.group(0)]
    return abb_re.sub(replace, text)

df['text'] = df['text'].apply(expandContractions)

In [None]:
df.head(20)

### Tokenization

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
# Tokenizer resources for word_tokenize. Older NLTK releases use 'punkt';
# newer ones look up 'punkt_tab' instead. Requesting both keeps the cell
# working on either: nltk.download() reports an unknown resource without
# raising.
nltk.download('punkt')
nltk.download('punkt_tab')

def tokenize_text(text):
    """Split one post into a list of word/punctuation tokens using NLTK.

    Args:
        text: The string to tokenize.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    return word_tokenize(text)

# From here on, df['text'] holds lists of tokens instead of strings.
df['text'] = df['text'].apply(tokenize_text)

In [None]:
df.head(20)

### Word Segmentation

In [None]:
!pip install wordninja

In [None]:
import pandas as pd
import wordninja
import re

# Apply word segmentation to the 'text' column in the DataFrame.
# Each row's tokens are joined back into one space-separated string and the
# whole string is re-split by wordninja, so the token list is rebuilt here.
# NOTE(review): wordninja presumably also drops non-alphanumeric characters
# while splitting — confirm, since later punctuation/URL steps depend on it.
df['text'] = df['text'].apply(lambda tokens: wordninja.split(" ".join(tokens)))

### Stopwords Removal

In [None]:
from nltk.corpus import stopwords

# Fetch NLTK's stopword corpus and keep the English entries in a set for
# constant-time membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return ``tokens`` without English stopwords.

    The comparison is done on the lower-cased token; surviving tokens are
    returned unchanged and in their original order.
    """
    kept = []
    for word in tokens:
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return kept

# Filter every row's token list.
df['text'] = df['text'].apply(remove_stopwords)



In [None]:
df.head(10)

### Punctuation and Digit Removal

In [None]:
import string

# Function to remove punctuation and digits
def remove_punctuation(tokens):
    """Drop tokens that are purely punctuation or purely digits.

    A token is removed when every character in it is an ASCII punctuation
    mark (so multi-character tokens such as ``...`` or ``''`` are removed
    too, not only substrings of ``string.punctuation``), when it is empty,
    or when ``str.isdigit()`` is true for it. Tokens that merely contain
    punctuation or digits (``don't``, ``a1``) are kept unchanged.

    Args:
        tokens: List of string tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    punctuation_chars = set(string.punctuation)
    return [
        word for word in tokens
        if not word.isdigit()
        # all() over an empty token is True, so '' is dropped as well.
        and not all(char in punctuation_chars for char in word)
    ]

# Apply the function to remove punctuation and digits from the 'text' column
df['text'] = df['text'].apply(remove_punctuation)

# Display the first few rows of the 'text' column after removal
print(df['text'].head())


In [None]:
df.head(20)

### Special Characters Removal

In [None]:
def remove_special_characters(tokens):
    """Strip everything except ASCII letters, digits and whitespace.

    Each token is cleaned independently; a token made up only of special
    characters becomes an empty string and is still returned.
    """
    cleaned = []
    for token in tokens:
        cleaned.append(re.sub(r'[^a-zA-Z0-9\s]', '', token))
    return cleaned

# Apply the function to remove special characters
df['text'] = df['text'].apply(remove_special_characters)

In [None]:
df.head(10)

### Extra Whitespace Removal

In [None]:
# Function to remove extra whitespaces
def remove_extra_whitespaces(tokens):
    """Trim surrounding whitespace from each token and drop blank tokens."""
    trimmed = (token.strip() for token in tokens)
    # An empty string is falsy, so this keeps exactly the non-blank tokens.
    return [token for token in trimmed if token]

df['text'] = df['text'].apply(remove_extra_whitespaces)


In [None]:
df.head(20)

### Emoji, email, and URL removal

In [None]:
import re

# Function to remove URLs 
def remove_url(tokens):
    """Strip URL-like text from every token and drop tokens left empty.

    Anything from ``http`` up to the next whitespace is removed from each
    token. Tokens that become (or already were) empty are discarded so no
    blank entries remain in the list.

    NOTE(review): this runs after the special-character step in this
    notebook, which already strips ':' and '/', so intact URLs rarely reach
    this point — consider running it on the raw text instead.

    Args:
        tokens: List of string tokens.

    Returns:
        A new list of cleaned, non-empty tokens.
    """
    stripped = (re.sub(r'http\S+', '', token) for token in tokens)
    return [token for token in stripped if token]

# Function to remove email addresses 
def remove_mail(tokens):
    """Strip e-mail-like text from every token and drop tokens left empty.

    Any run of non-whitespace characters containing ``@`` with at least one
    character on each side is removed. Tokens that become (or already were)
    empty are discarded so no blank entries remain in the list.

    Args:
        tokens: List of string tokens.

    Returns:
        A new list of cleaned, non-empty tokens.
    """
    stripped = (re.sub(r'\S+@\S+', '', token) for token in tokens)
    return [token for token in stripped if token]

# Function to remove emojis 
def remove_emoji(tokens):
    """Strip emoji code points from every token and drop tokens left empty.

    The character class is the original set of ranges plus U+FE00-U+FE0F,
    the variation selectors that accompany emoji. The original listed that
    range as U+1FE00-U+1FE0F; it is kept as well so nothing that was
    removed before is now retained.

    Args:
        tokens: List of string tokens.

    Returns:
        A new list of cleaned, non-empty tokens.
    """
    emoji_pattern = (
        r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF'
        r'\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF'
        r'\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF'
        r'\U0001FB00-\U0001FBFF\U0001FE00-\U0001FE0F\uFE00-\uFE0F\U0001F004]+'
    )
    stripped = (re.sub(emoji_pattern, '', token) for token in tokens)
    return [token for token in stripped if token]

df['text'] = df['text'].apply(remove_url)
df['text'] = df['text'].apply(remove_mail)
df['text'] = df['text'].apply(remove_emoji)

In [None]:
df.head(20)

### Text Lemmatization

In [None]:
# Load the spaCy English model.
# Only token lemmas are read below, so the dependency parser and the named
# entity recogniser are switched off; the components the lemmatizer relies on
# stay enabled, and every row is processed faster.
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatize_tokens(tokens):
    """Lemmatize one row's tokens with spaCy.

    The tokens are joined into a single space-separated string, run through
    ``nlp``, and the lemma of every resulting spaCy token is returned. The
    output is based on spaCy's own tokenization of the joined string, so its
    length can differ from ``len(tokens)``.

    Args:
        tokens: List of string tokens.

    Returns:
        List of lemma strings.
    """
    doc = nlp(' '.join(tokens))
    return [token.lemma_ for token in doc]

# Apply lemmatization to every row of the 'text' column.
df['text'] = df['text'].apply(lemmatize_tokens)

In [None]:
df.head(20)

### Remove non-English Words

We keep a few words that may not appear in NLTK's English word list but that we consider important features for the model.

In [None]:
from nltk.corpus import words
import nltk

# Fetch NLTK's English word list (no-op when it is already present) and turn
# it into a set for fast lookups.
nltk.download('words')
english_words = set(words.words())

# Domain terms that must survive the filter even though they may be missing
# from the NLTK word list.
words_to_exclude =  {
    'fuck','suicidal', 'depressed', 'anxiety', 'selfharm', 'overdose', 'hopeless',
    'cutting', 'lifeless', 'worthless', 'painful', 'enditall', 'sadness',
    'goodbye', 'helpme', 'hurtmyself', 'numb', 'alone', 'darkness', 'dying',
    'unloved', 'lost', 'killmyself', 'hanging', 'drugs', 'triggered',
    'relapse', 'cut', 'sh', 'kms', 'plshelp', 'wanttodie', 'tiredoflife'
}

def remove_non_english(tokens):
    """Keep only tokens found in ``english_words`` or ``words_to_exclude``.

    Matching is exact (case-sensitive); empty tokens are never kept. Order of
    the surviving tokens is preserved.
    """
    kept = []
    for token in tokens:
        if token == '':
            continue
        if token in english_words or token in words_to_exclude:
            kept.append(token)
    return kept

# Filter every row's token list.
df['text'] = df['text'].apply(remove_non_english)

In [None]:
df.head(20)

## Modelling

### Tokenization with Keras

In [None]:
pip install gensim --no-deps


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gensim.downloader as api
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, Dropout, Dense
from keras.layers import GRU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional, Input

# --- Configuration -----------------------------------------------------------
EMBEDDING_DIM = 100         # size of each word-embedding vector
MAX_NUM_WORDS = 10000       # vocabulary cap passed to the Keras Tokenizer
MAX_SEQUENCE_LENGTH = 200   # every sequence is padded to this length

# Each row currently holds a list of tokens; the Keras Tokenizer works on
# strings, so glue the tokens back together with single spaces.
# NOTE: re-running this cell would apply " ".join to the already-joined
# strings and space out their characters.
df['text'] = df['text'].apply(" ".join)

# Learn the vocabulary from the corpus (lower-casing, capped at MAX_NUM_WORDS).
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True)
tokenizer.fit_on_texts(df['text'])

# Encode each text as a list of word indices, then pad with zeros at the end
# so that all rows share the same length.
X = pad_sequences(
    tokenizer.texts_to_sequences(df['text']),
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)


In [None]:
df.head(20)

### Convert label to one-hot encoding

In [None]:
# One indicator column per distinct value of df['class'], as a NumPy matrix.
y = pd.get_dummies(df['class']).to_numpy()

# Hold out 30% of the rows for testing (70% remain for training);
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)



### Word Embedding

In [None]:
# Download (on first use) and load the pre-trained 100-dimensional GloVe vectors.
glove_gensim = api.load('glove-wiki-gigaword-100')

# Embedding weights, one row per tokenizer index. Rows start as zeros, so
# index 0 (padding) and any word missing from GloVe keep an all-zero vector.
gensim_weight_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))

# Copy the GloVe vector of every in-vocabulary word into its row.
# Membership is tested against the `key_to_index` dict (constant time)
# rather than the `index_to_key` list, which was a linear scan of the whole
# GloVe vocabulary for each tokenizer word.
for word, index in tokenizer.word_index.items():
    # Only indices below MAX_NUM_WORDS have a row in the matrix.
    if index < MAX_NUM_WORDS and word in glove_gensim.key_to_index:
        gensim_weight_matrix[index] = glove_gensim[word]


### Model Building

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Input, BatchNormalization

# Build the model: frozen GloVe embedding -> three stacked GRUs -> dense head.
model = Sequential()

layer_stack = [
    # One integer sequence per sample, as wide as the padded input matrix.
    Input(shape=(X.shape[1],)),
    # Embedding initialised from the GloVe weight matrix and kept frozen.
    Embedding(input_dim=MAX_NUM_WORDS,
              output_dim=EMBEDDING_DIM,
              weights=[gensim_weight_matrix],
              trainable=False),
    Dropout(0.2),
    # The first two GRUs return the full sequence so the next GRU can consume it.
    GRU(100, return_sequences=True),
    Dropout(0.2),
    GRU(100, return_sequences=True),
    Dropout(0.2),
    # The last GRU returns only its final state.
    GRU(100, return_sequences=False),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    # One sigmoid unit per one-hot label column.
    Dense(y.shape[1], activation='sigmoid'),
]
for layer in layer_stack:
    model.add(layer)

# Compile the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# EarlyStopping: stop after 5 epochs without val_loss improvement.
# restore_best_weights=True rolls the model back to the best epoch when
# training stops early; without it the in-memory model that is evaluated
# afterwards keeps the weights of the last (worse) epoch.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5,
                   restore_best_weights=True)
# ModelCheckpoint: write the model with the best validation accuracy to disk.
mc = ModelCheckpoint('./best_model.keras', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

# Display model summary
model.summary()


### Train the Model

In [None]:
# Train on the training split, holding out 20% of it for validation; the
# early-stopping and checkpoint callbacks run after every epoch.
fit_options = dict(
    validation_split=0.2,
    epochs=20,
    batch_size=64,
    callbacks=[es, mc],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

# Score the trained model on the held-out test split.
test_metrics = model.evaluate(X_test, y_test, verbose=1)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

### Model Evaluation

In [None]:
import matplotlib.pyplot as plt

# Side-by-side training curves: accuracy on the left, loss on the right.
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

# (axes, train key, validation key, train label, validation label, title, y label)
panels = [
    (accuracy_ax, 'accuracy', 'val_accuracy',
     'Train Accuracy', 'Validation Accuracy', 'Accuracy per Epoch', 'Accuracy'),
    (loss_ax, 'loss', 'val_loss',
     'Train Loss', 'Validation Loss', 'Loss per Epoch', 'Loss'),
]

for ax, train_key, val_key, train_label, val_label, title, y_label in panels:
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend()

# Tighten the spacing between the two panels and render the figure.
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import classification_report

# Get model predictions (one score per label column for every test row).
y_pred = model.predict(X_test)

# Convert predictions and one-hot labels to integer class indices.
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

# Class index i corresponds to the i-th one-hot column, and the labels were
# encoded with pd.get_dummies(df['class']). Take the display names from the
# same encoding: df['class'].unique() returns values in order of first
# appearance, which need not match the column order and would attach the
# wrong name to each row of the report.
class_names = [str(name) for name in pd.get_dummies(df['class']).columns]

# Generate the classification report. `labels` pins the index -> name mapping
# even if some class never occurs in the test split or the predictions.
report = classification_report(
    y_test_labels,
    y_pred_labels,
    labels=list(range(len(class_names))),
    target_names=class_names,
)

# Print the classification report
print(report)
