# Define the locations of the data set and of the GloVe embeddings

In [1]:
# The folder of the data set (note the trailing '/')
dataset_path = '../data/raw/suicidal_detection/'
# The folder for the models
model_path = '../models'
# The folder where GloVe is installed; passed to GloVe(cache=...) further down
TORCHNLP_CACHEDIR = f'{model_path}/Glove/pytorch-nlp_data'

# Define the Global Variables

In [None]:
# Training-mode switch; the code that reads it is outside this section
isTrain = True
# Random seed value (presumably used to seed the RNGs -- confirm where it is consumed)
seed = 2
# NOTE(review): dataset_path already ends with '/', so the four paths below contain '//'
# NOTE(review): the 'dialogues_*' file names and the 'dailyDialog' folder do not
# match the 'suicidal_detection' data set folder -- confirm they are intended
# Raw training file
data_path = f"{dataset_path}/train/dialogues_train.txt"
# Pickle files for the preprocessed train / validation / test splits
train_preprocess = f"{dataset_path}/train/dialogues_train_preprocess.pkl"
dev_preprocess = f"{dataset_path}/validation/dialogues_validation_preprocess.pkl"
test_preprocess = f"{dataset_path}/test/dialogues_test_preprocess.pkl"
# File the model is saved to
model_save_path = f"{model_path}/dailyDialog/model_test_new_code.pt"
# Hyper-parameters (consumed by code outside this section)
batch_size = 16
embedding_size = 300
lstm_hidden_size = 500
hidden_layer_size = 512
learning_rate = 0.001
epochs = 3
# NOTE(review): forward_label_mapping in this file defines only two labels
# ('s' and 'u') -- confirm that 7 classes is intended
num_classes = 7

# Imports

In [None]:
!pip install pytorch-nlp --quiet
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --quiet
!pip install scikit-learn --quiet
!pip install pyspellchecker --quiet
!pip install contractions --quiet
!pip install beautifulsoup4 --quiet
!pip install emoji --quiet
!pip install matplotlib --quiet

In [None]:
from typing import List, Tuple, Callable, Dict
import argparse
import random
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import torch.autograd
import torch.optim as optim
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, classification_report, confusion_matrix)
from torch.utils.data import DataLoader
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pack_padded_sequence, pad_packed_sequence
from torchnlp.word_to_vector import GloVe
import contractions
import unicodedata
from bs4 import BeautifulSoup
import emoji
import re
from spellchecker import SpellChecker
import pickle
import os
import matplotlib.pyplot as plt
import threading

In [None]:
# Download the NLTK resources used by the preprocessing helpers in this file
# NOTE(review): recent NLTK releases appear to look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead -- confirm against the installed version
nltk.download('punkt')  # tokenizer models (nltk.word_tokenize)
nltk.download('stopwords')  # stop-word lists
nltk.download('wordnet')  # data for the WordNet lemmatizer
nltk.download('averaged_perceptron_tagger')  # POS tagger model (pos_tag)
# Define the device: use CUDA when it is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load GloVe, using TORCHNLP_CACHEDIR as its cache directory
pretrained_wv = GloVe(cache=TORCHNLP_CACHEDIR)
# English stop words, kept as a set for the membership test in remove_stop_words
stop_words = set(stopwords.words('english'))
# Initialize the WordNet lemmatizer (shared instance used by lemm_sentence)
lemmatizer = WordNetLemmatizer()

In [None]:
# Show which device was selected above
print(DEVICE)

# Helper functions

In [None]:
def dump_tuple(filename: str, data: tuple) -> None:
    '''
    Serialize a tuple with pickle and write it to disk.
    :param filename: Path of the file to write; an existing file is overwritten.
    :type filename: str
    :param data: The tuple to serialize.
    :type data: tuple
    '''
    # Binary mode is required because pickle produces bytes
    with open(filename, mode='wb') as handle:
        pickler = pickle.Pickler(handle)
        pickler.dump(data)


def load_tuple(filename: str) -> tuple:
    '''
    Read a pickled tuple back from disk.
    :param filename: Path of the pickle file to read.
    :type filename: str
    :return: The object stored in the file.
    :rtype: tuple
    '''
    # NOTE: unpickling can execute arbitrary code, so only load files this project wrote itself
    with open(filename, mode='rb') as handle:
        unpickler = pickle.Unpickler(handle)
        return unpickler.load()

# Preparing the Data set

In [None]:
def lower_sentence(sentence: str) -> str:
    '''
    Convert every character of the sentence to lower case.
    :param sentence: The sentence to lowercase.
    :type sentence: str
    :return: The lowercased sentence.
    :rtype: str
    '''
    lowered = sentence.lower()
    return lowered

In [None]:
def remove_emails(sentence: str) -> str:
    '''
    Delete e-mail-like tokens from the sentence.
    :param sentence: The sentence to process.
    :type sentence: str
    :return: The sentence with every '@'-containing token removed.
    :rtype: str
    '''
    # A run of non-space characters around an '@', plus at most one
    # whitespace character that follows it
    email_pattern = r"\S*@\S*\s?"
    return re.sub(email_pattern, "", sentence)

In [None]:
def remove_nonascii_diacritic(sentence: str) -> str:
    '''
    Strip diacritics and drop every remaining non-ASCII character.
    :param sentence: The sentence to process.
    :type sentence: str
    :return: The ASCII-only version of the sentence.
    :rtype: str
    '''
    # NFKD splits accented letters into a base letter plus combining marks
    decomposed = unicodedata.normalize("NFKD", sentence)
    # Encoding with 'ignore' discards the marks and anything else outside ASCII
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8", "ignore")

In [None]:
def clean_html(sentence: str) -> str:
    '''
    Drop HTML markup and keep only the text content.
    :param sentence: The sentence that may contain HTML tags.
    :type sentence: str
    :return: The text of the sentence with the tags removed.
    :rtype: str
    '''
    # Parse with the built-in 'html.parser' backend and extract the text
    soup = BeautifulSoup(sentence, "html.parser")
    return soup.get_text()

In [None]:
def replace_repeated_chars(sentence: str) -> str:
    '''
    Collapse runs of the same punctuation mark into a single one.
    Only ',', '!', '?' and '.' are affected; letters and digits are left alone.
    :param sentence: The sentence to process.
    :type sentence: str
    :return: The sentence with repeated punctuation collapsed.
    :rtype: str
    '''
    # Group 1 captures the mark, '\1+' matches its immediate repetitions
    repeated_punctuation = r'([,!?.])\1+'
    return re.sub(repeated_punctuation, r'\1', sentence)

In [None]:
def translate_emojis_to_text(sentence: str) -> str:
    '''
    Translate emojis in the sentence to text.
    Every emoji is replaced by its textual name without the surrounding ':'
    delimiters (for example the thumbs-up emoji becomes ``thumbs_up``).
    Colons that were already part of the sentence are kept, so URLs
    (``http://...``) and times (``5:30``) are not altered.
    :param sentence: The sentence to translate emojis to text.
    :type sentence: str
    :return: The sentence with translated emojis to text.
    :rtype: str
    '''
    # Ask demojize for empty delimiters instead of deleting every ':' afterwards:
    # a blanket removal also hit colons of the original text and turned
    # 'http://' into 'http//', which a later URL-removal step can no longer match.
    return emoji.demojize(sentence, delimiters=('', ''))

In [None]:
def expand_sentence(sentence: str) -> str:
    '''
    Replace contractions in the sentence by their expanded form.
    :param sentence: The sentence whose contractions are expanded.
    :type sentence: str
    :return: The sentence as returned by ``contractions.fix``.
    :rtype: str
    '''
    expanded = contractions.fix(sentence)
    return expanded

In [None]:
# URL pattern used by remove_url, compiled once.
# Written as raw strings: the previous plain string contained invalid escape
# sequences ('\:', '\.', '\?') that raise a SyntaxWarning on Python 3.12+.
_URL_PATTERN = re.compile(
    # scheme or 'www.' prefix (the dot is escaped so 'wwwx...' is not a URL)
    r"((http\://|https\://|ftp\://)|(www\.))+"
    # host name ending in a 2-4 letter TLD ...
    r"(([a-zA-Z0-9\.-]+\.[a-zA-Z]{2,4})"
    # ... or a dotted IPv4 address
    r"|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))"
    # optional path: '/-_' is a character RANGE ('/' .. '_'), kept so everything
    # matched before still matches; it does not contain '-', hence the literal
    # '-' appended at the end of the class
    r"(/[a-zA-Z0-9%:/-_\?\.'~-]*)?"
)


def remove_url(sentence: str) -> str:
    '''
    Remove URLs from the sentence.
    A URL must start with ``http://``, ``https://``, ``ftp://`` or ``www.``
    (lower case), followed by a host name or an IPv4 address and an optional
    path. Surrounding whitespace is left untouched.
    :param sentence: The sentence to remove URLs from.
    :type sentence: str
    :return: The sentence without URLs.
    :rtype: str
    '''
    return _URL_PATTERN.sub('', sentence)

In [None]:
def remove_possessives(sentence: str) -> str:
    '''
    Remove possessive endings, for both straight and curly apostrophes.
    :param sentence: The sentence to process.
    :type sentence: str
    :return: The sentence with possessive endings stripped.
    :rtype: str
    '''
    # (text to find, replacement) pairs, applied one after the other in this order
    replacements = (
        ("'s", ''),
        ('’s', ''),
        ('s’', 's'),
        ("s'", 's'),
    )
    for ending, substitute in replacements:
        sentence = sentence.replace(ending, substitute)
    return sentence

In [None]:
def remove_extra_space(sentence: str) -> str:
    '''
    Normalize whitespace in the sentence.
    Every run of whitespace becomes a single space, and leading and trailing
    whitespace is removed.
    :param sentence: The sentence to process.
    :type sentence: str
    :return: The sentence with normalized whitespace.
    :rtype: str
    '''
    collapsed = re.sub(r'\s+', ' ', sentence)
    return collapsed.strip()

In [None]:
# Shared SpellChecker instance, created on first use by _get_spell_checker()
_SPELL_CHECKER = None


def _get_spell_checker():
    '''Return the shared SpellChecker, building it on the first call only.'''
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = SpellChecker()
    return _SPELL_CHECKER


def check_sentence_spelling(sentence: list[str]) -> list[str]:
    '''
    Check the spelling of the words in the sentence.
    Each non-empty word is replaced by the spell checker's correction; when
    the checker has no suggestion the word is kept as it is. Empty strings are
    passed through, so the result always has the same length as the input.
    :param sentence: The tokenized sentence to check the spelling of.
    :type sentence: list[str]
    :return: The sentence with corrected spelling.
    :rtype: list[str]
    '''
    # Reuse one checker for all calls: constructing a SpellChecker for every
    # sentence (it loads its dictionary each time) made this step needlessly slow
    spell = _get_spell_checker()
    corrected_sentence = []
    for word in sentence:
        if word == '':
            corrected_sentence.append('')
            continue
        correction = spell.correction(word)
        # correction() returns None when it has no candidate for the word
        corrected_sentence.append(word if correction is None else correction)
    return corrected_sentence

In [None]:
def tokenize_sentence(sentence: str) -> list[str]:
    '''
    Split the sentence into tokens with NLTK's word tokenizer.
    :param sentence: The sentence to tokenize.
    :type sentence: str
    :return: The tokens of the sentence.
    :rtype: list[str]
    '''
    tokens = nltk.word_tokenize(sentence)
    return tokens

In [None]:
def remove_stop_words(sentence: list[str]) -> list[str]:
    '''
    Filter out the tokens that are stop words.
    :param sentence: The tokenized sentence.
    :type sentence: list[str]
    :return: The tokens that are not in the module-level ``stop_words`` set, in their original order.
    :rtype: list[str]
    '''
    kept_tokens = [token for token in sentence if token not in stop_words]
    return kept_tokens

In [None]:
def lemm_sentence(sentence: list[str]) -> list[str]:
    '''
    Lemmatize every token, using its part-of-speech tag to pick the lemma.
    :param sentence: The tokenized sentence.
    :type sentence: list[str]
    :return: The lemma of each token, in the same order.
    :rtype: list[str]
    '''
    # First letter of the Penn Treebank tag -> WordNet POS code
    # (nouns, verbs, adjectives, adverbs); any other tag is treated as a noun
    wordnet_pos_by_initial = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}
    return [
        lemmatizer.lemmatize(token, pos=wordnet_pos_by_initial.get(tag[:1], 'n'))
        for token, tag in pos_tag(sentence)
    ]

In [None]:
def clean_train(line: str) -> list[str]:
    '''
    Run the preprocessing pipeline (without spell checking) on one line.
    :param line: the raw line to clean
    :type line: str
    :return: the cleaned line as a list of tokens; ['Normal'] when nothing is left
    :rtype: list
    '''
    # String-level clean-up, applied in exactly this order
    text_steps = (
        translate_emojis_to_text,    # emojis -> text
        lower_sentence,              # lower case
        remove_nonascii_diacritic,   # drop non-ASCII characters
        remove_emails,               # drop e-mail addresses
        clean_html,                  # drop HTML tags
        remove_url,                  # drop URLs
        replace_repeated_chars,      # collapse repeated punctuation
        expand_sentence,             # expand contractions
        remove_possessives,          # strip possessives
        remove_extra_space,          # normalize whitespace
    )
    for step in text_steps:
        line = step(line)
    # Token-level processing: tokenize, remove stop words, lemmatize
    tokens = tokenize_sentence(line)
    tokens = remove_stop_words(tokens)
    tokens = lemm_sentence(tokens)
    # Never hand back an empty list; use a single placeholder token instead
    return tokens if tokens else ['Normal']

In [None]:
def clean(line: str) -> list[str]:
    '''
    Run the full preprocessing pipeline, including spell checking, on one line.
    :param line: the raw line to clean
    :type line: str
    :return: the cleaned line as a list of tokens; ['Normal'] when nothing is left
    :rtype: list
    '''
    # String-level clean-up, applied in exactly this order
    text_steps = (
        translate_emojis_to_text,    # emojis -> text
        lower_sentence,              # lower case
        remove_nonascii_diacritic,   # drop non-ASCII characters
        remove_emails,               # drop e-mail addresses
        clean_html,                  # drop HTML tags
        remove_url,                  # drop URLs
        replace_repeated_chars,      # collapse repeated punctuation
        expand_sentence,             # expand contractions
        remove_possessives,          # strip possessives
        remove_extra_space,          # normalize whitespace
    )
    for step in text_steps:
        line = step(line)
    # Token-level processing: tokenize, spell-check, remove stop words, lemmatize
    tokens = tokenize_sentence(line)
    tokens = check_sentence_spelling(tokens)
    tokens = remove_stop_words(tokens)
    tokens = lemm_sentence(tokens)
    # Never hand back an empty list; use a single placeholder token instead
    return tokens if tokens else ['Normal']

In [None]:
# Label letter -> class index
# (presumably 's' = suicidal and 'u' = not suicidal -- confirm against the data set)
forward_label_mapping = {
    's': 1,
    'u': 0,
}

# Class index -> label letter (the inverse of forward_label_mapping)
reverse_label_mapping = {
    0: 'u',
    1: 's',
}

In [None]:
def read_dataset_suicidal_detection(data_path: str) -> Tuple[List[List[List[str]]], List[List[int]]]:
    
    