# Suicidal Prediction Dataset Pre Process



In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import  AdamW
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import pandas as pd
import numpy as np
import spacy
import re
import math
import gc
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from transformers import BertTokenizer
import ast
from torch.nn.utils.rnn import pad_sequence
import os
import matplotlib.pyplot as plt

## CLasses 
####  
---


In [9]:
nlp = spacy.load("en_core_web_sm")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
def preprocess_text(txt:str):
    """
    Preprocesses the input text by removing non-alphabetic characters, converting to lowercase,
    removing stopwords and punctuation, and lemmatizing the remaining tokens.

    Args:
        txt (str): The input text to be preprocessed.

    Returns:
        str: The preprocessed text.
    """

    txt = re.sub('[^a-zA-Z]', ' ', txt)
    txt = txt.lower()
    txt = " ".join(txt.split())

    doc = nlp(txt)

    tokens_filtered = []

    for token in doc:
        if token.is_stop or token.is_punct:
            continue

        tokens_filtered.append(token.lemma_)

    return " ".join(tokens_filtered)


def preprocess_labels(data):
    """
    Preprocesses the labels in the given data.

    Args:
        data (pandas.DataFrame): The input data containing the 'label' column.

    Returns:
        pandas.DataFrame: The input data with an additional 'label_prep' column, where the labels are mapped to unique IDs.
    """
    LABELS = data['label'].unique()
    label2id = dict(zip(LABELS, np.arange(len(LABELS), dtype=np.float32)))
    data['label_prep'] = data['label'].map(label2id)
    return data


def tokenize_and_convert(text):
    # Tokenize the text using spaCy
    spaCy_tokens = [token.text for token in nlp(text)]
    
    # Convert spaCy tokens to strings
    token_strings = [str(token) for token in spaCy_tokens]
    
    # Map token strings to numerical IDs using the pre-trained tokenizer
    token_ids = tokenizer.convert_tokens_to_ids(token_strings)
    
    return token_ids


### Raw Data
####  
---


In [7]:
# Load the CSV file
data = pd.read_csv('../data/mental-health.csv')
# Display the first few rows of the DataFrame
print(data.head())
data = data.drop_duplicates(ignore_index = True)
df_null_values = data.isnull().sum().to_frame().rename(columns = {0:'count'})
df_null_values['%'] = (df_null_values['count'] / len(data)) * 100
print("")
print("df_null_values")
df_null_values



                                                text         label
0  I recently went through a breakup and she said...    depression
1  I do not know how to navigate these feelings, ...    depression
2  So I have been with my bf for 5 months , and h...    depression
3  I am so exhausted of this. Just when I think I...  SuicideWatch
4  I have been severly bullied since i was 5 till...    depression

df_null_values


Unnamed: 0,count,%
text,0,0.0
label,0,0.0


### Process Data
####  
---


In [8]:
# apply the preprocess_text function to the 'text' column
data['text_prep'] = data['text'].apply(preprocess_text)


In [6]:
#apply the tokenize_and_convert function to the 'text_prep' column
data['token_id'] =  data['text_prep'].apply(tokenize_and_convert)

In [11]:
preprocessed_data = preprocess_labels(data)
print(preprocessed_data.head())


                                                text         label  \
0  I recently went through a breakup and she said...    depression   
1  I do not know how to navigate these feelings, ...    depression   
2  So I have been with my bf for 5 months , and h...    depression   
3  I am so exhausted of this. Just when I think I...  SuicideWatch   
4  I have been severly bullied since i was 5 till...    depression   

                                           text_prep  label_prep  
0  recently go breakup say want friend say try ta...         0.0  
1  know navigate feeling new feeling stretch unde...         0.0  
2  bf month tell depressed week particular happen...         0.0  
3  exhausted think finally rest think maybe thing...         1.0  
4  severly bully till result depressed misanthrop...         0.0  


In [12]:
# Drop rows with null values in the 'text_prep' column.
data.dropna(subset=['text_prep'], inplace=True)

In [9]:
data.to_csv('../data/preprocessed_data.csv', index=False)

### Final Results
####  
---


In [16]:
data.head()

Unnamed: 0,text,label,text_prep,token_id,label_prep
0,I recently went through a breakup and she said...,depression,recently go breakup say want friend say try ta...,"[3728, 2175, 19010, 2360, 2215, 2767, 2360, 30...",0.0
1,"I do not know how to navigate these feelings, ...",depression,know navigate feeling new feeling stretch unde...,"[2113, 22149, 3110, 2047, 3110, 7683, 3305, 27...",0.0
2,"So I have been with my bf for 5 months , and h...",depression,bf month tell depressed week particular happen...,"[28939, 3204, 2425, 14777, 2733, 3327, 4148, 2...",0.0
3,I am so exhausted of this. Just when I think I...,SuicideWatch,exhausted think finally rest think maybe thing...,"[9069, 2228, 2633, 2717, 2228, 2672, 2518, 270...",1.0
4,I have been severly bullied since i was 5 till...,depression,severly bully till result depressed misanthrop...,"[100, 20716, 6229, 2765, 14777, 100, 100, 3674...",0.0
