## Importing Relevent Packages

In [1]:
! pip install gensim

Defaulting to user installation because normal site-packages is not writeable
[0m

In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score
from nltk.stem.snowball import EnglishStemmer
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
import nltk  # Import NLTK (Natural Language Toolkit) for natural language processing tasks

  "class": algorithms.Blowfish,


In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/unicconaiadmin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Loading the training and testing dataset

In [4]:
data_a = pd.read_csv("Suicide_Ideation_Dataset(Twitter-based).csv")

In [5]:
data_b = data_a.copy()

In [6]:
data_a.drop_duplicates(inplace=True)
data_b.drop_duplicates(inplace=True)

In [7]:
data_a.isna().sum()
data_b.isna().sum()

Tweet      2
Suicide    0
dtype: int64

In [8]:
data_a.dropna(inplace=True)
data_b.dropna(inplace=True)

In [9]:
data_a.shape

(1777, 2)

In [10]:
data_b.shape

(1777, 2)

## Preprocessing Data A

In [11]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model onto the GPU
tokenizer = AutoTokenizer.from_pretrained("grammarly/coedit-large")
model = T5ForConditionalGeneration.from_pretrained("grammarly/coedit-large").to(device)

In [12]:
def preprocess_text(input_text):
    # Tokenize input text and move tensors to the GPU
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

    # Generate outputs on the GPU
    outputs = model.generate(input_ids, max_length=256)

    # Decode the output and move it back to CPU for further processing if needed
    edited_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return edited_text

In [13]:
data_a['grammatically_corrected_text'] = data_a['Tweet'].apply(preprocess_text)

In [14]:
def preprocess_text_2(text: str):
    
    # Instantiating our lemmatizer
    lemma = EnglishStemmer()
    # Remove URLs
    text = ' '.join(word for word in text.split() if not word.startswith('http'))
    text = ' '.join(word for word in text.split() if not word.startswith('www'))
    
    # Remove special characters and punctuation
    text = ''.join(char for char in text if char.isalnum() or char.isspace())

    # Remove mentions (@username)
    text = ' '.join(word for word in text.split() if not word.startswith('@'))

    # Remove hashtags (#technology)
    text = ' '.join(word[1:] if word.startswith('#') else word for word in text.split())

    # Removing stopwords
    ## NB: Remember to convert the text into thier lowercase form so that for example "I" will be exactly the same as "i"
    text = remove_stopwords(text.lower())

    # Tokenization
    text = word_tokenize(text)

    #lemmatization
    text = ' '.join([lemma.stem(word) for word in text])
    
    return text

In [15]:
data_a['final_text_format'] = data_a['grammatically_corrected_text'].apply(preprocess_text_2)

In [16]:
data_a.head()

Unnamed: 0,Tweet,Suicide,grammatically_corrected_text,final_text_format
0,making some lunch,Not Suicide post,I am making lunch.,make lunch
1,@Alexia You want his money.,Not Suicide post,You want his money?,want money
2,@dizzyhrvy that crap took me forever to put to...,Potential Suicide post,That was a long time ago to put it all togethe...,long time ago sleep day
3,@jnaylor #kiwitweets Hey Jer! Since when did y...,Not Suicide post,"Hey Jer, when did you start tweeting?",hey jer start tweet
4,Trying out &quot;Delicious Library 2&quot; wit...,Not Suicide post,Trying out &quot;Delicious Library 2&quot; wit...,tri quotdelici librari 2quot mix result bar co...


In [17]:
data_a['Suicide'].value_counts()

Suicide
Not Suicide post           1124
Potential Suicide post      653
Name: count, dtype: int64

In [22]:
data_a['Suicide'] = data_a['Suicide'].map(
    {'Not Suicide post': 0,
    'Potential Suicide post ': 1 }
)

## Preparing second dataset

In [24]:
data_b['final_text_format'] = data_b['Tweet'].apply(preprocess_text_2)

In [25]:
data_b['Suicide'] = data_b['Suicide'].map(
    {'Not Suicide post': 0,
    'Potential Suicide post ': 1 }
)

## Creating train and text data and X and Y.

In [26]:
X_a = data_a["final_text_format"].values
y_a = data_a["Suicide"].values

### Creating train and test data

In [30]:
X_a_train, X_a_test, y_a_train, y_a_test = train_test_split(X_a, y_a, train_size=0.7, stratify=y_a)

In [31]:
vectorizer = TfidfVectorizer(max_features=5000)

In [32]:
X_a_train_vec = vectorizer.fit_transform(X_a_train)
X_a_test_vec = vectorizer.transform(X_a_test)

### Model Training

In [33]:
model_a = GaussianNB()

In [35]:
model_a.fit(X_a_train_vec.toarray(), y_a_train)

### Model Evaluation

In [37]:
y_a_pred = model_a.predict(X_a_test_vec.toarray())

In [38]:
acc = accuracy_score(y_pred=y_a_pred, y_true=y_a_test)
print(acc)

0.5917602996254682


In [39]:
pre_a = precision_score(y_pred=y_a_pred, y_true=y_a_test)
print(pre_a)

0.4691011235955056


### Model training B

## Creating train and text data and X and Y.

In [40]:
X_b = data_b["final_text_format"].values
y_b = data_b["Suicide"].values

### Creating train and test data

In [41]:
X_b_train, X_b_test, y_b_train, y_b_test = train_test_split(X_b, y_b, train_size=0.7, stratify=y_b)

In [42]:
vectorizer_b = TfidfVectorizer(max_features=5000)

In [43]:
X_b_train_vec = vectorizer_b.fit_transform(X_b_train)
X_b_test_vec = vectorizer_b.transform(X_b_test)

### Model Training

In [44]:
model_b = GaussianNB()

In [45]:
model_b.fit(X_b_train_vec.toarray(), y_b_train)

### Model Evaluation

In [46]:
y_b_pred = model_b.predict(X_b_test_vec.toarray())

In [47]:
acc_b = accuracy_score(y_pred=y_b_pred, y_true=y_b_test)
print(acc_b)

0.6142322097378277


In [48]:
pre_b = precision_score(y_pred=y_b_pred, y_true=y_b_test)
print(pre_b)

0.4857142857142857


In [49]:
result = pd.DataFrame(
    {
        "Accuracy" : [acc, acc_b],
        "Precision": [pre_a, pre_b] 
    }
)

In [50]:
result.index = ["Method A", "Method B"]

In [51]:
result

Unnamed: 0,Accuracy,Precision
Method A,0.59176,0.469101
Method B,0.614232,0.485714


In [53]:
print(result)

          Accuracy  Precision
Method A  0.591760   0.469101
Method B  0.614232   0.485714
