In [1]:
# Mount Google Drive under /content/drive so the notebook can read files stored there.
# force_remount=True is passed so the mount is redone even if Drive is already attached.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [18]:
import os
import random
from collections import Counter

import numpy as np
import pandas as pd
from prettytable import PrettyTable
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, recall_score, accuracy_score, f1_score, precision_score
from sklearn.model_selection import train_test_split

In [3]:
# Seed value used when seeding the random number generator ahead of the train/test split.
SEED = 2
# NOTE(review): EPOCHS is not referenced anywhere in the visible notebook — confirm it is
# still needed (it may be left over from a neural-network variant of this experiment).
EPOCHS = 5

In [4]:
# Switch the working directory to the Drive notebook folder; the data files below are
# opened with relative 'Data/...' paths, which resolve against this directory.
try:
    os.chdir("/content/drive/MyDrive/Colab Notebooks")
    print("Directory changed")
except OSError:
    # NOTE(review): a failure is only reported on stdout and execution continues, so the
    # later relative-path reads would then fail with their own file-not-found errors.
    print("Error: Can't change the Current Working Directory")

Directory changed


In [6]:
# Mapping from the textual class label to its integer code.
classes = {"suicide": 1, "non-suicide": 0}

# Load the cleaned dataset, discard the original 'text' column, promote 'cleaned_text'
# to be the new 'text' column, and encode the 'class' column with the mapping above.
suicide_detection_df = (
    pd.read_csv('Data/suicide_detection_final_cleaned.csv', header=0)
    .drop(columns=['text'])
    .rename(columns={"cleaned_text": "text"})
    .replace({"class": classes})
)
suicide_detection_df.head()

Unnamed: 0,class,text
0,1,sex wife threaten suicide recently leave wife ...
1,0,weird not affect compliment come know girl fee...
2,0,finally hear bad year swear fucking god annoying
3,1,need help help cry hard
4,1,end tonight not anymore quit


In [7]:
# Seed Python's built-in RNG (kept from the original cell).
random.seed(SEED)

# Stratified 80/20 split of the texts and their labels.
# scikit-learn's splitter does not draw from Python's `random` module, so seeding it above
# does not make the shuffle repeatable; random_state pins the split so that a
# Restart & Run All reproduces the same train/test partition.
train_text, test_text, train_labels, test_labels = train_test_split(
    suicide_detection_df['text'],
    suicide_detection_df['class'],
    test_size=0.2,
    stratify=suicide_detection_df['class'],
    random_state=SEED,
)

In [8]:
def load_doc(filename, encoding='utf-8'):
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the locale of the machine running the notebook.

    Returns:
        The full file contents as one ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid text in ``encoding``.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

# Read the vocabulary file and keep its whitespace-separated tokens as a set,
# which gives constant-time membership checks when filtering documents.
vocab_filename = 'Data/vocab.txt'
vocab = set(load_doc(vocab_filename).split())

In [9]:
def load_embedding(filename, encoding='utf-8'):
    """Load a text-format embedding file into a ``{word: vector}`` dictionary.

    The first line of the file is skipped (presumably the word2vec
    ``<vocab size> <dimension>`` header — confirm against the file). Every other
    non-blank line is split on whitespace: the first field is the word and the
    remaining fields are parsed as the components of its vector.

    Args:
        filename: Path of the embedding text file.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the locale of the machine running the notebook.

    Returns:
        dict mapping each word (``str``) to a ``float32`` numpy array.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embedding = dict()
    # The context manager closes the handle even if parsing fails part-way through.
    with open(filename, 'r', encoding=encoding) as file:
        # Skip the first line; the default keeps an empty file from raising StopIteration.
        next(file, None)
        # Stream line by line instead of materialising the whole file with readlines().
        for line in file:
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline): there is no word to index.
                continue
            # key is string word, value is numpy array for vector
            embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embedding

In [10]:
def clean_line(line, vocab):
  """Tokenise one document and keep only the tokens found in ``vocab``.

  Args:
    line: Document text; it is split on whitespace.
    vocab: Container of allowed tokens (membership is tested with ``in``).

  Returns:
    A one-element list whose only item is the list of kept tokens, in their
    original order. The extra nesting lets callers concatenate results so that
    each document contributes exactly one entry.
  """
  kept = []
  for token in line.split():
    if token in vocab:
      kept.append(token)
  return [kept]

In [11]:
def process_lines(data, vocab):
  """Apply ``clean_line`` to every document in ``data``.

  Args:
    data: Iterable of document strings.
    vocab: Container of allowed tokens, forwarded to ``clean_line``.

  Returns:
    A list with one entry per document, each entry being that document's list
    of in-vocabulary tokens (possibly empty).
  """
  # clean_line returns a one-element list per document; flatten those into one list.
  return [tokens for text in data for tokens in clean_line(text, vocab)]

In [12]:
def document_vector(doc, embeddings):
    """Represent a tokenised document as the mean of its word vectors.

    Args:
        doc: Iterable of tokens.
        embeddings: Mapping from token to its embedding vector.

    Returns:
        The element-wise mean (``np.mean(..., axis=0)``) of the vectors of the
        tokens that are present in ``embeddings``; tokens without an embedding
        are ignored. When no token has an embedding the mean is taken over an
        empty list, which numpy evaluates to NaN.
    """
    vectors = [embeddings[token] for token in doc if token in embeddings]
    return np.mean(vectors, axis=0)

In [13]:
def all_documents(df, labels_ori, embeddings):
  """Build the feature vectors and matching labels for a set of tokenised documents.

  Args:
    df: Sequence of token lists, one per document (despite the name, the visible
      callers pass a plain list, not a DataFrame).
    labels_ori: Labels in the same positional order as ``df``. A pandas Series,
      numpy array or list is accepted; only positions are used, not the index.
    embeddings: Mapping from token to its embedding vector.

  Returns:
    ``(vec, labels)``: ``vec`` is a list with the mean embedding of every kept
    document and ``labels`` is the list of their labels, in the same order.
    Documents with no token present in ``embeddings`` (including empty ones) are
    dropped from both lists.
  """
  # Positional access for any array-like; for a Series this equals labels_ori.values.
  label_values = np.asarray(labels_ori)
  vec = list()
  labels = list()
  for i, tokens in enumerate(df):
    # Skip documents without a single embedded token, not only empty ones:
    # averaging zero vectors yields NaN, which is unusable as a feature row.
    # any() over an empty document is False, so empty documents are skipped too.
    if not any(token in embeddings for token in tokens):
      continue
    vec.append(document_vector(tokens, embeddings))
    labels.append(label_values[i])
  return vec, labels

In [14]:
# Load the word2vec embeddings from their text file into a {word: vector} dict.
word2vec = load_embedding('Data/embedding_word2vec.txt')

In [15]:
# Tokenise both splits, keeping only in-vocabulary tokens.
train_clean = process_lines(train_text, vocab)
test_clean = process_lines(test_text, vocab)
# Turn each document into its mean word vector. all_documents drops documents that are
# left with no tokens, so the *_labels_new lists hold the labels of the kept documents only.
train_vec, train_labels_new = all_documents(train_clean, train_labels,word2vec)
test_vec, test_labels_new = all_documents(test_clean, test_labels, word2vec)

In [16]:
# Fit a logistic-regression classifier, with scikit-learn's default settings, on the
# averaged word vectors of the training documents.
lr = LogisticRegression()
lr.fit(train_vec, train_labels_new)

In [19]:
# Score the model on the data it was fitted on (in-sample performance, shown for
# comparison with the held-out results in the next cell).
y_train_pred = lr.predict(train_vec)
print('Training set accuracy %s' % accuracy_score(train_labels_new, y_train_pred))
print(classification_report(train_labels_new, y_train_pred))


Training set accuracy 0.864891846921797
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      3728
           1       0.83      0.81      0.82      2282

    accuracy                           0.86      6010
   macro avg       0.86      0.85      0.86      6010
weighted avg       0.86      0.86      0.86      6010



In [21]:
# Score the model on the held-out test split.
y_test_pred = lr.predict(test_vec)
print('Test set accuracy %s' % accuracy_score(test_labels_new, y_test_pred))
print(classification_report(test_labels_new, y_test_pred))

Test set accuracy 0.8516300731869594
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       932
           1       0.83      0.77      0.80       571

    accuracy                           0.85      1503
   macro avg       0.85      0.83      0.84      1503
weighted avg       0.85      0.85      0.85      1503



In [22]:
# Keep the headline test-set metrics as scalars for the summary table.
# precision/recall/F1 are called with scikit-learn's defaults, i.e. scored for the
# positive class (label 1, "suicide" in the encoding used when loading the data).
word2vec_test_accuracy_score = accuracy_score(test_labels_new, y_test_pred)
word2vec_test_precision_score = precision_score(test_labels_new, y_test_pred)
word2vec_test_recall_score = recall_score(test_labels_new, y_test_pred)
word2vec_test_f1_score = f1_score(test_labels_new, y_test_pred)

In [23]:
# Summarise the test-set metrics in a one-row text table.
table = PrettyTable()
table.field_names = ['Model - Logistic Regression', 'Accuracy', 'Precision', 'Recall', 'F1 Score']

# Each metric is rendered with four decimal places.
table.add_row(['Word2Vec',
               format(word2vec_test_accuracy_score, '.4f'),
               format(word2vec_test_precision_score, '.4f'),
               format(word2vec_test_recall_score, '.4f'),
               format(word2vec_test_f1_score, '.4f')])

print(table)

+-----------------------------+----------+-----------+--------+----------+
| Model - Logistic Regression | Accuracy | Precision | Recall | F1 Score |
+-----------------------------+----------+-----------+--------+----------+
|           Word2Vec          |  0.8516  |   0.8308  | 0.7653 |  0.7967  |
+-----------------------------+----------+-----------+--------+----------+
