In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# force_remount=True is passed so the mount is redone even if one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from time import time
from collections import Counter

In [None]:
# Switch the working directory to the notebooks folder on the mounted Drive so the
# relative 'Data/...' paths used further down resolve against it.
try:
    os.chdir("/content/drive/MyDrive/Colab Notebooks")
except OSError:
    # Best effort: report the failure and let the notebook continue.
    print("Error: Can't change the Current Working Directory")
else:
    print("Directory changed")

Directory changed


In [None]:
# Number of training epochs; passed to both the Word2Vec constructor and model.train().
EPOCHS = 10
# Seed constant is commented out and is not referenced by the code in this notebook section.
#SEED = 4222

In [None]:
# Load the cleaned dataset and normalise it in a single chain:
#   - encode the string labels in "class" as integers (suicide -> 1, non-suicide -> 0)
#   - drop the original 'text' column and expose "cleaned_text" under the name "text"
suicide_detection_df = (
    pd.read_csv('Data/suicide_detection_final_cleaned.csv', header=0)
    .reset_index(drop=True)
    .replace({"class": {"suicide": 1, "non-suicide": 0}})
    .drop(columns=['text'])
    .rename(columns={"cleaned_text": "text"})
)

In [None]:
# Stratified 80/20 split: stratifying on 'class' keeps the label balance the same in
# both partitions.
# random_state is pinned (to the value of the commented-out SEED constant) so the split,
# and therefore the vocabulary file and embeddings derived from the training part,
# is reproducible across kernel restarts.
train_text, test_text, train_labels, test_labels = train_test_split(
    suicide_detection_df['text'],
    suicide_detection_df['class'],
    test_size=0.2,
    stratify=suicide_detection_df['class'],
    random_state=4222,
)


In [None]:
# Tokenise every training sentence on whitespace
tokens_list = [sentence.split() for sentence in train_text]
# Count how often each token occurs across the whole training split
vocab = Counter(word for sentence_tokens in tokens_list for word in sentence_tokens)
# Keep only the words that reach the minimum occurrence threshold
min_occurance = 2
tokens = [word for word, count in vocab.items() if count >= min_occurance]

In [None]:
# save list to file
def save_list(lines, filename):
    """Write an iterable of strings to ``filename``, one item per line.

    Args:
        lines: Iterable of strings. Items are joined with newline characters;
            no trailing newline is written.
        filename: Path of the file to create (an existing file is overwritten).

    Returns:
        None.
    """
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # the context manager closes the handle even if the write raises
    with open(filename, 'w') as file:
        file.write(data)

# save tokens to a vocabulary file
# Write the frequency-filtered `tokens` list, not the raw `vocab` Counter: passing
# `vocab` would store every word and the min_occurance cut-off would never take effect.
save_list(tokens, 'Data/vocab.txt')

In [None]:
def load_doc(filename):
    """Return the entire contents of the text file at ``filename`` as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's text.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    # the context manager closes the handle even if the read raises
    with open(filename, 'r') as file:
        return file.read()

# Read the vocabulary file back, split it on whitespace and keep the unique words
# as a set (used below for membership checks when cleaning lines).
vocab_filename = 'Data/vocab.txt'
vocab = set(load_doc(vocab_filename).split())

In [None]:
# clean each line
def clean_line(line, vocab):
  """Keep only the whitespace-separated words of ``line`` that appear in ``vocab``.

  Returns a one-element list wrapping the filtered token list, so the result can be
  concatenated directly onto a list of sentences.
  """
  kept = []
  for word in line.split():
    if word in vocab:
      kept.append(word)
  return [kept]

# clean entire dataset
def process_lines(data, vocab):
  """Run ``clean_line`` over every entry of ``data`` and collect the results.

  Returns a list with one filtered token list per entry, in the original order.
  """
  # clean_line returns a one-element list, so flattening one level yields the
  # per-entry token lists.
  return [cleaned for entry in data for cleaned in clean_line(entry, vocab)]

In [None]:
# Filter both splits against the same vocabulary (training split first, then test).
train_clean, test_clean = (
    process_lines(split, vocab) for split in (train_text, test_text)
)

In [None]:
# set up the parameters of the model
model = Word2Vec(vector_size=300, window=10, min_count=1, epochs=EPOCHS)

# build the vocabulary from the cleaned training sentences, which initialises the model
t = time()
model.build_vocab(train_clean, progress_per=1000)
# the timer was previously started but never read; report the elapsed time
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

# training the model
t = time()
train_result = model.train(train_clean, total_examples=model.corpus_count, epochs=EPOCHS, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

# keep the value returned by train() as the cell's displayed output
train_result


(1022116, 1253510)

In [None]:
# Export the trained word vectors in the plain-text (non-binary) word2vec format.
filename = 'Data/embedding_word2vec.txt'
model.wv.save_word2vec_format(fname=filename, binary=False)

In [None]:
# Sanity check: list the ten nearest neighbours of 'suicide' in the embedding space.
model.wv.most_similar('suicide', topn=10)

[('commit', 0.9702114462852478),
 ('attempt', 0.9699046611785889),
 ('thought', 0.9639476537704468),
 ('plan', 0.9514110684394836),
 ('tendency', 0.9481505751609802),
 ('suicidal', 0.9464195966720581),
 ('depression', 0.9458267092704773),
 ('relief', 0.945617139339447),
 ('think', 0.9445353150367737),
 ('hospital', 0.9420954585075378)]