In [None]:
#import required libraries 
import gensim

from gensim.models import Word2Vec
import gensim.downloader as api
import pandas as pd 
import re
import numpy as np 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#I report here again the tokanization function because we will need them in the cells below
def sent_tokenizer(text):
    return re.findall(r".*?[.!\?]",text)

def word_tokenizer(sentence):
    punct = r"""([A-z])([,;:\?!\."'])"""
    temp_sentence =  re.sub(punct, r"\1 \2", sentence)
    toks = temp_sentence.split()
    temp_out =[]
    # splitting english possessive
    for tok in toks:
        if re.search(r"([A-z]+)’s?$", tok):
            temp_out.extend(re.sub(r"([A-z]+)(’s?)$", r"\1 \2", tok).split())
        else:
            temp_out.append(tok)
    return temp_out 

def my_tokenizer(text):
    import string 
    punct = string.punctuation 
    sentences = sent_tokenizer(text)
    tokenized_text = []
    for sent in sentences:
        if len(sent) > 1 and sent != []:
            tokens = word_tokenizer(sent)
            tokens_1 = [tok for tok in tokens if not tok in punct]
            tokenized_text.append(tokens_1)
    return tokenized_text

In [None]:
#open the two final experimental corpora
depression = pd.read_csv('/content/drive/MyDrive/Computational_Linguistics_Project /datasets/depression_finale.csv', index_col= False)
suicide = pd.read_csv('/content/drive/MyDrive/Computational_Linguistics_Project /datasets/suicidio_finale.csv', index_col= False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
#open the control corpus
corpus_cont = pd.read_csv('/content/drive/MyDrive/Computational_Linguistics_Project /datasets/FINALE_controllo.csv', index_col= False)
print(corpus_cont.shape)

(12059, 4)


In [None]:
# gold standard model to compare with our word embeddings
MEN = [line.split(" ") for line in open("/content/drive/MyDrive/Computational_Linguistics_Project /word_embedding_models/MEN/MEN_dataset_natural_form_full").read().split("\n")[:-1]]
MEN = [(t1,t2,float(sim)) for (t1,t2,sim) in MEN]

In [None]:
#one more passage to ensure that we have strings to process
deprex = [str(ele) for ele in depression['cleaned_text'][:]]
suix = [str(ele) for ele in suicide['cleaned_text'][:]]
not_illness = [str(ele) for ele in corpus_cont['cleaned_text'][:]]

In [None]:
#preparation of data in the form suitable to feed our wordembedding algorithm #DEPRESSION
out_depre = []
for i,article in enumerate(deprex):
    tok_sent = my_tokenizer(article)
    for ele in tok_sent:
        
        out_depre.append(ele)

In [None]:
#preparation of data in the form suitable to feed our wordembedding algorithm #SUICIDE
out_sui = []
for i,article in enumerate(suix):
    tok_sent = my_tokenizer(article)
    for ele in tok_sent:
        
        out_sui.append(ele)
        

In [None]:
#preparation of data in the form suitable to feed our wordembedding algorithm #CONTROLLO
out_controllo = []
for i,article in enumerate(not_illness):
    tok_sent = my_tokenizer(article)
    for ele in tok_sent:
        
        out_controllo.append(ele)

### With the following command I ask to build a word embedding with the Word2Vec algorithm
According to a detailed comparison of Word2Vec and fastText in this notebook, fastText does significantly better on syntactic tasks as compared to the original Word2Vec, especially when the size of the training corpus is small. Word2Vec slightly outperforms fastText on semantic tasks though. The differences grow smaller as the size of the training corpus increases.

In [None]:
#word_embedding D
depressed_model = Word2Vec(sentences= out_depre, sg=1)

In [None]:
#word_embedding S
suicide_model = Word2Vec(sentences = out_sui, sg = 1)

In [None]:
#word_embedding C
control_model = Word2Vec(sentences= out_controllo , sg=1)

In [None]:
#save the models (already_done)
depressed_model.save("depressed.model")
suicide_model.save('suicide.model')
control_model.save('control_model')

In [None]:
def get_comparable_datasets(gold_dataset, dataset2):
    comparable_gold = []
    dataset2_sim = []
    for t1,t2,sim in gold_dataset: 
        try: 
            similarity = dataset2.wv.similarity(t1,t2)
            dataset2_sim.append((t1,t2,similarity))
            comparable_gold.append((t1,t2,sim))
        except KeyError: 
            pass
    sorted_gold = sorted(comparable_gold, key = lambda x : x[2], reverse=False)
    sorted_dataset2 = sorted(dataset2_sim, key = lambda x : x[2], reverse=False)
    return sorted_gold, sorted_dataset2

In [None]:
from scipy.stats import spearmanr

# experimental depression
gold, model = get_comparable_datasets(MEN, depressed_model)
mental = spearmanr(gold, model, axis=None )
print(f"Spearman's correlation between depressed model and MEN gold dataset is:\n\t{mental}")


# experimental suicide
gold, model = get_comparable_datasets(MEN, suicide_model)
mental = spearmanr(gold, model, axis=None )
print(f"Spearman's correlation between suicide model and MEN gold dataset is:\n\t{mental}")


# control
gold2, model2 = get_comparable_datasets(MEN, control_model)
not_mental = spearmanr(gold2, model2, axis=None )
print(f"Spearman's correlation between control model and MEN gold dataset is:\n\t{not_mental}")

Spearman's correlation between depressed model and MEN gold dataset is:
	SpearmanrResult(correlation=0.7742795450832266, pvalue=0.0)
Spearman's correlation between suicide model and MEN gold dataset is:
	SpearmanrResult(correlation=0.7695043710734323, pvalue=0.0)
Spearman's correlation between control model and MEN gold dataset is:
	SpearmanrResult(correlation=0.7723576087829946, pvalue=0.0)


### Here I manually create the clusters and try to evaluate the distance between the centroids both in the experimental and in the control group

In [None]:
#I define the cosine similarity function 
def cosine_similarity(a, b):
    nominator = np.dot(a, b)
    
    a_norm = np.sqrt(np.sum(a**2))
    b_norm = np.sqrt(np.sum(b**2))
    
    denominator = a_norm * b_norm
    
    cosine_similarity = nominator / denominator
    
    return cosine_similarity

#### Here I want to test the 1st hypothesis of my project: 
In text related to depression, we expect a greater distance between first person singular pronouns and other pronouns rather than in the text coming from other subreddits (control groups)

In [None]:
#depression clusters
v_ME_d = depressed_model.wv['me']
v_I_d = depressed_model.wv['i']
v_SELF_d = depressed_model.wv['myself'] 
v_MY_d = depressed_model.wv['my']
centroid_SELF_d = (v_ME_d + v_I_d +v_SELF_d + v_MY_d)/4


v_OTHER_d = depressed_model.wv['we']
v_ALL_d = depressed_model.wv['all']
v_TOG_d = depressed_model.wv['together']
v_THEY_d = depressed_model.wv['they']
v_YOU_d = depressed_model.wv['you']
v_DO_d = depressed_model.wv['us']
centroid_OTHER_d = (v_OTHER_d+ v_ALL_d+ v_TOG_d + v_THEY_d + v_YOU_d + v_DO_d )/6

In [None]:
#control clusters 
v_ME_c = control_model.wv['me']
v_I_c = control_model.wv['i']
v_SELF_c = control_model.wv['self'] 
v_MY_c = control_model.wv['my']
centroid_SELF_c = (v_ME_c + v_I_c +v_SELF_c + v_MY_c)/4


v_WE_c = control_model.wv['we']
v_ALL_c = control_model.wv['all']
v_TOG_c = control_model.wv['together']
v_THEY_c = control_model.wv['they']
v_YOU_c = control_model.wv['you']
v_US_c = control_model.wv['us']
centroid_OTHER_c = (v_WE_c + v_ALL_c + v_TOG_c + v_THEY_c + v_YOU_c + v_US_c)/6

In [None]:
similarity_depression = cosine_similarity(centroid_SELF_d,centroid_OTHER_d) #should be smaller than the one in the control group <and it's like that>
similarity_depression

0.59665304

In [None]:
similarity_control = cosine_similarity(centroid_SELF_c,centroid_OTHER_c) 
similarity_control

0.66783434

#### Here I want to test the 2nd hypothesis of my project:

In text related to suicidal ideation,we expect a smaller distance between first person singular pronouns and death/negative related works rather than in the text coming from other subreddits (control groups)

In [None]:
#suicide clusters
v_GR_s = suicide_model.wv['grave']
v_DE_s = suicide_model.wv['death']
v_DY_s = suicide_model.wv['dying']
v_END_s = suicide_model.wv['end']
v_LI_s = suicide_model.wv['final']
s = suicide_model.wv['exit']
centroid_DEATH_s = (v_GR_s + v_DE_s +v_DY_s + v_END_s + v_LI_s + s)/6


#v_ME_s = suicide_model.wv['me']
#v_SELF_s = suicide_model.wv['myself']
v_I_s = suicide_model.wv['i']
#v_MY_s = suicide_model.wv['my']
centroid_SELF_s = (v_ME_s + v_SELF_s +v_I_s + v_MY_s)/4

In [None]:
#control clusters 
#centroid_SELF_c taken from above
v_GR_c = control_model.wv['grave']
v_DE_c = control_model.wv['death']
v_DY_c = control_model.wv['dying']
v_END_c = control_model.wv['end']
v_LI_c = control_model.wv['final']
centroid_DEATH_c = (v_GR_c + v_DE_c + v_DY_c + v_END_c + v_LI_c )/5

In [None]:
#suicide: similarity between I and death
cosine_similarity(v_I_s,centroid_DEATH_s)

0.3620802

In [None]:
#control: similarity between I and death
cosine_similarity(v_I_c ,centroid_DEATH_c)

0.43599802

#### I try to use a pretrained model:
I think it's more reliable in testing this second hypothesis. I put into practice something that originally the professor suggested me

In [None]:
import gensim.downloader
from gensim.models import KeyedVectors
# Show all available models in gensim-data
#print(list(gensim.downloader.info()['models'].keys()))

model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Computational_Linguistics_Project /GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
#new control clusters 
v1_c = model['me']
v2_c = model['i']
v3_c = model['myself']
v4_c = model['my']
primo_centroid = (v1_c+ v2_c+ v3_c+ v4_c)/4


v5_c = model['grave']
v6_c = model['death']
v7_c = model['dying']
v8_c = model['end']
v9_c = model['final']
secondo_centroid = (v5_c + v6_c + v7_c+  v8_c + v9_c )/5

In [None]:
#control: similarity between I and death
cosine_similarity(v2_c ,secondo_centroid)

0.07756988