# Doc2vec Analysis of Submissions
#### This notebook extracts the DBOW and DM Doc2Vec embeddings for submissions and subreddits and finds SuicideWatch's nearest neighbors in both models

## *Setup*

### Import dependencies

In [1]:
import gensim
import numpy as np
import pandas as pd
import collections
from operator import itemgetter
from scipy.spatial.distance import cosine as cosine_distance
import time
import datetime
import gc

### Import data

In [2]:
# Location of the prepared Reddit sample. `datasource` keeps its trailing
# slash because paths in this notebook are built by plain string concatenation.
datasource = "/media/seagate0/reddit/samples/"
filesource = "sample_main_50elsm_data_prep.csv"
filepath = "".join((datasource, filesource))

In [3]:
%%time
# Load the prepared submissions and discard every row that has a missing value
data = pd.read_csv(filepath).dropna()
data.shape

CPU times: user 15.2 s, sys: 1 s, total: 16.2 s
Wall time: 16.2 s


In [None]:
data.head()

### Prepare data for embedding (skip below if already trained/saved)

In [None]:
# Print example of tokenized submission
print(data.iloc[0]["text_prep"].split()[:20])
print(data.iloc[0]["submission_id"])
print(data.iloc[0]["subreddit"])

In [6]:
# Define function to convert text and tags into a doc2vec TaggedDocument object
def read_corpus(data=data, tokens_only=False):
    """Lazily convert prepared submissions into doc2vec input.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``text_prep`` column of whitespace-separated tokens and,
        unless ``tokens_only`` is set, ``submission_id`` and ``subreddit``
        columns. The default is the ``data`` object that existed when this
        cell was executed.
    tokens_only : bool
        If True, yield bare token lists (e.g. for inference). Otherwise yield
        ``gensim.models.doc2vec.TaggedDocument`` objects tagged with both the
        submission id and the subreddit name.

    Yields
    ------
    list of str or TaggedDocument
        One item per row that could be processed. A row whose text cannot be
        tokenized, or whose tags cannot be read, is reported and skipped.
    """
    print("Processing data into doc2vec TaggedDocument object...")
    print('  Timestamp: {}'.format(datetime.datetime.time(datetime.datetime.now())))
    time_start = round(time.time())
    data_len = data.shape[0]
    # Progress is tracked by row position rather than by index label: after
    # rows have been dropped the labels are no longer contiguous, so a label
    # can skip a checkpoint or exceed data_len.
    for pos, (i, line) in enumerate(data.iterrows()):
        if pos in (1000, 10000) or pos % 100000 == 0:
            time_check = round(time.time()-time_start)
            print("  {}s: Processing line {} of {}...".format(time_check,pos,data_len))
        try:
            tokens = line["text_prep"].split()
        except Exception:
            print("  Error at line {}:\n{}".format(i,line))
            # Skip the row; falling through would emit the previous row's
            # tokens under this row's tags.
            continue
        if tokens_only:
            # The yield is deliberately outside any try/except so that closing
            # or interrupting the generator is not swallowed.
            yield tokens
            continue
        # For training data, add tags
        try:
            subm_id = line["submission_id"]
            subr_id = line["subreddit"]
            tagged = gensim.models.doc2vec.TaggedDocument(tokens, [subm_id,subr_id])
        except Exception:
            print("  Error at line {}:\n{}".format(i,line))
            continue
        yield tagged
    print("Finished processing data into doc2vec TaggedDocument object")
    print('  Timestamp: {}'.format(datetime.datetime.time(datetime.datetime.now())))

In [9]:
%%time
# Convert text and tags into doc2vec TaggedDocument objects.
# The generator is materialized into a list because the corpus is iterated more
# than once afterwards (by build_vocab and again by train).
corpus_train = list(read_corpus(data, tokens_only=False))
# The tokens-only test corpus is currently disabled, so this cell does not
# define `corpus_test`.
#corpus_test = list(read_corpus(data[:100], tokens_only=True))

Processing data into doc2vec TaggedDocument object...
  Timestamp: 14:22:04.354651
  1s: Processing line 0 of 1200579...
  1s: Processing line 1000 of 1200579...
  2s: Processing line 10000 of 1200579...
  11s: Processing line 100000 of 1200579...
  23s: Processing line 200000 of 1200579...
  33s: Processing line 300000 of 1200579...
  44s: Processing line 400000 of 1200579...
  56s: Processing line 500000 of 1200579...
  68s: Processing line 600000 of 1200579...
  81s: Processing line 700000 of 1200579...
  91s: Processing line 800000 of 1200579...
  105s: Processing line 900000 of 1200579...
  114s: Processing line 1000000 of 1200579...
  129s: Processing line 1100000 of 1200579...
  138s: Processing line 1200000 of 1200579...
Finished processing data into doc2vec TaggedDocument object
  Timestamp: 14:24:22.490976
CPU times: user 2min 12s, sys: 5.91 s, total: 2min 18s
Wall time: 2min 18s


In [None]:
# Inspect training TaggedDocument object
corpus_train[0]

In [None]:
# Inspect a tokens-only test corpus.
# The corpus is built here because no other cell assigns `corpus_test`
# (the only assignment is commented out), so printing it raised a NameError.
corpus_test = list(read_corpus(data[:100], tokens_only=True))
print(corpus_test[:2])

## *Training*

### Instantiate doc2vec object and vocabulary -- DMM

In [11]:
# Distributed-memory doc2vec model with averaged context vectors
# (printed below as "dm/m"). Parameter notes follow gensim's Doc2Vec documentation.
model_dmm = gensim.models.doc2vec.Doc2Vec(
    dm=1,  # distributed memory (PV-DM) training algorithm
    dm_concat=0,  # do not concatenate context vectors ...
    dm_mean=1,  # ... use their mean instead
    vector_size=50,  # dimensionality of the learned vectors
    window=10,  # context window size
    min_count=2,  # ignore words seen fewer than 2 times
    negative=5,  # negative sampling with 5 noise words
    epochs=40,  # passes over the corpus during train()
    workers=32  # NOTE(review): hard-coded thread count; confirm it suits the host machine
)
print(model_dmm)

Doc2Vec(dm/m,d50,n5,w10,mc2,s0.001,t32)


In [12]:
%%time
model_dmm.build_vocab(corpus_train)

CPU times: user 1min 44s, sys: 353 ms, total: 1min 44s
Wall time: 1min 44s


### Train the doc2vec model

In [13]:
%%time
model_dmm.train(corpus_train, total_examples=model_dmm.corpus_count, epochs=model_dmm.epochs)

CPU times: user 15h 40min 54s, sys: 23min 49s, total: 16h 4min 44s
Wall time: 4h 47min 10s


### Infer a vector

In [71]:
test_doc = "I do n't care about my family anymore . I know they love me . I know they 'll suffer a lot if i kill myself , but i just dont care anymore .".lower().split()
test_doc_dmm = model_dmm.infer_vector(test_doc, steps=200, alpha=0.025)
print("shape:", test_doc_dmm.shape)
test_doc_dmm

shape: (50,)


array([-0.16079564,  0.07370246, -0.12908716, -0.27815023,  0.09495931,
       -0.05440789, -0.05641905,  0.2553274 , -0.10173523,  0.16145292,
       -0.05058431,  0.12746558,  0.1476429 , -0.19527781, -0.00107579,
       -0.02528723,  0.08655364, -0.01832677, -0.08826234, -0.13473023,
        0.00883819, -0.10091036, -0.19562408,  0.35356927,  0.19241996,
       -0.13329263, -0.16932149,  0.05248863,  0.01157821,  0.11397031,
       -0.12884852,  0.11231311, -0.08106791,  0.0162975 , -0.02429766,
       -0.1850096 , -0.09072895,  0.02642971, -0.12796813, -0.20217827,
       -0.01417279, -0.02537065,  0.01768991, -0.05502219, -0.15525669,
        0.08895854, -0.02833021, -0.17687939,  0.20146425, -0.01183373],
      dtype=float32)

### Save model

In [15]:
model_dmm.save("/media/seagate0/reddit/models/doc2vec_dmm50_sample_main_50elsm.model")
print("Saved model")

Saved model


### Instantiate doc2vec object and vocabulary -- DBOW

In [16]:
# Distributed bag-of-words doc2vec model (printed below as "dbow").
# Parameter notes follow gensim's Doc2Vec documentation.
model_dbow = gensim.models.doc2vec.Doc2Vec(
    dm=0,  # distributed bag of words (PV-DBOW) training algorithm
    dbow_words=0,  # do not train word vectors alongside the doc vectors
    dm_concat=0,  # NOTE(review): documented as a DM-mode option; presumably no effect with dm=0
    dm_mean=1,  # NOTE(review): documented as a DM-mode option; presumably no effect with dm=0
    vector_size=50,  # dimensionality of the learned vectors
    window=10,  # NOTE(review): the printed model repr omits the window for dbow; likely unused here
    min_count=2,  # ignore words seen fewer than 2 times
    negative=5,  # negative sampling with 5 noise words
    epochs=40,  # passes over the corpus during train()
    workers=32  # NOTE(review): hard-coded thread count; confirm it suits the host machine
)
print(model_dbow)

Doc2Vec(dbow,d50,n5,mc2,s0.001,t32)


In [17]:
%%time
model_dbow.build_vocab(corpus_train)

CPU times: user 1min 31s, sys: 213 ms, total: 1min 31s
Wall time: 1min 31s


### Train the doc2vec model

In [18]:
%%time
model_dbow.train(corpus_train, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs)

CPU times: user 16h 56min 43s, sys: 6min 38s, total: 17h 3min 21s
Wall time: 3h 17min 17s


### Infer a vector

In [70]:
test_doc_dbow = model_dbow.infer_vector(test_doc, steps=200, alpha=0.025)
print("shape:", test_doc_dbow.shape)
test_doc_dbow

shape: (50,)


array([-0.12393504,  0.08245785, -0.20959017,  0.09498589,  0.04521151,
       -0.37901607,  0.18479787,  0.08828386, -0.06929699, -0.16781302,
        0.04701287, -0.05489627,  0.2943915 ,  0.10640723,  0.24301195,
       -0.30618152, -0.01532197, -0.1786422 , -0.02829616,  0.05098295,
        0.24090727, -0.38026774, -0.18207112, -0.0344217 , -0.19350582,
        0.14370978, -0.04678387,  0.17149614, -0.00754963,  0.23412168,
       -0.08593183, -0.16377133,  0.04720971,  0.08139323, -0.00694396,
       -0.33116597,  0.20933862,  0.07564811, -0.13462025, -0.00746284,
       -0.05642885, -0.22911847, -0.1206275 ,  0.24879058,  0.13675421,
       -0.11179908,  0.28212434,  0.06215185, -0.14156468,  0.03243285],
      dtype=float32)

### Save model

In [20]:
model_dbow.save("/media/seagate0/reddit/models/doc2vec_dbow50_sample_main_50elsm.model")
print("Saved model")

Saved model


## *Evaluating*

### Load models (if already saved)

In [27]:
model_dmm = gensim.models.doc2vec.Doc2Vec.load("/media/seagate0/reddit/models/doc2vec_dmm50_sample_main_50elsm.model")
print("Loaded model:", model_dmm)

Loaded model: Doc2Vec(dm/m,d50,n5,w10,mc2,s0.001,t32)


In [5]:
model_dbow = gensim.models.doc2vec.Doc2Vec.load("/media/seagate0/reddit/models/doc2vec_dbow50_sample_main_50elsm.model")
print("Loaded model:", model_dbow)

Loaded model: Doc2Vec(dbow,d50,n5,mc2,s0.001,t32)


### Combine DMM & DBOW (only needed for prediction tasks)

In [21]:
# ConcatenatedDoc2Vec is imported from gensim's test module
# (gensim.test.test_doc2vec), not from the public model API.
# NOTE(review): availability may vary between gensim versions — confirm before upgrading.
# It wraps both trained models so that a single combined vector can be inferred.
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
model_combo = ConcatenatedDoc2Vec([model_dmm, model_dbow])

In [72]:
test_doc_combo = model_combo.infer_vector(test_doc, steps=200, alpha=0.025)
print("shape:", test_doc_combo.shape)
test_doc_combo

shape: (100,)


array([-0.17788772,  0.06824362, -0.14447175, -0.37007782,  0.0455959 ,
       -0.08171896, -0.09022342,  0.29001012, -0.1611098 ,  0.13808334,
       -0.00350697,  0.08476301,  0.16170874, -0.16469066,  0.0363806 ,
       -0.05345469,  0.05140643, -0.05005645, -0.14914726, -0.12469329,
        0.07484219, -0.11463426, -0.13967685,  0.36009094,  0.05566248,
       -0.13267234, -0.17218304,  0.07263025, -0.0202819 ,  0.15231906,
       -0.17784648,  0.02100503, -0.18189642,  0.07895544, -0.06447177,
       -0.18338837, -0.08714453,  0.03098639, -0.13841623, -0.17454165,
        0.070465  ,  0.01505003,  0.05237506, -0.02842668, -0.19627875,
        0.12319263,  0.05189837, -0.1270181 ,  0.21594492, -0.07609142,
       -0.11423831,  0.09388574, -0.20183271,  0.17691752,  0.03234727,
       -0.3304911 ,  0.22366333,  0.08581357, -0.05267319, -0.10281381,
       -0.06877119, -0.05021801,  0.3093455 ,  0.09015313,  0.21809936,
       -0.25132445, -0.06733897, -0.14863732,  0.05652968,  0.07

### Evaluate models

In [74]:
# Compare the two models' inferred vectors for test_doc.
# NOTE: cosine_distance is scipy.spatial.distance.cosine, so the printed value
# is a distance (1 - cosine similarity), not a similarity.
# NOTE(review): the two models were trained separately, so their vector spaces
# are presumably not aligned and this number may not be meaningful — confirm.
print("DMM vs DBOW:", cosine_distance(test_doc_dmm, test_doc_dbow))

DMM vs DBOW: 0.9307567030191422


In [75]:
%%time
# Self-similarity check: re-infer a vector for each of the first 100 training
# documents and record where the document's own first tag ranks among all
# learned doc vectors (rank 0 means it is its own nearest neighbour).
#    See github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb
for model in (model_dmm, model_dbow):
    print("Evaluating", str(model))
    ranks, second_ranks = [], []
    for doc_id in range(100):
        if not doc_id % 20:
            print("  Evaluating doc_id", doc_id)
        tagged_doc = corpus_train[doc_id]
        doc_tag = tagged_doc.tags[0]
        iv = model.infer_vector(tagged_doc.words, steps=200, alpha=0.025)
        # Rank against every learned doc vector, not just the top few
        sims = model.docvecs.most_similar([iv], topn=len(model.docvecs))
        ranked_tags = [tag for tag, _ in sims]
        rank = ranked_tags.index(doc_tag)
        ranks.append(rank)
        second_ranks.append(sims[1])
    rank_counts = collections.Counter(ranks)
    print(str(model), "-->", sorted(rank_counts.items(), key=itemgetter(0)))

Evaluating Doc2Vec(dm/m,d50,n5,w10,mc2,s0.001,t32)
  Evaluating doc_id 0
  Evaluating doc_id 20
  Evaluating doc_id 40
  Evaluating doc_id 60
  Evaluating doc_id 80
Doc2Vec(dm/m,d50,n5,w10,mc2,s0.001,t32) --> [(0, 2), (11, 1), (31, 1), (480, 1), (613, 1), (898, 1), (1188, 1), (1397, 1), (4688, 1), (5173, 1), (5815, 1), (5899, 1), (7145, 1), (8596, 1), (13098, 1), (18784, 1), (20792, 1), (22351, 1), (22705, 1), (26195, 1), (26399, 1), (28464, 1), (30144, 1), (31574, 1), (34297, 1), (35001, 1), (35776, 1), (38311, 1), (39520, 1), (39814, 1), (42678, 1), (45249, 1), (47348, 1), (48991, 1), (50274, 1), (51826, 1), (54649, 1), (55083, 1), (56472, 1), (56948, 1), (57073, 1), (58252, 1), (59496, 1), (61326, 1), (62491, 1), (64481, 1), (65212, 1), (67194, 1), (71521, 1), (72853, 1), (75917, 1), (77070, 1), (77294, 1), (77589, 1), (77983, 1), (79765, 1), (80490, 1), (82820, 1), (82946, 1), (82973, 1), (85065, 1), (85235, 1), (86527, 1), (87289, 1), (87511, 1), (88470, 1), (90641, 1), (91363, 1)

In [32]:
# Print the 20 most similar dmm doctags to SuicideWatch
model_dmm.docvecs.most_similar(positive=['SuicideWatch'], topn=20)

[('depression', 0.9886905550956726),
 ('depressed', 0.9801347255706787),
 ('depression_help', 0.9759532809257507),
 ('Suicide_help', 0.9659072756767273),
 ('getting_over_it', 0.9646139144897461),
 ('Prevent_Suicide', 0.9627123475074768),
 ('mentalhealth', 0.9618516564369202),
 ('sad', 0.9572824835777283),
 ('MMFB', 0.9571409225463867),
 ('SanctionedSuicide', 0.949762225151062),
 ('venting', 0.9487173557281494),
 ('suicidenotes', 0.9485184550285339),
 ('mentalillness', 0.945374608039856),
 ('ptsd', 0.9435155987739563),
 ('BPD', 0.9409050941467285),
 ('MyLittleSupportGroup', 0.9379068613052368),
 ('suicidalthoughts', 0.935742974281311),
 ('selfhelp', 0.9357046484947205),
 ('whatsbotheringyou', 0.9351156949996948),
 ('MomForAMinute', 0.9350012540817261)]

In [6]:
# Print the 20 most similar dbow doctags to SuicideWatch
model_dbow.docvecs.most_similar(positive=['SuicideWatch'], topn=20)

[('depression', 0.9595265984535217),
 ('MMFB', 0.9271743297576904),
 ('depression_help', 0.9236374497413635),
 ('whatsbotheringyou', 0.9202948212623596),
 ('depressed', 0.9156358242034912),
 ('Suicide_help', 0.9130778312683105),
 ('sad', 0.9110236763954163),
 ('suicidenotes', 0.9109750986099243),
 ('offmychest', 0.9064748883247375),
 ('SanctionedSuicide', 0.9048740267753601),
 ('getting_over_it', 0.9028440713882446),
 ('venting', 0.8982775807380676),
 ('mentalhealth', 0.8886355757713318),
 ('selfhelp', 0.8877691626548767),
 ('Vent', 0.8870785236358643),
 ('MomForAMinute', 0.8847565054893494),
 ('helpmecope', 0.8794957995414734),
 ('mentalillness', 0.876664936542511),
 ('MyLittleSupportGroup', 0.8710669875144958),
 ('needadvice', 0.8694227337837219)]

In [None]:
# Compare similarity between SuicideWatch and first submission
print("1st subm (non-SW):", str(data[data["submission_id"] == "10009b"]["text_prep"].values)[1:100], "...")
print("  DMM sim between 1st subm and SW: ", model_dmm.docvecs.similarity("SuicideWatch", "10009b"))
print("  DBOW sim between 1st subm and SW:", model_dbow.docvecs.similarity("SuicideWatch", "10009b"))

In [None]:
sw_subms = data[data["subreddit"] == "SuicideWatch"][:10]["submission_id"].values
print("First 10 SuicideWatch submission ids:", " ".join(list(sw_subms)))
for subm_idx in sw_subms:
    print("--------------------")
    print(
        subm_idx,
        str(data[data["submission_id"] == subm_idx]["text_prep"].values)[:250]
    )

In [79]:
# Compare SW submissions to SW with learned PV
def _print_learned_sims(fitted_model):
    """Print each sampled SW submission's learned-vector similarity to three subreddit tags."""
    doc_vectors = fitted_model.docvecs
    for subm_id in sw_subms:
        sw_score = doc_vectors.similarity("SuicideWatch", subm_id)
        print("Similarity between {} and SW: {}".format(subm_id, sw_score))
        depression_score = doc_vectors.similarity("depression", subm_id)
        print("              ... vs. depression: {}".format(depression_score))
        selfhelp_score = doc_vectors.similarity("selfhelp", subm_id)
        print("                ... vs. selfhelp: {}".format(selfhelp_score))

print("DMM evals...")
_print_learned_sims(model_dmm)
print("---------")
print("DBOW evals...")
_print_learned_sims(model_dbow)

DMM evals...
Similarity between 100wft and SW: 0.19290207641774065
              ... vs. depression: 0.20347020772973845
                ... vs. selfhelp: 0.2778982650424426
Similarity between 101z2w and SW: -0.09868440731064658
              ... vs. depression: -0.1271077981994744
                ... vs. selfhelp: -0.10599970355361828
Similarity between 102igs and SW: -0.02867933409854646
              ... vs. depression: -0.06243201521785505
                ... vs. selfhelp: -0.008436968226964738
Similarity between 10314t and SW: 0.35937798778025076
              ... vs. depression: 0.3191908527418925
                ... vs. selfhelp: 0.28700990471981713
Similarity between 1032pn and SW: -0.14940028491681395
              ... vs. depression: -0.1761693342759036
                ... vs. selfhelp: -0.24748740416381787
Similarity between 103k4o and SW: 0.11575966796116124
              ... vs. depression: 0.15229752182586315
                ... vs. selfhelp: 0.1992856629211117
Similarity

In [84]:
# Compare SW submissions to SW using inferred vectors
sw_pv_dmm = model_dmm.docvecs["SuicideWatch"]
sw_pv_dbow = model_dbow.docvecs["SuicideWatch"]
for idx in sw_subms:
    # Read the prepared text directly from the dataframe. Slicing the string
    # form of the values array corrupts tokens whenever the array repr escapes
    # characters (quotes, backslashes, newlines) or holds more than one match.
    subm_iv_text = data.loc[data["submission_id"] == idx, "text_prep"].iloc[0].split()
    subm_iv = model_dmm.infer_vector(subm_iv_text, steps=200, alpha=0.025)
    # cosine_distance is a distance, so 1 - distance gives the cosine similarity
    print("DMM  similarity between {} and SW: {}".format(idx, 1-cosine_distance(subm_iv, sw_pv_dmm)))
    subm_iv = model_dbow.infer_vector(subm_iv_text, steps=200, alpha=0.025)
    print("DBOW similarity between {} and SW: {}".format(idx, 1-cosine_distance(subm_iv, sw_pv_dbow)))

DMM  similarity between 100wft and SW: 0.5994322299957275
DBOW similarity between 100wft and SW: 0.775780975818634
DMM  similarity between 101z2w and SW: 0.5643561482429504
DBOW similarity between 101z2w and SW: 0.7402441501617432
DMM  similarity between 102igs and SW: 0.48693186044692993
DBOW similarity between 102igs and SW: 0.7305420637130737
DMM  similarity between 10314t and SW: 0.4699523448944092
DBOW similarity between 10314t and SW: 0.8405201435089111
DMM  similarity between 1032pn and SW: 0.433451384305954
DBOW similarity between 1032pn and SW: 0.7198936939239502
DMM  similarity between 103k4o and SW: 0.44383272528648376
DBOW similarity between 103k4o and SW: 0.813504695892334
DMM  similarity between 103ob2 and SW: 0.3849857449531555
DBOW similarity between 103ob2 and SW: 0.7963464260101318
DMM  similarity between 1053t3 and SW: 0.5498508810997009
DBOW similarity between 1053t3 and SW: 0.7747808694839478
DMM  similarity between 105zh8 and SW: 0.4453285336494446
DBOW similarity

In [87]:
# Determine how similar gensim and scipy cosine estimates are.
# Both printed values are similarities (1 - distance), despite the labels.
# The gensim line compares the vectors learned during training, while the manual
# line compares freshly inferred vectors, so the two are not expected to match.
# The tokens do not depend on the model, so they are looked up once, directly
# from the dataframe rather than by slicing the string form of the values array
# (which corrupts tokens whenever the array repr escapes characters).
tokens_1032pn = data.loc[data["submission_id"] == "1032pn", "text_prep"].iloc[0].split()
tokens_103ob2 = data.loc[data["submission_id"] == "103ob2", "text_prep"].iloc[0].split()
for model in [model_dmm, model_dbow]:
    sw_1032pn = model.infer_vector(tokens_1032pn, steps=200, alpha=0.025)
    sw_103ob2 = model.infer_vector(tokens_103ob2, steps=200, alpha=0.025)
    print("Comparing {}:".format(str(model)))
    print("  Gensim cosine distance estimate:", model.docvecs.similarity("1032pn", "103ob2"))
    print("  Manual cosine distance estimate:", 1-cosine_distance(sw_1032pn, sw_103ob2))

Comparing Doc2Vec(dm/m,d50,n5,w10,mc2,s0.001,t32):
  Gensim cosine distance estimate: 0.3848767234303422
  Manual cosine distance estimate: 0.9439608454704285
Comparing Doc2Vec(dbow,d50,n5,mc2,s0.001,t32):
  Gensim cosine distance estimate: 0.5761921647955006
  Manual cosine distance estimate: 0.8182548880577087


## *Create new average doc vector for SuicideWatch*

## *Calculate DBOW and DMM similarity of each submission to SuicideWatch*

In [7]:
# Define function to estimate similarities between submissions' PV and SW's PV
def get_sw_sim(data=data, comparison="SuicideWatch", model=model_dbow):
    """Yield each submission's cosine similarity to a reference doc tag.

    For every row, a vector is inferred from the row's ``text_prep`` tokens
    and compared with the learned vector of ``comparison``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``text_prep`` column of whitespace-separated tokens.
        The default is the ``data`` object that existed when this cell ran.
    comparison : str
        Doc tag whose learned vector is the reference (default "SuicideWatch").
    model : gensim Doc2Vec model
        Model used both to look up the reference vector and to infer the
        per-submission vectors. The default is the ``model_dbow`` object that
        existed when this cell ran.

    Yields
    ------
    float
        ``1 - cosine distance``, i.e. cosine similarity, one value per row and
        in row order. A row that cannot be processed is reported and yields
        NaN, so the output stays aligned with ``data`` when stored as a column.
    """
    print("Estimating similarities to SW's PV...")
    print('  Timestamp: {}'.format(datetime.datetime.time(datetime.datetime.now())))
    time_start = round(time.time())
    # Learned vector of the reference tag; honours `comparison` instead of a
    # hard-coded "SuicideWatch".
    sw_pv = model.docvecs[comparison]
    data_len = data.shape[0]
    # Estimate similarities. Progress is tracked by row position because index
    # labels are not contiguous once rows have been dropped.
    for pos, (i, line) in enumerate(data.iterrows()):
        if pos in (100, 1000, 10000) or pos % 100000 == 0:
            time_check = round(time.time()-time_start)
            print("  {}s: Processing line {} of {}...".format(time_check,pos,data_len))
        # Get submission's tokens in list, infer vector, estimate cosine similarity
        try:
            subm_tokens = line["text_prep"].split()
            inferred_vec = model.infer_vector(subm_tokens, steps=200, alpha=0.025)
            sw_sim = 1-cosine_distance(inferred_vec, sw_pv)
        except Exception:
            print("  Error at line {}:\n{}".format(i,line))
            # Placeholder keeps one output per input row
            sw_sim = float("nan")
        # Yield outside the try block so closing or interrupting the generator
        # is not mistaken for a row error.
        yield sw_sim
    print("Finished estimating similarities to SW")
    print('  Timestamp: {}'.format(datetime.datetime.time(datetime.datetime.now())))

In [8]:
%%time
# Get the dbow similarities for each submission
sw_dist = list(get_sw_sim(data, "SuicideWatch", model_dbow))

Estimating similarities to SW's PV...
  Timestamp: 11:01:15.618084
  0s: Processing line 0 of 1200579...
  5s: Processing line 100 of 1200579...
  49s: Processing line 1000 of 1200579...
  473s: Processing line 10000 of 1200579...
  4953s: Processing line 100000 of 1200579...
  10057s: Processing line 200000 of 1200579...
  15284s: Processing line 300000 of 1200579...
  20445s: Processing line 400000 of 1200579...
  25777s: Processing line 500000 of 1200579...
  31125s: Processing line 600000 of 1200579...
  36508s: Processing line 700000 of 1200579...
  41839s: Processing line 800000 of 1200579...
  47174s: Processing line 900000 of 1200579...
  52407s: Processing line 1000000 of 1200579...
  57400s: Processing line 1100000 of 1200579...
  62042s: Processing line 1200000 of 1200579...
Finished estimating similarities to SW
  Timestamp: 04:15:46.720666
CPU times: user 17h 13min 41s, sys: 1.66 s, total: 17h 13min 43s
Wall time: 17h 14min 31s


In [None]:
# Add the dbow similarities to the dataframe
data["sw_dist"] = sw_dist

In [None]:
data.head(10)

In [19]:
print("Mean:", data["sw_dist"].mean())
print("SD:  ", data["sw_dist"].std())

Mean: 0.4470653856386173
SD:   0.15465851526514757


In [None]:
data[data["subreddit"] == "SuicideWatch"].head(10)

In [23]:
print("Mean:", data[data["subreddit"] == "SuicideWatch"]["sw_dist"].mean())
print("SD:  ", data[data["subreddit"] == "SuicideWatch"]["sw_dist"].std())

Mean: 0.7693482507628919
SD:   0.06685976453991845


In [None]:
data[data["subreddit"] == "depression"].head(10)

In [24]:
print("Mean:", data[data["subreddit"] == "depression"]["sw_dist"].mean())
print("SD:  ", data[data["subreddit"] == "depression"]["sw_dist"].std())

Mean: 0.7309755063185545
SD:   0.07863054724197381


In [None]:
data[data["subreddit"] == "Anxiety"].head(10)

In [25]:
print("Mean:", data[data["subreddit"] == "Anxiety"]["sw_dist"].mean())
print("SD:  ", data[data["subreddit"] == "Anxiety"]["sw_dist"].std())

Mean: 0.660965822677549
SD:   0.0927849824269911


In [17]:
# Save the updated dataframe, including the dbow column "sw_dist".
# Despite its name, that column holds cosine similarities (1 - cosine distance).
data.to_csv(datasource+"sample_main_50elsm_data_prep_dist.csv", index=False)

In [28]:
%%time
# Get the dmm similarities for each submission
sw_dist_dmm = list(get_sw_sim(data, "SuicideWatch", model_dmm))

Estimating similarities to SW's PV...
  Timestamp: 08:25:06.103556
  0s: Processing line 0 of 1200579...
  8s: Processing line 100 of 1200579...
  67s: Processing line 1000 of 1200579...
  647s: Processing line 10000 of 1200579...
  6756s: Processing line 100000 of 1200579...
  13714s: Processing line 200000 of 1200579...
  20821s: Processing line 300000 of 1200579...
  27852s: Processing line 400000 of 1200579...
  35128s: Processing line 500000 of 1200579...
  42417s: Processing line 600000 of 1200579...
  49762s: Processing line 700000 of 1200579...
  57019s: Processing line 800000 of 1200579...
  64329s: Processing line 900000 of 1200579...
  71490s: Processing line 1000000 of 1200579...
  78317s: Processing line 1100000 of 1200579...
  84676s: Processing line 1200000 of 1200579...
Finished estimating similarities to SW
  Timestamp: 07:57:01.538271
CPU times: user 23h 30min 42s, sys: 3.67 s, total: 23h 30min 46s
Wall time: 23h 31min 55s


In [29]:
# Add the dmm similarities to the dataframe
data["sw_dist_dmm"] = sw_dist_dmm

In [None]:
data.head(10)

In [32]:
print("Mean:", data["sw_dist_dmm"].mean())
print("SD:  ", data["sw_dist_dmm"].std())

Mean: 0.3980487593368509
SD:   0.08869275349518795


In [None]:
data[data["subreddit"] == "SuicideWatch"].head(10)

In [34]:
print("Mean:", data[data["subreddit"] == "SuicideWatch"]["sw_dist_dmm"].mean())
print("SD:  ", data[data["subreddit"] == "SuicideWatch"]["sw_dist_dmm"].std())

Mean: 0.49487932934021917
SD:   0.057227256534392035


In [None]:
data[data["subreddit"] == "depression"].head(10)

In [36]:
print("Mean:", data[data["subreddit"] == "depression"]["sw_dist_dmm"].mean())
print("SD:  ", data[data["subreddit"] == "depression"]["sw_dist_dmm"].std())

Mean: 0.48059888200494016
SD:   0.051679910625173


In [None]:
data[data["subreddit"] == "Anxiety"].head(10)

In [38]:
print("Mean:", data[data["subreddit"] == "Anxiety"]["sw_dist_dmm"].mean())
print("SD:  ", data[data["subreddit"] == "Anxiety"]["sw_dist_dmm"].std())

Mean: 0.46543419159674543
SD:   0.04707352189266531


In [31]:
# Save the dataframe again, now also holding the dmm column "sw_dist_dmm".
# This writes to the same path as the earlier dbow save and overwrites that file.
data.to_csv(datasource+"sample_main_50elsm_data_prep_dist.csv", index=False)