In [1]:
# Install dependencies
#!pip install texthero

In [2]:
# Source for code: https://towardsdatascience.com/how-to-vectorize-text-in-dataframes-for-nlp-tasks-3-simple-techniques-82925a5600db
# Import dependencies
import pandas as pd
import numpy as np

# Text analysis
import texthero as hero
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
# Load data
df = pd.read_csv("./preproc_data/clean_text_all_subreddits.csv")

# Display
df.head()

Unnamed: 0,subreddit,date,author,id,num_comments,score,clean_text_seq,clean_text,link_flair_text,compound_sent
0,ADHD,01/01/2019,DirtJunkie133,abd11x,13.0,1,"recently diagnosed, need to talk to others who...",recently diagnosed need talk others diagnosed ...,,0.9954
1,ADHD,01/01/2019,Lin_the_pillow_artis,abd7q9,5.0,1,really annoyed at my familys drunk friends so ...,really annoyed family drunk friend family host...,,-0.9354
2,ADHD,01/01/2019,Fleetfeathers,abda0t,12.0,1,the medication journey: a current disappointme...,medication journey current disappointment want...,,0.9233
3,ADHD,01/01/2019,UnleashedDebs,abdd13,4.0,1,"wearables, rem sleep detected while gaming not...",wearable rem sleep detected gaming sleeping hy...,,0.7738
4,ADHD,01/01/2019,liluglee,abdj4w,1.0,1,picking a friend up to carpool to a nye party....,picking friend carpool nye party texted on way...,,0.8591


In [4]:
# Doc2Vec
# Tokenize and tag the text
text_docs = [TaggedDocument(doc.split(' '), [i]) 
             for i, doc in enumerate(df["clean_text"])]

# Display the tagged docs
text_docs

[TaggedDocument(words=['recently', 'diagnosed', 'need', 'talk', 'others', 'diagnosed', 'well', 'im', 'm', 'recently', 'diagnosed', 'adhd', 'month', 'ago', 'ive', 'always', 'felt', 'like', 'it', 'growing', 'house', 'never', 'really', 'went', 'doctor', 'unless', 'missing', 'body', 'part', 'dying', 'i', 'issue', 'late', 'teen', 'early', 'twenty', 'drug', 'use', 'always', 'chasing', 'something', 'clear', 'head', 'eventually', 'landed', 'meth', 'go', 'figure', 'almost', 'year', 'clean', 'working', 'improving', 'mental', 'health', 'so', 'that', 'therapist', 'said', 'see', 'psychiatrist', 'adhd', 'session', 'displayed', 'trait', 'someone', 'adhd', 'doe', 'decided', 'psychiatrist', 'diagnosed', 'adhd', 'were', 'working', 'finding', 'right', 'med', 'started', 'straterra', 'past', 'drug', 'problem', 'wanted', 'try', 'non', 'stimulates', 'first', 'month', 'last', 'checkup', 'talked', 'working', 'wasnt', 'sure', 'helping', 'much', 'id', 'hope', 'notice', 'small', 'improvement', 'focus', 'could', '

In [6]:
# Source for Hyperparameter tuning
# https://medium.com/betacom/hyperparameters-tuning-tf-idf-and-doc2vec-models-73dd418b4d

# Instantiate model
model = Doc2Vec(vector_size=64, window=2, min_count=1, workers=8, epochs=20)

# Build vocab
model.build_vocab(text_docs)

# Train model
model.train(text_docs, total_examples=model.corpus_count, epochs=model.epochs)

# Generate vectors
post2vec = [model.infer_vector((df['clean_text'][i].split(' '))) for i in range(0,len(df['clean_text']))]

# Display
post2vec

[array([ 1.27345055e-01,  8.15616310e-01,  5.67151487e-01, -2.24583790e-01,
        -1.27271974e+00, -1.49275482e-01, -1.24383605e+00, -1.86204035e-02,
        -2.13251591e-01,  5.13118446e-01, -4.43767756e-01,  3.54599267e-01,
         7.53728986e-01,  9.40482616e-02, -6.71257973e-02, -3.27474594e-01,
         8.06503415e-01, -4.04199883e-02, -8.94483984e-01,  7.49679357e-02,
        -8.07351589e-01, -2.38446712e-01,  1.69830456e-01, -6.22102201e-01,
        -2.28641834e-02,  1.03681493e+00, -1.25534177e-01,  2.01365933e-01,
         1.43960118e-02, -1.02893817e+00,  2.26917043e-01, -4.73502949e-02,
        -2.77281970e-01, -6.01733267e-01,  4.11897898e-01, -5.06018363e-02,
         1.32064160e-03,  4.64755744e-01, -7.89630353e-01,  6.53520167e-01,
        -1.63802542e-02, -1.00300714e-01,  3.71835113e-01,  2.43684500e-01,
         1.95635892e-02, -2.36641333e-01, -3.39741170e-01, -1.35344163e-01,
        -2.38853902e-01, -1.95894331e-01, -1.36425838e-01,  2.00922275e+00,
         1.6

In [7]:
# Set list to dataframe column
df['post2vec'] = np.array(post2vec).tolist()

# Display
df

Unnamed: 0,subreddit,date,author,id,num_comments,score,clean_text_seq,clean_text,link_flair_text,compound_sent,post2vec
0,ADHD,01/01/2019,DirtJunkie133,abd11x,13.0,1,"recently diagnosed, need to talk to others who...",recently diagnosed need talk others diagnosed ...,,0.9954,"[0.12734505534172058, 0.8156163096427917, 0.56..."
1,ADHD,01/01/2019,Lin_the_pillow_artis,abd7q9,5.0,1,really annoyed at my familys drunk friends so ...,really annoyed family drunk friend family host...,,-0.9354,"[0.4540468752384186, -0.19051940739154816, -0...."
2,ADHD,01/01/2019,Fleetfeathers,abda0t,12.0,1,the medication journey: a current disappointme...,medication journey current disappointment want...,,0.9233,"[0.42680078744888306, 0.6896241903305054, -0.6..."
3,ADHD,01/01/2019,UnleashedDebs,abdd13,4.0,1,"wearables, rem sleep detected while gaming not...",wearable rem sleep detected gaming sleeping hy...,,0.7738,"[-1.1856372356414795, -0.2498205155134201, 0.4..."
4,ADHD,01/01/2019,liluglee,abdj4w,1.0,1,picking a friend up to carpool to a nye party....,picking friend carpool nye party texted on way...,,0.8591,"[0.027474194765090942, -0.6478607654571533, 0...."
...,...,...,...,...,...,...,...,...,...,...,...
492121,sad,31/12/2020,RussianPower69,ko0jbp,3.0,1,me sad me has no one to talk to on new year,sad one talk new year,Depression/Sadness,-0.6486,"[0.1995467245578766, 0.10068118572235107, 0.19..."
492122,sad,31/12/2020,prettygirlolivia,ko0lec,8.0,1,worst new years eve ever my depression has hit...,worst new year eve ever depression hit bad las...,,-0.9439,"[1.0305132865905762, 0.7160326242446899, 0.749..."
492123,sad,31/12/2020,DirtyLizard0032,ko0rtl,2.0,1,check out my sad song,check sad song,Depression/Sadness,-0.4767,"[0.06328876316547394, -0.2522851526737213, 0.1..."
492124,sad,31/12/2020,Music-SunsetGirl490,ko0skv,3.0,1,zoom wedding tomorrow! so here is the story. l...,zoom wedding tomorrow story let call groom bob...,Loneliness,-0.6508,"[0.4180046319961548, -0.7512432932853699, 0.59..."


In [8]:
# Drop clean_text_seq
df.drop(["clean_text_seq"], axis=1, inplace=True)

# Display
df

Unnamed: 0,subreddit,date,author,id,num_comments,score,clean_text,link_flair_text,compound_sent,post2vec
0,ADHD,01/01/2019,DirtJunkie133,abd11x,13.0,1,recently diagnosed need talk others diagnosed ...,,0.9954,"[0.12734505534172058, 0.8156163096427917, 0.56..."
1,ADHD,01/01/2019,Lin_the_pillow_artis,abd7q9,5.0,1,really annoyed family drunk friend family host...,,-0.9354,"[0.4540468752384186, -0.19051940739154816, -0...."
2,ADHD,01/01/2019,Fleetfeathers,abda0t,12.0,1,medication journey current disappointment want...,,0.9233,"[0.42680078744888306, 0.6896241903305054, -0.6..."
3,ADHD,01/01/2019,UnleashedDebs,abdd13,4.0,1,wearable rem sleep detected gaming sleeping hy...,,0.7738,"[-1.1856372356414795, -0.2498205155134201, 0.4..."
4,ADHD,01/01/2019,liluglee,abdj4w,1.0,1,picking friend carpool nye party texted on way...,,0.8591,"[0.027474194765090942, -0.6478607654571533, 0...."
...,...,...,...,...,...,...,...,...,...,...
492121,sad,31/12/2020,RussianPower69,ko0jbp,3.0,1,sad one talk new year,Depression/Sadness,-0.6486,"[0.1995467245578766, 0.10068118572235107, 0.19..."
492122,sad,31/12/2020,prettygirlolivia,ko0lec,8.0,1,worst new year eve ever depression hit bad las...,,-0.9439,"[1.0305132865905762, 0.7160326242446899, 0.749..."
492123,sad,31/12/2020,DirtyLizard0032,ko0rtl,2.0,1,check sad song,Depression/Sadness,-0.4767,"[0.06328876316547394, -0.2522851526737213, 0.1..."
492124,sad,31/12/2020,Music-SunsetGirl490,ko0skv,3.0,1,zoom wedding tomorrow story let call groom bob...,Loneliness,-0.6508,"[0.4180046319961548, -0.7512432932853699, 0.59..."


In [10]:
# Visualize Vectors
df['tsneP2V'] = hero.tsne(df['post2vec'])

In [12]:
# Save df
df.to_csv("./preproc_data/clean_text_all_subreddits_d2v.csv", index=False)

In [19]:
# Create scatter plot of doc2vec
hero.scatterplot(df[:10000], col='tsneP2V', title="Doc2Vec", hover_data=['subreddit', 'compound_sent', 'link_flair_text', 'id'])

In [24]:
 # Create scatter plot of doc2vec for r/mentalhealth
hero.scatterplot(df[df["subreddit"]=="mentalhealth"][:10000], col='tsneP2V', title="Doc2Vec", hover_data=['compound_sent', 'link_flair_text', 'id'])

In [33]:
df[df['id'] == "83vne3"] # Depressed
df[df['id'] == "7nxt31"] # Depressed
df[df['id'] == "8ftmlq"] # Depressed

Unnamed: 0,subreddit,date,author,id,num_comments,score,clean_text,link_flair_text,compound_sent,post2vec,tsneP2V
261865,mentalhealth,29/04/2018,Zoopato,8ftmlq,2.0,2,extreme depressed morning noon normal afternoo...,,-0.8153,"[0.24274009466171265, 0.6134068369865417, 0.68...","[5.633365617541131e-06, -3.3634560168138705e-06]"
