Load new data from the raw CSV files and save the processed result:

In [2]:
import dataloader
import tomotopy as tp
from itertools import chain
import tqdm
import pandas as pd
import numpy as np

In [8]:
# Build the processed dataset from the raw crowd-train CSVs and write it to
# FOLDERPATH. All names assigned here are notebook globals that later cells read.
POSTPATH = './Data/crowd/train/shared_task_posts.csv'
LABELPATH = './Data/crowd/train/crowd_train.csv'
USERPATH = './Data/crowd/train/task_C_train.posts.csv'

# NOTE(review): `subset = 1000` presumably limits how many users are taken from
# USERPATH -- confirm against dataloader.load_user_subset_from_train.
users = dataloader.load_user_subset_from_train(USERPATH, subset = 1000)
    
# Load posts for the selected users only, attach classifications to them, then
# split the result into the main set plus `sw_posts` / `sw_timestamps`.
# NOTE(review): the meaning of the "sw" outputs is not visible here -- see
# dataloader.filter_posts.
user_to_post, post_to_words, post_to_metadata = dataloader.load_posts(POSTPATH, user_subset = users)
post_to_label = dataloader.load_classification(LABELPATH, user_to_post, post_to_words, post_to_metadata)
filtered_data, sw_posts, sw_timestamps = dataloader.filter_posts(post_to_label, post_to_metadata)

# Stopword filtering happens BEFORE saving, so the files under FOLDERPATH hold
# the already-filtered versions of both collections.
filtered_data = dataloader.filter_stopwords(filtered_data)
sw_posts = dataloader.filter_stopwords(sw_posts)

# NOTE(review): FOLDERPATH is assigned again, with the same value, in the
# "load existing data" cell; keep the two in sync or move it to one config cell.
FOLDERPATH = './Processing/crowd_processed/'
dataloader.save_to_folder(FOLDERPATH, user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps)

  1%|          | 14057/2038753 [00:00<00:28, 69868.77it/s]

Filtering subset...


100%|██████████| 2038753/2038753 [00:29<00:00, 69807.49it/s]
  1%|▏         | 726/57015 [00:00<00:07, 7250.33it/s]

Tokenizing sentences...


100%|██████████| 57015/57015 [00:08<00:00, 6391.88it/s] 
 20%|██        | 11413/57015 [00:00<00:00, 114119.50it/s]

Normalizing...


100%|██████████| 57015/57015 [00:00<00:00, 124927.12it/s]
  1%|          | 439/57015 [00:00<00:13, 4351.50it/s]

Tokenizing sentences into words...


100%|██████████| 57015/57015 [00:16<00:00, 3487.62it/s]


Load existing data (previously saved by the cell above):

In [3]:
# Alternative entry point: restore the processed dataset from disk instead of
# rebuilding it from the raw CSVs.
FOLDERPATH = './Processing/crowd_processed/'
user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps = dataloader.load_from_folder(FOLDERPATH)
# NOTE(review): the cell that writes FOLDERPATH calls filter_stopwords before
# save_to_folder, so these two calls filter data that was already filtered.
# Harmless only if filter_stopwords is idempotent -- confirm in dataloader.
filtered_data = dataloader.filter_stopwords(filtered_data)
sw_posts = dataloader.filter_stopwords(sw_posts)

In [11]:
# Supervised LDA with 40 topics and one response variable.
# NOTE(review): 'b' presumably selects a binary response type -- confirm
# against the tomotopy SLDAModel documentation.
mdl = tp.SLDAModel(k=40, vars=['b'])

# One document per entry of filtered_data: field 1 is a nested sequence of
# tokens that gets flattened, field 2 is the label, encoded as 1 for 'd' and 0
# for anything else.
for entry in tqdm.tqdm(filtered_data.values()):
    tokens = chain.from_iterable(entry[1])
    response = 1 if entry[2] == 'd' else 0
    mdl.add_doc(tokens, [response])

# 100 rounds of 10 training iterations (1000 in total), logging every 10th
# round. The printed number is the iteration count at the START of the round,
# so the line labelled 0 is emitted after the first 10 iterations have run.
for round_idx in range(100):
    mdl.train(10)
    iteration = round_idx * 10
    if iteration % 100 == 0:
        print('Iteration: {}\tLog-likelihood: {}'.format(iteration, mdl.ll_per_word))
print("Training Finished")

100%|██████████| 55255/55255 [00:00<00:00, 93909.05it/s]


Iteration: 0	Log-likelihood: -10.347582116250646
Iteration: 100	Log-likelihood: -9.317130772595707
Iteration: 200	Log-likelihood: -9.211626170340445
Iteration: 300	Log-likelihood: -9.170996120587702
Iteration: 400	Log-likelihood: -9.14813595326642
Iteration: 500	Log-likelihood: -9.131948165019477
Iteration: 600	Log-likelihood: -9.118063947763508
Iteration: 700	Log-likelihood: -9.11108379192932
Iteration: 800	Log-likelihood: -9.111597682879996
Iteration: 900	Log-likelihood: -9.105244334403562
Training Finished


In [12]:
# Rank the sLDA topics by the regression coefficient of response variable 0
# and show each topic's 40 top words next to its coefficient.
slda_coefficients = mdl.get_regression_coef(0)

# One [top-words string, coefficient] row per topic, in topic-id order.
topic_rows = []
for k in range(mdl.k):
    # get_topic_words yields (word, weight) pairs; only the words are displayed.
    top_words = mdl.get_topic_words(k, top_n=40)
    words = ", ".join(word for word, _ in top_words)
    topic_rows.append([words, slda_coefficients[k]])

# Ascending order: most negative coefficient first, most positive last.
indices = np.argsort(slda_coefficients)

# Reorder with plain list indexing instead of np.array(rows)[indices]: packing
# strings and floats into one ndarray coerces every coefficient to a string, so
# the table column would stop being numeric. `data` stays indexable as
# data[i][0] / data[i][1] for the cells that print individual topics.
data = [topic_rows[i] for i in indices]

pd.DataFrame(data, columns=["Topic", "Suicidality Coefficient"])

Unnamed: 0,Topic,Suicidality Coefficient
0,"bike, car, new, road, ride, around, city, bike...",-4.572435855865479
1,"url, us, min, survey, person, gt95, study, sho...",-4.415285587310791
2,"person, big, babes, porn, f, girl, sexy, xpost...",-4.101504325866699
3,"’, ”, “, person, ‘, –, gun, 10, top, pistol, r...",-4.050198078155518
4,"͡°, ͜ʖ, battery, person, high, screen, vs, low...",-3.814165353775024
5,"english, live, webcam, language, sexy, word, g...",-3.1262660026550293
6,"phone, app, person, use, iphone, mobile, apps,...",-2.8841357231140137
7,"2, person, x, 2url, editionurl, 3, game, dont,...",-2.4462270736694336
8,"person, us, state, new, police, states, countr...",-2.4319872856140137
9,"person, humble, w, bundle, keys, h, mail, 2, o...",-2.386512279510498


In [13]:
# `data` holds one row per topic (40 topics) sorted ascending by coefficient,
# so row 37 is the topic with the third most positive coefficient.
# Element 0 of a row is the comma-joined top-words string.
print(data[37][0])

im, like, dont, know, get, people, really, think, want, something, cant, ive, even, feel, make, going, go, say, one, see, time, good, someone, doesnt, would, right, much, thing, things, person, thats, anything, youre, bad, lot, sure, always, well, way, else


In [14]:
# `data` holds one row per topic (40 topics) sorted ascending by coefficient,
# so row 38 is the topic with the second most positive coefficient.
# Element 0 of a row is the comma-joined top-words string.
print(data[38][0])

im, pain, ive, doctor, hospital, mental, health, days, weeks, feel, take, anxiety, person, also, medication, back, taking, issues, depression, last, disorder, since, appointment, even, help, ago, day, started, drug, bipolar, bad, went, normal, years, symptoms, doctors, meds, medical, blood, week


In [15]:
# `data` holds one row per topic (40 topics) sorted ascending by coefficient,
# so row 39 (the last one) is the topic with the most positive coefficient.
# Element 0 of a row is the comma-joined top-words string.
print(data[39][0])

im, life, feel, want, dont, know, like, ive, love, friends, never, things, really, people, even, cant, time, always, years, talk, much, going, relationship, one, friend, someone, get, help, ever, best, would, way, feeling, still, could, need, go, day, everything, family
