In [2]:
import dataloader
import tomotopy as tp
from itertools import chain
import tqdm
import pandas as pd
import numpy as np

Load new data:

In [None]:
POSTPATH = './crowd/train/shared_task_posts.csv'
LABELPATH = './crowd/train/crowd_train.csv'
USERPATH = './crowd/train/task_C_train.posts.csv'

users = dataloader.load_user_subset_from_train(USERPATH, subset = 1000)
    
user_to_post, post_to_words, post_to_metadata = dataloader.load_posts(POSTPATH, user_subset = users)
post_to_label = dataloader.load_classification(LABELPATH, user_to_post, post_to_words, post_to_metadata)
filtered_data, sw_posts, sw_timestamps = dataloader.filter_posts(post_to_label, post_to_metadata)

filtered_data = dataloader.filter_stopwords(filtered_data)
sw_posts = dataloader.filter_stopwords(sw_posts)

FOLDERPATH = './crowd_processed/'
dataloader.save_to_folder(FOLDERPATH, user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps)

Load existing Data:

In [3]:
FOLDERPATH = './crowd_processed/'
user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps = dataloader.load_from_folder(FOLDERPATH)
filtered_data = dataloader.filter_stopwords(filtered_data)
sw_posts = dataloader.filter_stopwords(sw_posts)

In [4]:
mdl = tp.SLDAModel(k=20, vars=['b'])
for data in tqdm.tqdm(filtered_data.keys()):
    mdl.add_doc(chain.from_iterable(filtered_data[data][1]), [1 if filtered_data[data][2] == 'd' else 0])

for i in range(0, 1000, 10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

100%|████████████████████████████████████████████████████████████████████████| 55255/55255 [00:00<00:00, 128541.82it/s]


Iteration: 0	Log-likelihood: -10.013450645681461
Iteration: 10	Log-likelihood: -9.668326202415573
Iteration: 20	Log-likelihood: -9.500963992163015
Iteration: 30	Log-likelihood: -9.39775293818232
Iteration: 40	Log-likelihood: -9.333379176211249
Iteration: 50	Log-likelihood: -9.283629775282092
Iteration: 60	Log-likelihood: -9.24386138311081
Iteration: 70	Log-likelihood: -9.21294042603233
Iteration: 80	Log-likelihood: -9.188529902639004
Iteration: 90	Log-likelihood: -9.167998155070753
Iteration: 100	Log-likelihood: -9.151604924544978
Iteration: 110	Log-likelihood: -9.13712368922751
Iteration: 120	Log-likelihood: -9.124230035615296
Iteration: 130	Log-likelihood: -9.112448164868102
Iteration: 140	Log-likelihood: -9.103142963716932
Iteration: 150	Log-likelihood: -9.094216126920564
Iteration: 160	Log-likelihood: -9.087215560106115
Iteration: 170	Log-likelihood: -9.08113417810619
Iteration: 180	Log-likelihood: -9.073829805185254
Iteration: 190	Log-likelihood: -9.068347013549074
Iteration: 200	

[('person', 0.06015767157077789), ('url', 0.045288167893886566), ('humble', 0.04114643484354019), ('bundle', 0.02383262664079666), ('2', 0.01880822703242302), ('1', 0.015752848237752914), ('weekly', 0.010456860065460205), ('3', 0.009438401088118553), ('min', 0.008284147828817368), ('x', 0.008012558333575726)] [0.5]
Top 10 words of topic #10
[('bike', 0.01851949468255043), ('͡°', 0.01679852604866028), ('person', 0.016761114820837975), ('car', 0.013506238348782063), ('͜ʖ', 0.00849298108369112), ('new', 0.006584950257092714), ('road', 0.006285651121288538), ('ride', 0.005948939826339483), ('area', 0.005687053315341473), ('bikes', 0.00520069245249033)] [0.5]
Top 10 words of topic #11
[('person', 0.020928271114826202), ('music', 0.018520032986998558), ('song', 0.01275382749736309), ('url', 0.010583020746707916), ('price', 0.007801674772053957), ('would', 0.007733836770057678), ('looking', 0.006818028166890144), ('buy', 0.006648433860391378), ('love', 0.006444920785725117), ('like', 0.005868

In [12]:
slda_coefficients = mdl.get_regression_coef(0)
data = []
for k in range(mdl.k):
    top_words = mdl.get_topic_words(k, top_n=10)
    words = [word for (word, float) in top_words]
    words = ", ".join(words)
    data.append([words, slda_coefficients[k]])
    
indices = np.array(slda_coefficients).argsort()
data = np.array(data)
data = data[indices]

pd.DataFrame(data, columns=["Topic", "Suicidality Coefficient"])

Unnamed: 0,Topic,Suicidality Coefficient
0,"bike, ͡°, person, car, ͜ʖ, new, road, ride, ar...",-5.681143760681152
1,"person, url, humble, bundle, 2, 1, weekly, 3, ...",-3.641441583633423
2,"’, person, “, ”, time, team, use, persons, one...",-3.115659236907959
3,"x, person, 2, 2url, editionurl, 3, game, old, ...",-2.490773916244507
4,"person, music, song, url, price, would, lookin...",-2.3349571228027344
5,"person, new, retail, used, sold, pokemon, colo...",-2.139888048171997
6,"person, url, post, please, one, reddit, subred...",-1.7418127059936523
7,"game, person, play, games, playing, played, li...",-1.644895315170288
8,"person, power, video, card, cpu, atx, amazon, ...",-1.6400138139724731
9,"person, gt, deck, cards, ich, u, und, decks, 0...",-1.468018651008606
