# Install

In [1]:
# Libraries you might not have (uncomment the lines you need and run this cell once).
# !python3 -m pip install --upgrade nbconvert
# !python3 -m pip install --upgrade nbstripout
# !python3 -m pip install tomotopy
# NOTE: the PyPI distribution is named "scikit-learn"; the "sklearn" alias is deprecated.
# !python3 -m pip install scikit-learn

# Imports

In [2]:
import nltk
# Things to install from nltk
# Each call is cheap when the package is already present (the log then reports
# "already up-to-date"). Presumably these resources back dataloader's
# tokenisation and stop-word filtering -- confirm in dataloader.py.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/dylfox21/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dylfox21/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
%load_ext autoreload
%autoreload 2

import tomotopy as tp
from itertools import chain
import tqdm
import pandas as pd
import numpy as np
from collections import defaultdict

import dataloader
import bow
import slda
import post_classifier
import aggregate
import user_classifier

# Process Train Data

In [4]:
# Uncomment below to process data if you have not already

# POSTPATH = './Data/crowd/train/shared_task_posts.csv'
# LABELPATH = './Data/crowd/train/crowd_train.csv'
# USERPATH = './Data/crowd/train/task_C_train.posts.csv'

# users = dataloader.load_user_subset_from_train(USERPATH, subset = 1000)
    
# user_to_post, post_to_words, post_to_metadata = dataloader.load_posts(POSTPATH, user_subset = users, append_title = True)
# post_to_label = dataloader.load_classification(LABELPATH, user_to_post, post_to_words, post_to_metadata, user_subset = users)
# filtered_data, sw_posts, sw_timestamps = dataloader.filter_posts(post_to_label, post_to_metadata)
# print(len(filtered_data))
# filtered_data = dataloader.filter_near_SW(filtered_data,post_to_metadata, sw_timestamps)
# print(len(filtered_data))

# filtered_data = dataloader.filter_stopwords(filtered_data)
# sw_posts = dataloader.filter_stopwords(sw_posts)

In [5]:
# FOLDERPATH = './Processing/crowd_processed/'
# dataloader.save_to_folder(FOLDERPATH, user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps)

# Load Processed Train Data

In [6]:
# Reload the pre-processed training artefacts from disk. This expects that the
# (commented-out) processing and save cells above were run at least once so the
# folder is populated.
FOLDERPATH = './Processing/crowd_processed/'
user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps = dataloader.load_from_folder(FOLDERPATH)

In [7]:
# Number of training posts per label 'a'..'d' (label is element 2 of each
# entry), followed by the total number of filtered posts.
for label in ('a', 'b', 'c', 'd'):
    print(sum(1 for entry in filtered_data.values() if entry[2] == label))
print(len(filtered_data))

878
203
632
2025
29200


# SLDA Model: Train

## Train Topic Model

In [8]:
# Fit the sLDA model on the filtered training posts using 40 topics.
# `model` is reused further down for the topic table, the training features and
# the test-set features.
model = slda.train_slda_model_from_data(filtered_data, topics=40)

100%|██████████| 29200/29200 [00:00<00:00, 104417.22it/s]


Beginning sLDA training...
Iteration: 0	Log-likelihood: -10.114199139920588
Iteration: 100	Log-likelihood: -9.148173977060692
Iteration: 200	Log-likelihood: -9.036787163578026
Iteration: 300	Log-likelihood: -9.002368434538326
Iteration: 400	Log-likelihood: -8.979918123882936
Iteration: 500	Log-likelihood: -8.967662217962195
Iteration: 600	Log-likelihood: -8.957691087762342
Iteration: 700	Log-likelihood: -8.94868466773021
Iteration: 800	Log-likelihood: -8.938877119845548
Iteration: 900	Log-likelihood: -8.930469770871628
Finished Training


## Topic Model Outputs

In [9]:
# Table of every topic's top words next to its regression coefficient, ordered
# from the most negative to the most positive coefficient.
slda_coefficients = model.get_regression_coef(0)  # coefficients for response variable 0, indexed by topic
data = []
for k in range(model.k):
    top_words = model.get_topic_words(k, top_n=40)
    # Each item is a (word, score) pair; only the words are shown. The score is
    # bound to `_score` so the builtin `float` is not shadowed.
    words = ", ".join(word for (word, _score) in top_words)
    data.append([words, slda_coefficients[k]])

# Reorder the rows with plain list indexing. Converting the mixed str/float
# rows to an ndarray would coerce every coefficient to a string and leave a
# non-numeric column in the DataFrame.
indices = np.array(slda_coefficients).argsort()
data = [data[i] for i in indices]

pd.DataFrame(data, columns=["Topic", "Suicidality Coefficient"])

Unnamed: 0,Topic,Suicidality Coefficient
0,"person, big, babes, girl, sexy, f, teen, hot, ...",-5.741025924682617
1,"order, mail, person, seeds, biodynamic, organi...",-5.261110782623291
2,"url, us, min, survey, gt95, person, short, stu...",-5.080322265625
3,"person, music, song, video, rock, songs, love,...",-4.970279693603516
4,"’, person, “, ”, –, cannabis, marijuana, says,...",-4.694401741027832
5,"person, persons, vs, 2, show, 1, season, 3, te...",-4.436217308044434
6,"person, car, oc, bike, new, dog, x, city, foun...",-4.012282848358154
7,"gt, person, xb1, psn, ps4, vog, invite, x1, lv...",-3.821331739425659
8,"person, trail, gun, completed, treasure, elite...",-3.801435708999634
9,"person, keys, w, h, deck, cards, real, porn, u...",-3.78386664390564


In [10]:
# Uncomment to print example of overly negative topic
# print(data[39][0])

## sLDA Features: Train

In [None]:
# Per-post sLDA features for the training set. Later cells read element 0 of
# each entry as the feature vector and element 1 as the label.
# Slow: the saved progress bar shows roughly 15 minutes for 29,200 posts.
vector_train = slda.get_topic_vecs(model, filtered_data)

  0%|          | 3/29200 [00:00<20:57, 23.22it/s]

Getting topic distributions...


 83%|████████▎ | 24340/29200 [14:38<02:58, 27.27it/s]

In [None]:
# Print one example feature vector. 'hw4uh' is a hard-coded post id that may be
# missing (e.g. when the data was processed with a different user subset), so
# fall back to the first available post instead of raising KeyError.
example_post_id = 'hw4uh'
if example_post_id not in vector_train:
    example_post_id = next(iter(vector_train))
print(vector_train[example_post_id][0])

# BOW

In [None]:
# Alternative feature set: bag-of-words vectors obtained via
# bow.get_PCA_vectors_from_post_set.
# NOTE(review): this rebinds `vector_train`, replacing the sLDA features
# computed above. On "Run All" the classifiers below are therefore trained on
# these BOW features, while the test section builds X_test with
# slda.vectorize_data_set. Run only ONE of the two feature cells, or give this
# result its own name and update the downstream cells accordingly.
word2index,index2word = bow.generate_vocabulary(filtered_data)
vector_train = bow.get_PCA_vectors_from_post_set(filtered_data, word2index)

# Post Classifier: Train

## Post Classifier: Train

In [18]:
# Stack the per-post feature vectors (element 0) and labels (element 1) in
# vector_train's iteration order, then flatten the labels to a 1-D array.
train_entries = list(vector_train.values())
X_train = np.array([entry[0] for entry in train_entries])
y_train = np.array([entry[1] for entry in train_entries])
y_train = y_train.reshape(y_train.shape[0])

In [19]:
# Sanity check: dimensions of the feature matrix and of the label vector.
print(X_train.shape)
print(y_train.shape)

(29200, 40)
(29200,)


### Logistic Regression

In [86]:
#UNCOMMENT TO RUN GRID SEARCH CV
#p_clf = post_classifier.PostClassification("LogReg")
#param_dict = {'C':[0.2,0.5,0.7,1,1.5,2,5]}
#p_clf.train_grid_search_CV(X_train, y_train, param_dict, groups=5)

#RUN WITH OPTIMAL PARAMETERS
# NOTE(review): no hyper-parameters are passed here, so the "optimal" values
# presumably live inside post_classifier.PostClassification -- confirm.
# Every classifier cell rebinds `p_clf`; the prediction cells further down use
# whichever classifier cell was executed last.
p_clf = post_classifier.PostClassification("LogReg")
p_clf.train(X_train, y_train)


### Linear SVM

In [None]:
#UNCOMMENT TO RUN GRID SEARCH CV
#p_clf = post_classifier.PostClassification("LinearSVM")
#param_dict = {'C':[0.2,0.5,1,2]}
#p_clf.train_grid_search_CV(X_train, y_train, param_dict, groups=5)

# Train the linear SVM post classifier on the training features.
# Rebinds `p_clf`: the prediction cells use whichever classifier cell ran last.
p_clf = post_classifier.PostClassification("LinearSVM")
p_clf.train(X_train, y_train)

### RBF SVM

In [89]:
#UNCOMMENT TO RUN GRID SEARCH CV
#p_clf = post_classifier.PostClassification("RbfSVM")
#param_dict = {'C':[0.5,1,2,5]}
#p_clf.train_grid_search_CV(X_train, y_train, param_dict, groups=5)

# Train the RBF-kernel SVM post classifier on the training features.
# Rebinds `p_clf`: the prediction cells use whichever classifier cell ran last.
p_clf = post_classifier.PostClassification("RbfSVM")
p_clf.train(X_train, y_train)

### AdaBoost

In [None]:
# Train the AdaBoost post classifier on the training features.
# Rebinds `p_clf`: the prediction cells use whichever classifier cell ran last.
p_clf = post_classifier.PostClassification("AdaBoost")
p_clf.train(X_train, y_train)

### Random Forest

In [None]:
# Train the random-forest post classifier on the training features.
# Rebinds `p_clf`: the prediction cells use whichever classifier cell ran last.
p_clf = post_classifier.PostClassification("RandomForest")
p_clf.train(X_train, y_train)

### Multi-layer Perceptron

In [None]:
#UNCOMMENT TO RUN GRID SEARCH CV
#p_clf = post_classifier.PostClassification("MLP")
#param_dict = {'hidden_layer_sizes':[(64,64),(64,64,64),(32,32), (32,32,32)], 'learning_rate': ('constant', 'adaptive')}
#p_clf.train_grid_search_CV(X_train, y_train, param_dict, groups=5)

# Train the multi-layer perceptron post classifier on the training features.
# Rebinds `p_clf`: the prediction cells use whichever classifier cell ran last.
# NOTE(review): the output saved under this cell ("Fitting 5 folds for each of
# 8 candidates ...") comes from a grid-search run, not from the plain train()
# call below; re-run the cell or clear the stale log before sharing.
p_clf = post_classifier.PostClassification("MLP")
p_clf.train(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] hidden_layer_sizes=(64, 64), learning_rate=constant .............
Iteration 1, loss = 0.36824695


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Iteration 2, loss = 0.22893707
Iteration 3, loss = 0.22040853
Iteration 4, loss = 0.21889144
Iteration 5, loss = 0.21825781
Iteration 6, loss = 0.21825892
Iteration 7, loss = 0.21758994
Iteration 8, loss = 0.21739908
Iteration 9, loss = 0.21706191
Iteration 10, loss = 0.21659998
Iteration 11, loss = 0.21629070
Iteration 12, loss = 0.21598504
Iteration 13, loss = 0.21545088
Iteration 14, loss = 0.21510306
Iteration 15, loss = 0.21480809
Iteration 16, loss = 0.21439595
Iteration 17, loss = 0.21383275
Iteration 18, loss = 0.21339007
Iteration 19, loss = 0.21319979
Iteration 20, loss = 0.21244893
Iteration 21, loss = 0.21198069
Iteration 22, loss = 0.21164398
Iteration 23, loss = 0.21109092
Iteration 24, loss = 0.21090719
Iteration 25, loss = 0.21012951
Iteration 26, loss = 0.20956350
Iteration 27, loss = 0.20917865
Iteration 28, loss = 0.20868152
Iteration 29, loss = 0.20818804
Iteration 30, loss = 0.20780216
Iteration 31, loss = 0.20696069
Iteration 32, loss = 0.20665677
Iteration 33, lo

Iteration 254, loss = 0.15880114
Iteration 255, loss = 0.15880401
Iteration 256, loss = 0.15840758
Iteration 257, loss = 0.15829830
Iteration 258, loss = 0.15799726
Iteration 259, loss = 0.15826974
Iteration 260, loss = 0.15793667
Iteration 261, loss = 0.15895956
Iteration 262, loss = 0.15737259
Iteration 263, loss = 0.15746922
Iteration 264, loss = 0.15829458
Iteration 265, loss = 0.15798226
Iteration 266, loss = 0.15730954
Iteration 267, loss = 0.15807062
Iteration 268, loss = 0.15809292
Iteration 269, loss = 0.15716662
Iteration 270, loss = 0.15695934
Iteration 271, loss = 0.15797640
Iteration 272, loss = 0.15690208
Iteration 273, loss = 0.15669954
Iteration 274, loss = 0.15737728
Iteration 275, loss = 0.15582152
Iteration 276, loss = 0.15625128
Iteration 277, loss = 0.15753511
Iteration 278, loss = 0.15706201
Iteration 279, loss = 0.15560097
Iteration 280, loss = 0.15676705
Iteration 281, loss = 0.15583503
Iteration 282, loss = 0.15562944
Iteration 283, loss = 0.15515345
Iteration 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s


Iteration 2, loss = 0.23214084
Iteration 3, loss = 0.22487951
Iteration 4, loss = 0.22341297
Iteration 5, loss = 0.22272585
Iteration 6, loss = 0.22222476
Iteration 7, loss = 0.22189344
Iteration 8, loss = 0.22179030
Iteration 9, loss = 0.22143955
Iteration 10, loss = 0.22107556
Iteration 11, loss = 0.22075692
Iteration 12, loss = 0.22070182
Iteration 13, loss = 0.22033157
Iteration 14, loss = 0.22015171
Iteration 15, loss = 0.21960267
Iteration 16, loss = 0.21963846
Iteration 17, loss = 0.21907751
Iteration 18, loss = 0.21922367
Iteration 19, loss = 0.21837637
Iteration 20, loss = 0.21857110
Iteration 21, loss = 0.21808480
Iteration 22, loss = 0.21769133
Iteration 23, loss = 0.21742890
Iteration 24, loss = 0.21732974
Iteration 25, loss = 0.21697280
Iteration 26, loss = 0.21674200
Iteration 27, loss = 0.21637147
Iteration 28, loss = 0.21607683
Iteration 29, loss = 0.21584724
Iteration 30, loss = 0.21564419
Iteration 31, loss = 0.21545787
Iteration 32, loss = 0.21517074
Iteration 33, lo

Iteration 254, loss = 0.17195532
Iteration 255, loss = 0.17157242
Iteration 256, loss = 0.17261356
Iteration 257, loss = 0.17222155
Iteration 258, loss = 0.17139435
Iteration 259, loss = 0.17155498
Iteration 260, loss = 0.17137432
Iteration 261, loss = 0.17156377
Iteration 262, loss = 0.17061399
Iteration 263, loss = 0.17103026
Iteration 264, loss = 0.17164652
Iteration 265, loss = 0.17133550
Iteration 266, loss = 0.17048023
Iteration 267, loss = 0.17098475
Iteration 268, loss = 0.17146357
Iteration 269, loss = 0.17113650
Iteration 270, loss = 0.17131073
Iteration 271, loss = 0.17061161
Iteration 272, loss = 0.17032521
Iteration 273, loss = 0.17045318
Iteration 274, loss = 0.16970465
Iteration 275, loss = 0.17041549
Iteration 276, loss = 0.17004595
Iteration 277, loss = 0.17082235
Iteration 278, loss = 0.17107649
Iteration 279, loss = 0.16965240
Iteration 280, loss = 0.16883970
Iteration 281, loss = 0.16964305
Iteration 282, loss = 0.16884078
Iteration 283, loss = 0.16897481
Iteration 

Iteration 128, loss = 0.18277705
Iteration 129, loss = 0.18256202
Iteration 130, loss = 0.18215282
Iteration 131, loss = 0.18222005
Iteration 132, loss = 0.18182069
Iteration 133, loss = 0.18202391
Iteration 134, loss = 0.18126549
Iteration 135, loss = 0.18165835
Iteration 136, loss = 0.18053493
Iteration 137, loss = 0.18025502
Iteration 138, loss = 0.18049630
Iteration 139, loss = 0.18118672
Iteration 140, loss = 0.18038641
Iteration 141, loss = 0.17979994
Iteration 142, loss = 0.17990973
Iteration 143, loss = 0.17922561
Iteration 144, loss = 0.18005074
Iteration 145, loss = 0.18078359
Iteration 146, loss = 0.17979672
Iteration 147, loss = 0.17918797
Iteration 148, loss = 0.17963241
Iteration 149, loss = 0.17896963
Iteration 150, loss = 0.17878947
Iteration 151, loss = 0.17868996
Iteration 152, loss = 0.17805542
Iteration 153, loss = 0.17851391
Iteration 154, loss = 0.17785434
Iteration 155, loss = 0.17742639
Iteration 156, loss = 0.17788526
Iteration 157, loss = 0.17747903
Iteration 

Iteration 56, loss = 0.19979824
Iteration 57, loss = 0.19978971
Iteration 58, loss = 0.19990843
Iteration 59, loss = 0.19916065
Iteration 60, loss = 0.19945145
Iteration 61, loss = 0.19911249
Iteration 62, loss = 0.19914383
Iteration 63, loss = 0.19908961
Iteration 64, loss = 0.19814997
Iteration 65, loss = 0.19796747
Iteration 66, loss = 0.19748790
Iteration 67, loss = 0.19759557
Iteration 68, loss = 0.19730181
Iteration 69, loss = 0.19679427
Iteration 70, loss = 0.19696430
Iteration 71, loss = 0.19662646
Iteration 72, loss = 0.19631152
Iteration 73, loss = 0.19603600
Iteration 74, loss = 0.19594780
Iteration 75, loss = 0.19526410
Iteration 76, loss = 0.19608400
Iteration 77, loss = 0.19467938
Iteration 78, loss = 0.19513699
Iteration 79, loss = 0.19504577
Iteration 80, loss = 0.19450747
Iteration 81, loss = 0.19381127
Iteration 82, loss = 0.19417466
Iteration 83, loss = 0.19391629
Iteration 84, loss = 0.19331548
Iteration 85, loss = 0.19336833
Iteration 86, loss = 0.19329495
Iteratio

Iteration 306, loss = 0.16088865
Iteration 307, loss = 0.16055491
Iteration 308, loss = 0.16101788
Iteration 309, loss = 0.16103060
Iteration 310, loss = 0.16050528
Iteration 311, loss = 0.16065627
Iteration 312, loss = 0.16014020
Iteration 313, loss = 0.16022343
Iteration 314, loss = 0.16026129
Iteration 315, loss = 0.16027467
Iteration 316, loss = 0.15947945
Iteration 317, loss = 0.16046310
Iteration 318, loss = 0.15936712
Iteration 319, loss = 0.16012458
Iteration 320, loss = 0.15971187
Iteration 321, loss = 0.16109971
Iteration 322, loss = 0.15918799
Iteration 323, loss = 0.15930296
Iteration 324, loss = 0.15930655
Iteration 325, loss = 0.15894544
Iteration 326, loss = 0.15859451
Iteration 327, loss = 0.15834211
Iteration 328, loss = 0.15907845
Iteration 329, loss = 0.15876576
Iteration 330, loss = 0.16124925
Iteration 331, loss = 0.15941370
Iteration 332, loss = 0.15947375
Iteration 333, loss = 0.15885699
Iteration 334, loss = 0.15846611
Iteration 335, loss = 0.15759798
Iteration 

Iteration 191, loss = 0.17938070
Iteration 192, loss = 0.17956558
Iteration 193, loss = 0.17986204
Iteration 194, loss = 0.17945859
Iteration 195, loss = 0.17918660
Iteration 196, loss = 0.17922607
Iteration 197, loss = 0.17829530
Iteration 198, loss = 0.17847867
Iteration 199, loss = 0.17838214
Iteration 200, loss = 0.17926368
Iteration 201, loss = 0.17932125
Iteration 202, loss = 0.17869066
Iteration 203, loss = 0.17754535
Iteration 204, loss = 0.17768264
Iteration 205, loss = 0.17743168
Iteration 206, loss = 0.17880693
Iteration 207, loss = 0.17716185
Iteration 208, loss = 0.17784834
Iteration 209, loss = 0.17716774
Iteration 210, loss = 0.17720730
Iteration 211, loss = 0.17766218
Iteration 212, loss = 0.17629344
Iteration 213, loss = 0.17751558
Iteration 214, loss = 0.17647178
Iteration 215, loss = 0.17677287
Iteration 216, loss = 0.17673416
Iteration 217, loss = 0.17615256
Iteration 218, loss = 0.17637654
Iteration 219, loss = 0.17705823
Iteration 220, loss = 0.17674993
Iteration 

Iteration 50, loss = 0.20674838
Iteration 51, loss = 0.20644924
Iteration 52, loss = 0.20595442
Iteration 53, loss = 0.20536080
Iteration 54, loss = 0.20534440
Iteration 55, loss = 0.20492950
Iteration 56, loss = 0.20511321
Iteration 57, loss = 0.20439727
Iteration 58, loss = 0.20431999
Iteration 59, loss = 0.20383293
Iteration 60, loss = 0.20394491
Iteration 61, loss = 0.20367052
Iteration 62, loss = 0.20411166
Iteration 63, loss = 0.20278547
Iteration 64, loss = 0.20302747
Iteration 65, loss = 0.20235471
Iteration 66, loss = 0.20286833
Iteration 67, loss = 0.20222853
Iteration 68, loss = 0.20197429
Iteration 69, loss = 0.20191245
Iteration 70, loss = 0.20209143
Iteration 71, loss = 0.20118084
Iteration 72, loss = 0.20095704
Iteration 73, loss = 0.20069088
Iteration 74, loss = 0.20067060
Iteration 75, loss = 0.20009794
Iteration 76, loss = 0.20027676
Iteration 77, loss = 0.19989546
Iteration 78, loss = 0.19983483
Iteration 79, loss = 0.19967451
Iteration 80, loss = 0.19953411
Iteratio

Iteration 300, loss = 0.16907541
Iteration 301, loss = 0.16943238
Iteration 302, loss = 0.16885875
Iteration 303, loss = 0.16848660
Iteration 304, loss = 0.16872328
Iteration 305, loss = 0.16931549
Iteration 306, loss = 0.16937111
Iteration 307, loss = 0.17088661
Iteration 308, loss = 0.16869524
Iteration 309, loss = 0.16821631
Iteration 310, loss = 0.16821276
Iteration 311, loss = 0.16854466
Iteration 312, loss = 0.16871546
Iteration 313, loss = 0.16869744
Iteration 314, loss = 0.16781013
Iteration 315, loss = 0.16823224
Iteration 316, loss = 0.16950090
Iteration 317, loss = 0.16756053
Iteration 318, loss = 0.16788896
Iteration 319, loss = 0.16821214
Iteration 320, loss = 0.16699105
Iteration 321, loss = 0.16720248
Iteration 322, loss = 0.16711072
Iteration 323, loss = 0.16710231
Iteration 324, loss = 0.16695945
Iteration 325, loss = 0.16733396
Iteration 326, loss = 0.16766547
Iteration 327, loss = 0.16649355
Iteration 328, loss = 0.16672157
Iteration 329, loss = 0.16649716
Iteration 

Iteration 174, loss = 0.18414820
Iteration 175, loss = 0.18421890
Iteration 176, loss = 0.18435299
Iteration 177, loss = 0.18330962
Iteration 178, loss = 0.18420601
Iteration 179, loss = 0.18366662
Iteration 180, loss = 0.18288103
Iteration 181, loss = 0.18422365
Iteration 182, loss = 0.18321631
Iteration 183, loss = 0.18290396
Iteration 184, loss = 0.18200825
Iteration 185, loss = 0.18184699
Iteration 186, loss = 0.18248430
Iteration 187, loss = 0.18207673
Iteration 188, loss = 0.18188539
Iteration 189, loss = 0.18182277
Iteration 190, loss = 0.18249358
Iteration 191, loss = 0.18143361
Iteration 192, loss = 0.18108571
Iteration 193, loss = 0.18069941
Iteration 194, loss = 0.18096941
Iteration 195, loss = 0.18076701
Iteration 196, loss = 0.18165316
Iteration 197, loss = 0.18049764
Iteration 198, loss = 0.18021107
Iteration 199, loss = 0.18043310
Iteration 200, loss = 0.18002942
Iteration 201, loss = 0.17999377
Iteration 202, loss = 0.18015726
Iteration 203, loss = 0.17959186
Iteration 



[CV]  hidden_layer_sizes=(64, 64), learning_rate=adaptive, total= 1.7min
[CV] hidden_layer_sizes=(64, 64), learning_rate=adaptive .............
Iteration 1, loss = 0.31223825
Iteration 2, loss = 0.22617403
Iteration 3, loss = 0.21890934
Iteration 4, loss = 0.21764773
Iteration 5, loss = 0.21662273
Iteration 6, loss = 0.21629037
Iteration 7, loss = 0.21617581
Iteration 8, loss = 0.21580223
Iteration 9, loss = 0.21552441
Iteration 10, loss = 0.21521679
Iteration 11, loss = 0.21483230
Iteration 12, loss = 0.21461490
Iteration 13, loss = 0.21448959
Iteration 14, loss = 0.21391643
Iteration 15, loss = 0.21385336
Iteration 16, loss = 0.21348906
Iteration 17, loss = 0.21299888
Iteration 18, loss = 0.21292718
Iteration 19, loss = 0.21237526
Iteration 20, loss = 0.21178890
Iteration 21, loss = 0.21180432
Iteration 22, loss = 0.21113120
Iteration 23, loss = 0.21119871
Iteration 24, loss = 0.21050133
Iteration 25, loss = 0.21073455
Iteration 26, loss = 0.20988229
Iteration 27, loss = 0.20914248
I



[CV]  hidden_layer_sizes=(64, 64), learning_rate=adaptive, total= 1.1min
[CV] hidden_layer_sizes=(64, 64), learning_rate=adaptive .............
Iteration 1, loss = 0.37623324
Iteration 2, loss = 0.23087038
Iteration 3, loss = 0.21858153
Iteration 4, loss = 0.21612527
Iteration 5, loss = 0.21538929
Iteration 6, loss = 0.21511283
Iteration 7, loss = 0.21481833
Iteration 8, loss = 0.21460459
Iteration 9, loss = 0.21436784
Iteration 10, loss = 0.21427118
Iteration 11, loss = 0.21393753
Iteration 12, loss = 0.21370543
Iteration 13, loss = 0.21342655
Iteration 14, loss = 0.21321574
Iteration 15, loss = 0.21296165
Iteration 16, loss = 0.21279821
Iteration 17, loss = 0.21232935
Iteration 18, loss = 0.21198430
Iteration 19, loss = 0.21167135
Iteration 20, loss = 0.21132749
Iteration 21, loss = 0.21096463
Iteration 22, loss = 0.21048862
Iteration 23, loss = 0.21027032
Iteration 24, loss = 0.20954774
Iteration 25, loss = 0.20967606
Iteration 26, loss = 0.20901770
Iteration 27, loss = 0.20907690
I

Iteration 249, loss = 0.17378411
Iteration 250, loss = 0.17305625
Iteration 251, loss = 0.17411623
Iteration 252, loss = 0.17338157
Iteration 253, loss = 0.17238619
Iteration 254, loss = 0.17210380
Iteration 255, loss = 0.17240820
Iteration 256, loss = 0.17241443
Iteration 257, loss = 0.17279507
Iteration 258, loss = 0.17319396
Iteration 259, loss = 0.17229717
Iteration 260, loss = 0.17310149
Iteration 261, loss = 0.17176741
Iteration 262, loss = 0.17158357
Iteration 263, loss = 0.17191392
Iteration 264, loss = 0.17210143
Iteration 265, loss = 0.17211808
Iteration 266, loss = 0.17084808
Iteration 267, loss = 0.17086043
Iteration 268, loss = 0.17112860
Iteration 269, loss = 0.17184578
Iteration 270, loss = 0.17108999
Iteration 271, loss = 0.17098642
Iteration 272, loss = 0.17077288
Iteration 273, loss = 0.17121797
Iteration 274, loss = 0.17147577
Iteration 275, loss = 0.17049087
Iteration 276, loss = 0.17096830
Iteration 277, loss = 0.17071333
Iteration 278, loss = 0.17076859
Iteration 

Iteration 174, loss = 0.18293863
Iteration 175, loss = 0.18329508
Iteration 176, loss = 0.18352534
Iteration 177, loss = 0.18368467
Iteration 178, loss = 0.18255009
Iteration 179, loss = 0.18264852
Iteration 180, loss = 0.18298456
Iteration 181, loss = 0.18356248
Iteration 182, loss = 0.18228352
Iteration 183, loss = 0.18274494
Iteration 184, loss = 0.18397217
Iteration 185, loss = 0.18283440
Iteration 186, loss = 0.18214872
Iteration 187, loss = 0.18138280
Iteration 188, loss = 0.18206229
Iteration 189, loss = 0.18174267
Iteration 190, loss = 0.18202912
Iteration 191, loss = 0.18202999
Iteration 192, loss = 0.18148447
Iteration 193, loss = 0.18117151
Iteration 194, loss = 0.18063326
Iteration 195, loss = 0.18086532
Iteration 196, loss = 0.18164023
Iteration 197, loss = 0.18068566
Iteration 198, loss = 0.18098390
Iteration 199, loss = 0.18124462
Iteration 200, loss = 0.18053791
Iteration 201, loss = 0.18046231
Iteration 202, loss = 0.17957761
Iteration 203, loss = 0.18053655
Iteration 

Iteration 31, loss = 0.20552219
Iteration 32, loss = 0.20496328
Iteration 33, loss = 0.20412238
Iteration 34, loss = 0.20363095
Iteration 35, loss = 0.20352433
Iteration 36, loss = 0.20341958
Iteration 37, loss = 0.20348741
Iteration 38, loss = 0.20232122
Iteration 39, loss = 0.20225661
Iteration 40, loss = 0.20146634
Iteration 41, loss = 0.20140815
Iteration 42, loss = 0.20035477
Iteration 43, loss = 0.19979352
Iteration 44, loss = 0.19987826
Iteration 45, loss = 0.19922052
Iteration 46, loss = 0.19927265
Iteration 47, loss = 0.19912206
Iteration 48, loss = 0.19777579
Iteration 49, loss = 0.19746240
Iteration 50, loss = 0.19784299
Iteration 51, loss = 0.19733427
Iteration 52, loss = 0.19834248
Iteration 53, loss = 0.19630163
Iteration 54, loss = 0.19588398
Iteration 55, loss = 0.19648793
Iteration 56, loss = 0.19503059
Iteration 57, loss = 0.19539352
Iteration 58, loss = 0.19480287
Iteration 59, loss = 0.19363355
Iteration 60, loss = 0.19429888
Iteration 61, loss = 0.19241274
Iteratio

Iteration 282, loss = 0.13162930
Iteration 283, loss = 0.13232354
Iteration 284, loss = 0.13435933
Iteration 285, loss = 0.13364438
Iteration 286, loss = 0.13113208
Iteration 287, loss = 0.13220038
Iteration 288, loss = 0.13253562
Iteration 289, loss = 0.13189907
Iteration 290, loss = 0.13411658
Iteration 291, loss = 0.13044585
Iteration 292, loss = 0.13165749
Iteration 293, loss = 0.13086019
Iteration 294, loss = 0.13034308
Iteration 295, loss = 0.13107526
Iteration 296, loss = 0.13206147
Iteration 297, loss = 0.13253340
Iteration 298, loss = 0.12958483
Iteration 299, loss = 0.13000208
Iteration 300, loss = 0.12879790
Iteration 301, loss = 0.12954639
Iteration 302, loss = 0.12924742
Iteration 303, loss = 0.13034955
Iteration 304, loss = 0.12979011
Iteration 305, loss = 0.13053122
Iteration 306, loss = 0.12801418
Iteration 307, loss = 0.12995550
Iteration 308, loss = 0.12902951
Iteration 309, loss = 0.13000869
Iteration 310, loss = 0.12927427
Iteration 311, loss = 0.12791597
Iteration 

Iteration 145, loss = 0.14882878
Iteration 146, loss = 0.14944535
Iteration 147, loss = 0.14922673
Iteration 148, loss = 0.14812981
Iteration 149, loss = 0.14754559
Iteration 150, loss = 0.14743546
Iteration 151, loss = 0.14711118
Iteration 152, loss = 0.14834379
Iteration 153, loss = 0.14832298
Iteration 154, loss = 0.14591361
Iteration 155, loss = 0.14544293
Iteration 156, loss = 0.14533795
Iteration 157, loss = 0.14668139
Iteration 158, loss = 0.14443316
Iteration 159, loss = 0.14584922
Iteration 160, loss = 0.14348619
Iteration 161, loss = 0.14390378
Iteration 162, loss = 0.14482668
Iteration 163, loss = 0.14271552
Iteration 164, loss = 0.14389493
Iteration 165, loss = 0.14443954
Iteration 166, loss = 0.14264834
Iteration 167, loss = 0.14179557
Iteration 168, loss = 0.14150406
Iteration 169, loss = 0.14167588
Iteration 170, loss = 0.14209921
Iteration 171, loss = 0.14181892
Iteration 172, loss = 0.14070517
Iteration 173, loss = 0.13935992
Iteration 174, loss = 0.14000892
Iteration 

Iteration 109, loss = 0.16858143
Iteration 110, loss = 0.16882577
Iteration 111, loss = 0.16786015
Iteration 112, loss = 0.16909013
Iteration 113, loss = 0.16827338
Iteration 114, loss = 0.16793405
Iteration 115, loss = 0.16695040
Iteration 116, loss = 0.16703854
Iteration 117, loss = 0.16711084
Iteration 118, loss = 0.16575677
Iteration 119, loss = 0.16528796
Iteration 120, loss = 0.16588271
Iteration 121, loss = 0.16542897
Iteration 122, loss = 0.16467630
Iteration 123, loss = 0.16375427
Iteration 124, loss = 0.16374826
Iteration 125, loss = 0.16317650
Iteration 126, loss = 0.16325070
Iteration 127, loss = 0.16331545
Iteration 128, loss = 0.16357865
Iteration 129, loss = 0.16214951
Iteration 130, loss = 0.16096009
Iteration 131, loss = 0.16162991
Iteration 132, loss = 0.16089965
Iteration 133, loss = 0.15961214
Iteration 134, loss = 0.16000577
Iteration 135, loss = 0.16039013
Iteration 136, loss = 0.15998346
Iteration 137, loss = 0.15899112
Iteration 138, loss = 0.15849732
Iteration 



[CV]  hidden_layer_sizes=(64, 64, 64), learning_rate=constant, total=  50.0s
[CV] hidden_layer_sizes=(64, 64, 64), learning_rate=constant .........
Iteration 1, loss = 0.27112018
Iteration 2, loss = 0.22029094
Iteration 3, loss = 0.21644787
Iteration 4, loss = 0.21540868
Iteration 5, loss = 0.21501156
Iteration 6, loss = 0.21426839
Iteration 7, loss = 0.21369422
Iteration 8, loss = 0.21394044
Iteration 9, loss = 0.21310852
Iteration 10, loss = 0.21219489
Iteration 11, loss = 0.21208231
Iteration 12, loss = 0.21178974
Iteration 13, loss = 0.21076592
Iteration 14, loss = 0.20973732
Iteration 15, loss = 0.20970450
Iteration 16, loss = 0.20861060




[CV]  hidden_layer_sizes=(64, 64, 64), learning_rate=constant, total=   4.5s
[CV] hidden_layer_sizes=(64, 64, 64), learning_rate=constant .........
Iteration 1, loss = 0.27944298
Iteration 2, loss = 0.22545159
Iteration 3, loss = 0.22152250
Iteration 4, loss = 0.22060780
Iteration 5, loss = 0.22005381
Iteration 6, loss = 0.21962768
Iteration 7, loss = 0.21924166
Iteration 8, loss = 0.21889472
Iteration 9, loss = 0.21867534
Iteration 10, loss = 0.21794870
Iteration 11, loss = 0.21771808
Iteration 12, loss = 0.21697924
Iteration 13, loss = 0.21665664
Iteration 14, loss = 0.21631434
Iteration 15, loss = 0.21596737
Iteration 16, loss = 0.21521972
Iteration 17, loss = 0.21442772
Iteration 18, loss = 0.21403365
Iteration 19, loss = 0.21377381
Iteration 20, loss = 0.21292027
Iteration 21, loss = 0.21229092
Iteration 22, loss = 0.21171847
Iteration 23, loss = 0.21101377
Iteration 24, loss = 0.21061803
Iteration 25, loss = 0.21033032
Iteration 26, loss = 0.20961726
Iteration 27, loss = 0.209285

Iteration 249, loss = 0.14160223
Iteration 250, loss = 0.14132450
Iteration 251, loss = 0.14104346
Iteration 252, loss = 0.13989801
Iteration 253, loss = 0.14169607
Iteration 254, loss = 0.14157201
Iteration 255, loss = 0.14085691
Iteration 256, loss = 0.13971462
Iteration 257, loss = 0.13939176
Iteration 258, loss = 0.13972223
Iteration 259, loss = 0.14077287
Iteration 260, loss = 0.13990293
Iteration 261, loss = 0.13926337
Iteration 262, loss = 0.13856078
Iteration 263, loss = 0.13873035
Iteration 264, loss = 0.14272488
Iteration 265, loss = 0.14078976
Iteration 266, loss = 0.14030420
Iteration 267, loss = 0.13986931
Iteration 268, loss = 0.14085776
Iteration 269, loss = 0.13748969
Iteration 270, loss = 0.13822947
Iteration 271, loss = 0.13686530
Iteration 272, loss = 0.13616575
Iteration 273, loss = 0.13784127
Iteration 274, loss = 0.13818781
Iteration 275, loss = 0.13692580
Iteration 276, loss = 0.13685067
Iteration 277, loss = 0.13972064
Iteration 278, loss = 0.13688000
Iteration 



[CV]  hidden_layer_sizes=(64, 64, 64), learning_rate=adaptive, total=   6.9s
[CV] hidden_layer_sizes=(64, 64, 64), learning_rate=adaptive .........
Iteration 1, loss = 0.35481696
Iteration 2, loss = 0.22936476
Iteration 3, loss = 0.22399871
Iteration 4, loss = 0.22337091
Iteration 5, loss = 0.22261523
Iteration 6, loss = 0.22224596
Iteration 7, loss = 0.22158103
Iteration 8, loss = 0.22150451
Iteration 9, loss = 0.22119142
Iteration 10, loss = 0.22037666
Iteration 11, loss = 0.22010300
Iteration 12, loss = 0.21940108
Iteration 13, loss = 0.21904189
Iteration 14, loss = 0.21872194
Iteration 15, loss = 0.21795867
Iteration 16, loss = 0.21805599
Iteration 17, loss = 0.21716263
Iteration 18, loss = 0.21633298
Iteration 19, loss = 0.21659343
Iteration 20, loss = 0.21549661
Iteration 21, loss = 0.21474561
Iteration 22, loss = 0.21492138
Iteration 23, loss = 0.21396217
Iteration 24, loss = 0.21307799
Iteration 25, loss = 0.21373568
Iteration 26, loss = 0.21220239
Iteration 27, loss = 0.211701

## Predict Post Classifier: Train

In [90]:
# Post-level predictions on the training features, made by whichever classifier
# is currently bound to `p_clf`. The result is indexed positionally further
# down, so it follows the row order of X_train.
y_pred_train = p_clf.test(X_train)

In [91]:
# Post-level metrics. These are computed on the same data the classifier was
# trained on, so they are not an estimate of held-out performance.
p_clf.get_metrics(y_train, y_pred_train)

{'accuracy': 0.7092808219178082,
 'precision': 0.16769483857701006,
 'recall': 0.8054320987654321,
 'f1': 0.27759339630669727}

In [92]:
# Compare the sum of the predicted labels with the sum of the true labels; with
# 0/1 labels these are the numbers of predicted vs. actual positive posts.
print(sum(y_pred_train))
print(sum(y_train))

9726
2025


# User Classifier: Train

## Aggregate: Train

In [93]:
# Collapse the labels (a, b, c, d, control) into a binary target per user:
# 1 when the label is 'd', otherwise 0. A user with several posts ends up with
# the value derived from the last post visited.
user_to_y_train = defaultdict(int)
# The loop variable is named `post_id` rather than `data`, so the topic table
# stored in `data` by the "Topic Model Outputs" cell is not overwritten.
for post_id in tqdm.tqdm(filtered_data.keys()):
    post_entry = filtered_data[post_id]
    user_to_y_train[post_entry[0]] = (1 if post_entry[2] == 'd' else 0)

100%|██████████| 29200/29200 [00:00<00:00, 1048459.31it/s]


In [94]:
# Map every post id to [author id, predicted post label]. Predictions are looked
# up by position, following the iteration order of vector_train's keys.
post_to_uypred_train = defaultdict(list)
post_to_uypred_train.update(
    (post_id, [filtered_data[post_id][0], y_pred_train[i]])
    for i, post_id in enumerate(vector_train.keys())
)

In [95]:
# Aggregate the post-level predictions by user. Judging by the message it
# prints, the helper also saves the aggregation (presumably under FOLDERPATH --
# confirm in aggregate.py).
user_to_post_label_train = aggregate.aggregate_posts(FOLDERPATH, post_to_uypred_train)

Saved aggreagation of user to post labels...


## Argmax: Train

In [96]:
# Produce one prediction per user from that user's aggregated post labels using
# the classifier's argmax rule (implemented in user_classifier, not shown here).
# The result is indexed by user id in the next cell.
u_clf_train = user_classifier.UserClassification(user_to_post_label_train)
user_to_ypred_train = u_clf_train.argmax()

In [97]:
# Build aligned lists of true and predicted user labels, in the iteration order
# of user_to_ypred_train. user_to_y_train is a defaultdict(int), so a user
# without a recorded label contributes 0.
user_y_train = [user_to_y_train[uid] for uid in user_to_ypred_train]
user_y_pred_train = [user_to_ypred_train[uid] for uid in user_to_ypred_train]

In [98]:
# User-level metrics on the training users (same data the pipeline was fit on).
u_clf_train.get_metrics(user_y_train, user_y_pred_train)

{'accuracy': 0.7214452214452215,
 'precision': 0.3967828418230563,
 'recall': 0.9135802469135802,
 'f1': 0.5532710280373832}

# Process Data: Test

In [35]:
# Raw test-set inputs. USERPATH2 is defined for symmetry with the training
# section but is not referenced anywhere in this cell.
POSTPATH2 = './Data/crowd/test/shared_task_posts_test.csv'
LABELPATH2 = './Data/crowd/test/crowd_test_C.csv'
USERPATH2 = './Data/crowd/test/task_C_test.posts.csv'

# Same pipeline as the (commented-out) training preprocessing, except that no
# user_subset is passed to the loaders here.
user_to_post_test, post_to_words_test, post_to_metadata_test = dataloader.load_posts(POSTPATH2, append_title = True)
post_to_label_test = dataloader.load_classification(LABELPATH2, user_to_post_test, post_to_words_test, post_to_metadata_test)
filtered_data_test, sw_posts_test, sw_timestamps_test = dataloader.filter_posts(post_to_label_test, post_to_metadata_test)
# Post count before ...
print(len(filtered_data_test))
filtered_data_test = dataloader.filter_near_SW(filtered_data_test, post_to_metadata_test, sw_timestamps_test)
# ... and after the filter_near_SW step.
print(len(filtered_data_test))

# Stop-word filtering is applied to both the filtered posts and the SW posts.
filtered_data_test = dataloader.filter_stopwords(filtered_data_test)
sw_posts_test = dataloader.filter_stopwords(sw_posts_test)

 13%|█▎        | 1910/14447 [00:00<00:01, 7915.41it/s]

Tokenizing sentences...


100%|██████████| 14447/14447 [00:01<00:00, 10524.51it/s]
  1%|          | 130/14447 [00:00<00:11, 1290.87it/s]

Normalizing...
Tokenizing sentences into words...


100%|██████████| 14447/14447 [00:06<00:00, 2244.42it/s]
100%|██████████| 14056/14056 [00:00<00:00, 946112.96it/s]

249
14056
Filtering posts far away from SW posts...
5637





In [36]:
# Persist the processed test split so later sessions can skip the processing
# cell above and go straight to the load cell below.
FOLDERPATH2 = './Processing/crowd_processed_test/'
dataloader.save_to_folder(FOLDERPATH2, user_to_post_test, post_to_metadata_test, filtered_data_test, sw_posts_test, sw_timestamps_test)

# Load Process Data: Test

In [37]:
# Reload the processed test split from disk. FOLDERPATH2 is set again here so
# this cell works without running the processing/save cells first.
FOLDERPATH2 = './Processing/crowd_processed_test/'
user_to_post_test, post_to_metadata_test, filtered_data_test, sw_posts_test, sw_timestamps_test = dataloader.load_from_folder(FOLDERPATH2)

In [38]:
# Label distribution of the filtered test posts: one count per label in the
# order a, b, c, d, followed by the total number of posts.
for risk_label in ('a', 'b', 'c', 'd'):
    print(sum(1 for post_record in filtered_data_test.values() if post_record[2] == risk_label))
print(len(filtered_data_test))

4922
58
257
400
5637


# Feature Extraction: Test

## sLDA

In [39]:
# Feature matrix and gold labels for the test posts, inferred with `model`.
# NOTE(review): `model` is presumably the sLDA model trained on the training
# split, and the posts are presumably read back from FOLDERPATH2 — confirm in slda.py.
X_test, y_test = slda.vectorize_data_set(model, FOLDERPATH2)

  0%|          | 3/5637 [00:00<03:31, 26.65it/s]

Getting topic distributions...


100%|██████████| 5637/5637 [03:13<00:00, 29.06it/s]


In [40]:
# Per-post topic vectors; the keys are used later to map predictions back to post ids.
# NOTE(review): the log shows the same "Getting topic distributions..." pass over
# 5637 posts (~3 min) as the vectorize_data_set cell above, so this appears to
# repeat that inference — consider deriving X_test from vector_test instead.
vector_test = slda.get_topic_vecs(model, filtered_data_test)

  0%|          | 3/5637 [00:00<04:13, 22.19it/s]

Getting topic distributions...


100%|██████████| 5637/5637 [03:12<00:00, 29.22it/s]


# Post Classifier: Test

## Predict Post Classifier: Test

In [99]:
# Apply the post-level classifier to the test features; one predicted label per
# row of X_test.
y_pred_test = p_clf.test(X_test)

In [100]:
# Post-level accuracy / precision / recall / F1 on the test split.
p_clf.get_metrics(y_test, y_pred_test)

{'accuracy': 0.5887883626042221,
 'precision': 0.09260832625318606,
 'recall': 0.545,
 'f1': 0.15831517792302105}

In [101]:
# Number of posts predicted positive, then number of truly positive posts.
# np.sum is used instead of the builtin sum because y_test is not flat: the
# builtin returned a one-element array (printed as `[400]`), whereas np.sum
# reduces over all axes and prints a plain scalar, matching the train cell.
print(np.sum(y_pred_test))
print(np.sum(y_test))

2354
[400]


# User Classifier: Test

## Aggregate: Test

In [102]:
# Binarise the gold label: users labelled 'd' become 1, every other label
# (a, b, c, control) becomes 0.
# Each post record carries its author at index 0 and a label at index 2; a user
# with several posts is simply overwritten, so the last post seen decides.
user_to_y_test = defaultdict(int)
for post_record in tqdm.tqdm(filtered_data_test.values()):
    user_to_y_test[post_record[0]] = int(post_record[2] == 'd')

100%|██████████| 5637/5637 [00:00<00:00, 1238776.68it/s]


In [103]:
# Sanity check: number of filtered test posts (should equal len(vector_test)).
len(filtered_data_test)

5637

In [104]:
# Sanity check: number of topic vectors (should equal len(filtered_data_test)).
len(vector_test)

5637

In [105]:
# Build post_id -> [user_id, predicted_label] for the test posts.
#
# y_pred_test is positional, so the pairing below is only correct if
# vector_test iterates in the same order as the rows of X_test that produced
# the predictions (the two come from separate slda calls; not checkable from
# this cell). A length mismatch would otherwise silently mis-pair posts and
# labels, so fail fast on it.
assert len(vector_test) == len(y_pred_test), (
    f"{len(vector_test)} post vectors but {len(y_pred_test)} predictions"
)

post_to_uypred_test = defaultdict(list)
for post_id, post_pred in zip(vector_test.keys(), y_pred_test):
    user_id = filtered_data_test[post_id][0]  # index 0 of a post record is its author
    post_to_uypred_test[post_id] = [user_id, post_pred]

In [106]:
# Regroup the per-post test predictions by user. The helper also persists its
# result (it logs a "Saved ..." message); presumably under FOLDERPATH2 — confirm in aggregate.py.
user_to_post_label_test = aggregate.aggregate_posts(FOLDERPATH2, post_to_uypred_test)

Saved aggreagation of user to post labels...


## Argmax: Test

In [107]:
# Wrap the per-user post labels and reduce them to one prediction per user.
# NOTE(review): argmax() presumably picks each user's most frequent post label —
# confirm against user_classifier.UserClassification.
u_clf_test = user_classifier.UserClassification(user_to_post_label_test)
user_to_ypred_test = u_clf_test.argmax()

In [108]:
# Line up gold and predicted user labels in the iteration order of
# user_to_ypred_test so the two lists can be scored element-wise.
# user_to_y_test is a defaultdict(int): a user without a gold entry reads as 0.
user_y_test = [user_to_y_test[uid] for uid in user_to_ypred_test]
user_y_pred_test = [user_to_ypred_test[uid] for uid in user_to_ypred_test]

In [109]:
# User-level accuracy / precision / recall / F1 for the argmax aggregation on
# the test split.
u_clf_test.get_metrics(user_y_test, user_y_pred_test)

{'accuracy': 0.5981308411214953,
 'precision': 0.24742268041237114,
 'recall': 0.6486486486486487,
 'f1': 0.3582089552238806}

## Threshold: Test

In [None]:
# Fresh classifier wrapper for the threshold-based variant; this rebinds
# u_clf_test, replacing the instance used in the argmax section.
u_clf_test = user_classifier.UserClassification(user_to_post_label_test)

In [None]:
# Search for a decision threshold using the gold user labels.
# NOTE(review): the labels passed here are the *test* labels. If the threshold
# found is then used to score the test split, the reported metrics are tuned on
# the evaluation data — consider choosing the threshold on the training split.
u_clf_test.find_threshold(user_to_y_test)

In [None]:
# Threshold-based user predictions; this rebinds user_to_ypred_test, replacing
# the argmax predictions computed earlier.
# NOTE(review): the argument 1 is presumably the minimum number of positive
# posts needed to flag a user — confirm in user_classifier and consider naming it.
user_to_ypred_test = u_clf_test.minimum(1)

In [None]:
# Rebuild the aligned gold / predicted label lists for the threshold-based
# predictions, in the iteration order of user_to_ypred_test.
# user_to_y_test is a defaultdict(int): a user without a gold entry reads as 0.
user_y_test = [user_to_y_test[uid] for uid in user_to_ypred_test]
user_y_pred_test = [user_to_ypred_test[uid] for uid in user_to_ypred_test]

In [None]:
# User-level accuracy / precision / recall / F1 for the threshold-based
# aggregation on the test split.
u_clf_test.get_metrics(user_y_test, user_y_pred_test)