In [None]:
import json
import pprint

import numpy as np
import matplotlib.pyplot as plt
from gensim.utils import simple_preprocess
from gensim import corpora, models
import pandas as pd
from sklearn.manifold import TSNE
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

import sys
sys.path.insert(1, '../src/utils/')

from vectorize import preprocessing

# Variables


In [None]:
num_topics = 2
use_title = True
stemming = True
lemmatization = True
lib = "gensim"

# Data loading and preparation

## load

In [None]:
with open('..\\src\\data\\data_jmlr_vol17.json') as f:
    data = json.load(f)
data_df = pd.json_normalize(data['papers'])
corpus = data_df["abstract"]
if use_title:
    corpus = data_df["title"] + " " + corpus 


## preprocess

In [None]:
tokenized = preprocessing(
    corpus,
    lib=lib,
    stemming=stemming,
    lemmatization=lemmatization,
    min_word_len=2,
    max_word_len=15
)

In [None]:
dictionary = corpora.Dictionary(tokenized) 
BoW_corpus = [dictionary.doc2bow(text) for text in tokenized]

In [None]:
tfidf = models.TfidfModel(BoW_corpus)
corpus_tfidf = tfidf[BoW_corpus]

# LSI

## TFIDF

In [125]:
lsi_tfidf = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)# train model
lsi_tfidf[corpus_tfidf[1]]  # apply model to  document

  and should_run_async(code)


[(0, 0.19748731304362163), (1, 0.012308062091104625)]

## Bag of Words

In [126]:
lsi_bow = models.LsiModel(BoW_corpus, id2word=dictionary, num_topics=num_topics)
lsi_bow[BoW_corpus[1]]  # apply model to  document

  and should_run_async(code)


[(0, 4.116027747143777), (1, 0.48894783052050245)]

# LDA

## init

In [127]:
# LDA model training 
lda_model = models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                           id2word=dictionary,
                                           num_topics=num_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

  and should_run_async(code)


# Inspect

## Keywords

In [128]:
for keyword in data_df["keywords"]:
    print(keyword)

['multi-armed bandit', 'best-arm identification', 'pure exploration', 'information-     theoretic divergences', '']
['dictionary learning', 'multi-resolution analysis', 'manifold learning', 'robustness', 'sparsityc 2016 Mauro Maggioni', 'Stanislav Minsker', '']
['clustering', 'time series', 'ergodicity', '']
['feature rotation', 'ensemble diversity', '']
['statistical comparison', 'Friedman test', '']
['permutation estimation', 'minimax rate of separation', '']
['Markov chain Monte Carlo', 'Langevin dynamics', '']
['deep learning', 'neural networks', 'optimization', 'evolution of culture', 'curriculum     learning', '']
['string kernels', 'string distances', '']
['Gibbs sampling', 'herding', '']
[]
['Subspace clustering', 'robustness', 'stability', 'compressive sensing', '']
['Reinforcement learning', 'Markov decision processes', 'variance estimation', 'simulation', '']
['Statistical consistency', 'multiclass loss', 'loss matrix', 'surrogate loss', 'convex     surrogates', 'calibrated 

  and should_run_async(code)


In [129]:
print(f"The Dataset contains {len(data_df)} Papers")
count_keywords = 0
all_keywords = []
for keyword in data_df["keywords"]:
    if keyword and keyword[0]:
        count_keywords += 1
        all_keywords = all_keywords + keyword
print(f"{count_keywords} of them contain Keywords.")
print(f"There are {len(all_keywords)} Keywords. {len(set(all_keywords))} of them are unique.")


The Dataset contains 236 Papers
203 of them contain Keywords.
There are 1023 Keywords. 707 of them are unique.


  and should_run_async(code)


## Corpus

In [130]:
pprint.pprint(dictionary.token2id)#token -> tokenId.

  and should_run_async(code)


{"b'abernethy/jj": 2340,
 "b'ability/nn": 1203,
 "b'able/jj": 977,
 "b'abnormal/jj": 2844,
 "b'absence/nn": 2078,
 "b'absent/jj": 2845,
 "b'abstraction/nn": 0,
 "b'accelerate/nn": 2050,
 "b'accelerate/vb": 2309,
 "b'accelerometer/nn": 2846,
 "b'accelerometry/nn": 2847,
 "b'accept/vb": 2409,
 "b'acceptance/nn": 298,
 "b'access/nn": 1465,
 "b'accessible/jj": 2174,
 "b'accident/nn": 3118,
 "b'accommodate/vb": 2029,
 "b'accompany/vb": 860,
 "b'accomplish/vb": 2979,
 "b'accord/vb": 766,
 "b'accordance/nn": 2701,
 "b'account/nn": 654,
 "b'accuracy/nn": 153,
 "b'accurate/jj": 154,
 "b'accurately/rb": 1646,
 "b'achievable/jj": 1,
 "b'achieve/vb": 155,
 "b'acquire/vb": 1721,
 "b'acquisition/nn": 2610,
 "b'act/nn": 1915,
 "b'action/nn": 707,
 "b'activation/nn": 2433,
 "b'active/jj": 1910,
 "b'activity/nn": 2332,
 "b'actor/nn": 1546,
 "b'actual/jj": 708,
 "b'acyclic/jj": 926,
 "b'ad/nn": 2903,
 "b'adaboost/rb": 1596,
 "b'adaframe/jj": 2434,
 "b'adapt/jj": 1547,
 "b'adapt/vb": 1370,
 "b'adaptation

 "b'feasible/jj": 1407,
 "b'feature/nn": 172,
 "b'feedback/nn": 996,
 "b'feedforward/rb": 2325,
 "b'feller/nn": 3043,
 "b'fewer/jj": 2446,
 "b'field/nn": 1097,
 "b'file/nn": 2194,
 "b'filter/nn": 1534,
 "b'filter/vb": 1452,
 "b'final/jj": 2491,
 "b'finally/rb": 267,
 "b'finance/nn": 573,
 "b'financial/jj": 1074,
 "b'find/vb": 1897,
 "b'finding/nn": 118,
 "b'fine/nn": 3158,
 "b'finer/jj": 913,
 "b'finite/jj": 638,
 "b'first/jj": 1276,
 "b'firstly/rb": 2355,
 "b'fisher/jj": 1501,
 "b'fit/jj": 897,
 "b'fit/vb": 1906,
 "b'fitting/jj": 2646,
 "b'fix/vb": 34,
 "b'flag/nn": 3081,
 "b'flat/jj": 1960,
 "b'flda/nn": 2714,
 "b'flexible/jj": 1030,
 "b'flexibly/rb": 2266,
 "b'flip/nn": 1976,
 "b'flow/nn": 2267,
 "b'fm/nn": 2792,
 "b'fmq/nn": 1791,
 "b'fmri/nn": 2212,
 "b'focal/jj": 2150,
 "b'focus/nn": 734,
 "b'focus/vb": 1321,
 "b'focuse/vb": 1341,
 "b'fogd/nn": 1249,
 "b'fold/nn": 2690,
 "b'follow/jj": 799,
 "b'follow/vb": 119,
 "b'foot/vb": 1357,
 "b'forbid/vb": 2456,
 "b'force/nn": 2020,
 "b'fo

 "b'performance/nn": 66,
 "b'periodic/jj": 2986,
 "b'permit/vb": 950,
 "b'permutation/nn": 284,
 "b'perrault/jj": 2519,
 "b'persistent/jj": 2866,
 "b'person/nn": 1821,
 "b'personal/jj": 672,
 "b'personalize/vb": 1822,
 "b'perspective/nn": 826,
 "b'pertain/vb": 2185,
 "b'perturbation/nn": 2588,
 "b'perturbed/jj": 2580,
 "b'pesc/nn": 2617,
 "b'pessimistic/jj": 2944,
 "b'petuum/nn": 1713,
 "b'pfa/nn": 2815,
 "b'pgdc/nn": 2501,
 "b'phase/nn": 285,
 "b'phc/nn": 1800,
 "b'phenomenon/nn": 1263,
 "b'phenotyp/vb": 2816,
 "b'phenotype/nn": 2817,
 "b'phone/nn": 2734,
 "b'physics/nn": 3011,
 "b'pick/nn": 1676,
 "b'pico/nn": 2377,
 "b'picture/nn": 2097,
 "b'pipeline/nn": 1055,
 "b'pixel/nn": 2450,
 "b'place/nn": 137,
 "b'plain/rb": 2200,
 "b'plan/vb": 1929,
 "b'planar/jj": 3012,
 "b'planarity/nn": 3013,
 "b'platform/nn": 1056,
 "b'play/nn": 1662,
 "b'play/vb": 951,
 "b'plug/nn": 2998,
 "b'pmc/nn": 1303,
 "b'point/nn": 138,
 "b'pointwise/nn": 2765,
 "b'poisson/nn": 1381,
 "b'policy/nn": 580,
 "b'pol

 "b'vito/nn": 2559,
 "b'vmf/nn": 2935,
 "b'volume/nn": 3094,
 "b'volumetric/jj": 2256,
 "b'von/nn": 2936,
 "b'voting/nn": 2420,
 "b'walk/nn": 959,
 "b'wall/nn": 2621,
 "b'wang/vb": 1609,
 "b'want/vb": 688,
 "b'warm/jj": 3075,
 "b'wasting/nn": 2074,
 "b'wavelet/vb": 2439,
 "b'way/nn": 150,
 "b'wc/vb": 3006,
 "b'weak/jj": 151,
 "b'weak\\xc3\\xa2/vb": 1398,
 "b'weakly/rb": 2104,
 "b'wealth/nn": 765,
 "b'web/nn": 1720,
 "b'weight/nn": 1384,
 "b'weight/vb": 632,
 "b'weighting/nn": 728,
 "b'weka/vb": 457,
 "b'whilst/vb": 960,
 "b'white/jj": 3047,
 "b'wide/jj": 1066,
 "b'widely/rb": 1229,
 "b'widespread/jj": 1270,
 "b'width/nn": 2644,
 "b'wifi/nn": 2114,
 "b'wilcoxon/nn": 246,
 "b'wilk/nn": 1514,
 "b'window/nn": 3095,
 "b'winner/nn": 2454,
 "b'wise/jj": 1464,
 "b'wisely/rb": 3027,
 "b'wish/nn": 2991,
 "b'wishart/nn": 2970,
 "b'wolf/nn": 1805,
 "b'word/nn": 247,
 "b'work/nn": 152,
 "b'work/vb": 689,
 "b'worker/nn": 2015,
 "b'workload/nn": 2075,
 "b'world/nn": 652,
 "b'worst/jj": 812,
 "b'wrang

In [131]:
pprint.pprint(dictionary.dfs) # token_id -> how many documents contain this token.

  and should_run_async(code)


{0: 1,
 1: 2,
 2: 20,
 3: 11,
 4: 70,
 5: 4,
 6: 4,
 7: 3,
 8: 23,
 9: 10,
 10: 6,
 11: 8,
 12: 2,
 13: 12,
 14: 21,
 15: 25,
 16: 31,
 17: 4,
 18: 46,
 19: 5,
 20: 21,
 21: 4,
 22: 47,
 23: 15,
 24: 1,
 25: 2,
 26: 9,
 27: 43,
 28: 5,
 29: 39,
 30: 48,
 31: 1,
 32: 36,
 33: 1,
 34: 15,
 35: 47,
 36: 5,
 37: 31,
 38: 41,
 39: 7,
 40: 7,
 41: 8,
 42: 16,
 43: 41,
 44: 15,
 45: 9,
 46: 33,
 47: 41,
 48: 6,
 49: 29,
 50: 114,
 51: 3,
 52: 17,
 53: 17,
 54: 19,
 55: 52,
 56: 13,
 57: 23,
 58: 15,
 59: 89,
 60: 25,
 61: 3,
 62: 9,
 63: 26,
 64: 23,
 65: 34,
 66: 68,
 67: 31,
 68: 2,
 69: 4,
 70: 84,
 71: 8,
 72: 2,
 73: 9,
 74: 7,
 75: 79,
 76: 10,
 77: 8,
 78: 3,
 79: 6,
 80: 38,
 81: 16,
 82: 1,
 83: 32,
 84: 4,
 85: 21,
 86: 8,
 87: 10,
 88: 20,
 89: 2,
 90: 4,
 91: 6,
 92: 33,
 93: 8,
 94: 8,
 95: 40,
 96: 8,
 97: 13,
 98: 5,
 99: 71,
 100: 40,
 101: 8,
 102: 10,
 103: 2,
 104: 27,
 105: 15,
 106: 12,
 107: 14,
 108: 29,
 109: 25,
 110: 17,
 111: 125,
 112: 11,
 113: 1,
 114: 34,
 115: 

 1168: 4,
 1169: 4,
 1170: 11,
 1171: 3,
 1172: 8,
 1173: 6,
 1174: 5,
 1175: 1,
 1176: 1,
 1177: 4,
 1178: 1,
 1179: 7,
 1180: 1,
 1181: 5,
 1182: 4,
 1183: 7,
 1184: 1,
 1185: 7,
 1186: 5,
 1187: 8,
 1188: 3,
 1189: 1,
 1190: 1,
 1191: 2,
 1192: 3,
 1193: 2,
 1194: 6,
 1195: 1,
 1196: 6,
 1197: 2,
 1198: 3,
 1199: 2,
 1200: 1,
 1201: 9,
 1202: 7,
 1203: 4,
 1204: 3,
 1205: 8,
 1206: 6,
 1207: 2,
 1208: 4,
 1209: 4,
 1210: 8,
 1211: 4,
 1212: 5,
 1213: 15,
 1214: 3,
 1215: 2,
 1216: 1,
 1217: 2,
 1218: 4,
 1219: 1,
 1220: 5,
 1221: 2,
 1222: 1,
 1223: 5,
 1224: 2,
 1225: 1,
 1226: 5,
 1227: 3,
 1228: 2,
 1229: 12,
 1230: 10,
 1231: 1,
 1232: 1,
 1233: 6,
 1234: 8,
 1235: 3,
 1236: 6,
 1237: 1,
 1238: 1,
 1239: 1,
 1240: 5,
 1241: 25,
 1242: 6,
 1243: 10,
 1244: 7,
 1245: 7,
 1246: 1,
 1247: 1,
 1248: 8,
 1249: 1,
 1250: 2,
 1251: 1,
 1252: 1,
 1253: 3,
 1254: 6,
 1255: 2,
 1256: 3,
 1257: 3,
 1258: 6,
 1259: 5,
 1260: 1,
 1261: 7,
 1262: 4,
 1263: 2,
 1264: 4,
 1265: 1,
 1266: 6,
 126

 2249: 4,
 2250: 1,
 2251: 1,
 2252: 1,
 2253: 5,
 2254: 1,
 2255: 1,
 2256: 1,
 2257: 1,
 2258: 3,
 2259: 1,
 2260: 1,
 2261: 1,
 2262: 2,
 2263: 3,
 2264: 3,
 2265: 1,
 2266: 1,
 2267: 1,
 2268: 2,
 2269: 1,
 2270: 3,
 2271: 1,
 2272: 1,
 2273: 5,
 2274: 1,
 2275: 1,
 2276: 1,
 2277: 5,
 2278: 5,
 2279: 1,
 2280: 1,
 2281: 1,
 2282: 1,
 2283: 2,
 2284: 2,
 2285: 1,
 2286: 1,
 2287: 1,
 2288: 1,
 2289: 1,
 2290: 1,
 2291: 1,
 2292: 1,
 2293: 2,
 2294: 1,
 2295: 1,
 2296: 2,
 2297: 1,
 2298: 1,
 2299: 1,
 2300: 1,
 2301: 2,
 2302: 1,
 2303: 1,
 2304: 2,
 2305: 1,
 2306: 1,
 2307: 1,
 2308: 1,
 2309: 2,
 2310: 1,
 2311: 9,
 2312: 3,
 2313: 1,
 2314: 1,
 2315: 2,
 2316: 2,
 2317: 2,
 2318: 1,
 2319: 1,
 2320: 4,
 2321: 2,
 2322: 4,
 2323: 1,
 2324: 2,
 2325: 1,
 2326: 10,
 2327: 3,
 2328: 2,
 2329: 1,
 2330: 1,
 2331: 1,
 2332: 3,
 2333: 2,
 2334: 2,
 2335: 2,
 2336: 1,
 2337: 1,
 2338: 3,
 2339: 1,
 2340: 1,
 2341: 1,
 2342: 1,
 2343: 1,
 2344: 1,
 2345: 2,
 2346: 1,
 2347: 2,
 2348: 2,

In [132]:
pprint.pprint(BoW_corpus)# list of (token_id, token_count) 

  and should_run_async(code)


[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 4),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 3),
  (18, 1),
  (19, 1),
  (20, 4),
  (21, 4),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 7),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 2),
  (50, 1),
  (51, 3),
  (52, 1),
  (53, 1),
  (54, 2),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 3),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 2),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 2),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 3),
  (81, 2),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 1)

  (75, 1),
  (104, 1),
  (111, 3),
  (140, 1),
  (141, 2),
  (150, 1),
  (152, 2),
  (158, 1),
  (160, 1),
  (172, 1),
  (180, 1),
  (184, 1),
  (192, 1),
  (198, 1),
  (199, 1),
  (200, 3),
  (215, 2),
  (238, 1),
  (262, 2),
  (274, 1),
  (275, 1),
  (289, 2),
  (291, 1),
  (350, 1),
  (385, 1),
  (394, 1),
  (398, 1),
  (399, 1),
  (455, 1),
  (475, 2),
  (501, 1),
  (502, 1),
  (506, 1),
  (543, 3),
  (548, 1),
  (626, 1),
  (638, 1),
  (748, 1),
  (763, 1),
  (783, 1),
  (797, 1),
  (805, 1),
  (834, 1),
  (1040, 2),
  (1076, 7),
  (1100, 1),
  (1104, 1),
  (1119, 1),
  (1120, 1),
  (1121, 1),
  (1122, 1),
  (1123, 1),
  (1124, 1),
  (1125, 1),
  (1126, 1),
  (1127, 1),
  (1128, 1),
  (1129, 1),
  (1130, 4),
  (1131, 2),
  (1132, 1),
  (1133, 1),
  (1134, 1),
  (1135, 1),
  (1136, 1),
  (1137, 1)],
 [(15, 2),
  (16, 1),
  (18, 1),
  (20, 1),
  (49, 1),
  (50, 2),
  (54, 1),
  (131, 1),
  (133, 1),
  (140, 1),
  (152, 1),
  (170, 1),
  (249, 1),
  (253, 1),
  (287, 2),
  (295, 1),


  (247, 1),
  (286, 1),
  (291, 3),
  (400, 1),
  (416, 1),
  (447, 1),
  (475, 1),
  (544, 1),
  (561, 1),
  (679, 1),
  (748, 1),
  (758, 1),
  (850, 1),
  (858, 2),
  (1075, 1),
  (1077, 1),
  (1194, 2),
  (1198, 1),
  (1366, 3),
  (1515, 1),
  (1516, 1),
  (1517, 1),
  (1518, 1),
  (1519, 1),
  (1520, 1),
  (1521, 1),
  (1522, 1),
  (1523, 1),
  (1524, 1),
  (1525, 2),
  (1526, 1),
  (1527, 1)],
 [(4, 1),
  (46, 1),
  (50, 1),
  (56, 3),
  (57, 1),
  (109, 1),
  (111, 3),
  (140, 1),
  (144, 1),
  (153, 1),
  (157, 3),
  (160, 1),
  (163, 1),
  (180, 2),
  (196, 3),
  (200, 1),
  (203, 1),
  (232, 1),
  (291, 1),
  (346, 1),
  (349, 1),
  (353, 1),
  (355, 1),
  (385, 2),
  (398, 3),
  (399, 2),
  (416, 1),
  (444, 1),
  (445, 1),
  (448, 1),
  (449, 1),
  (472, 1),
  (485, 1),
  (489, 2),
  (514, 1),
  (538, 1),
  (601, 1),
  (698, 1),
  (714, 3),
  (776, 1),
  (799, 1),
  (849, 1),
  (881, 1),
  (905, 1),
  (911, 1),
  (939, 1),
  (1010, 2),
  (1274, 1),
  (1321, 1),
  (1528, 1),

  (104, 1),
  (140, 1),
  (143, 4),
  (180, 2),
  (182, 1),
  (197, 2),
  (199, 2),
  (204, 1),
  (277, 2),
  (283, 1),
  (291, 4),
  (324, 1),
  (362, 1),
  (368, 1),
  (370, 1),
  (372, 1),
  (382, 1),
  (398, 1),
  (410, 1),
  (417, 1),
  (444, 1),
  (456, 1),
  (478, 1),
  (498, 1),
  (503, 2),
  (506, 1),
  (524, 1),
  (541, 1),
  (551, 1),
  (580, 10),
  (586, 1),
  (627, 1),
  (652, 3),
  (703, 1),
  (707, 2),
  (768, 2),
  (776, 1),
  (807, 1),
  (836, 1),
  (858, 1),
  (1005, 1),
  (1066, 1),
  (1163, 1),
  (1164, 2),
  (1201, 1),
  (1213, 1),
  (1223, 1),
  (1244, 7),
  (1259, 1),
  (1293, 1),
  (1315, 2),
  (1369, 1),
  (1416, 1),
  (1463, 1),
  (1556, 1),
  (1786, 1),
  (1892, 1),
  (1893, 1),
  (1894, 1),
  (1895, 1),
  (1896, 1),
  (1897, 1),
  (1898, 2),
  (1899, 1),
  (1900, 1),
  (1901, 1),
  (1902, 1),
  (1903, 1)],
 [(22, 1),
  (59, 2),
  (66, 1),
  (111, 2),
  (133, 1),
  (140, 2),
  (141, 1),
  (150, 1),
  (157, 1),
  (180, 1),
  (192, 1),
  (196, 1),
  (197, 1),
 

  (2258, 1),
  (2259, 1),
  (2260, 1),
  (2261, 3),
  (2262, 1),
  (2263, 1)],
 [(8, 1),
  (15, 1),
  (26, 2),
  (29, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (50, 2),
  (54, 1),
  (59, 6),
  (60, 3),
  (64, 1),
  (70, 1),
  (75, 1),
  (111, 4),
  (116, 1),
  (126, 1),
  (138, 1),
  (140, 1),
  (141, 1),
  (143, 1),
  (147, 1),
  (160, 1),
  (177, 1),
  (180, 2),
  (184, 1),
  (215, 1),
  (256, 1),
  (266, 1),
  (270, 1),
  (316, 1),
  (320, 1),
  (392, 1),
  (402, 1),
  (407, 1),
  (468, 1),
  (583, 1),
  (626, 2),
  (630, 3),
  (637, 1),
  (644, 1),
  (652, 1),
  (669, 1),
  (694, 1),
  (701, 1),
  (742, 1),
  (755, 1),
  (776, 1),
  (849, 5),
  (850, 1),
  (883, 1),
  (894, 1),
  (908, 1),
  (929, 1),
  (1030, 1),
  (1035, 1),
  (1126, 1),
  (1164, 1),
  (1192, 1),
  (1202, 2),
  (1209, 3),
  (1211, 2),
  (1503, 1),
  (1547, 1),
  (1684, 1),
  (1833, 1),
  (1919, 1),
  (1934, 1),
  (2264, 1),
  (2265, 1),
  (2266, 1),
  (2267, 1),
  (2268, 1),
  (2269, 1),
  (2270, 1),
  (2271, 1),
  

  (209, 1),
  (215, 1),
  (216, 1),
  (249, 1),
  (257, 2),
  (275, 2),
  (286, 1),
  (287, 2),
  (290, 4),
  (293, 1),
  (302, 2),
  (308, 1),
  (350, 1),
  (427, 2),
  (430, 1),
  (459, 1),
  (478, 1),
  (501, 1),
  (558, 1),
  (612, 1),
  (618, 2),
  (629, 2),
  (637, 1),
  (683, 1),
  (694, 1),
  (713, 1),
  (746, 1),
  (868, 1),
  (921, 2),
  (952, 1),
  (1123, 1),
  (1129, 1),
  (1152, 1),
  (1264, 1),
  (1275, 1),
  (1570, 1),
  (1640, 1),
  (1903, 1),
  (1994, 1),
  (2162, 1),
  (2253, 1),
  (2278, 1),
  (2311, 1),
  (2528, 1),
  (2529, 1),
  (2530, 1),
  (2531, 1),
  (2532, 5),
  (2533, 3)],
 [(18, 1),
  (29, 2),
  (30, 1),
  (41, 2),
  (67, 1),
  (83, 2),
  (85, 2),
  (96, 1),
  (100, 2),
  (104, 1),
  (108, 7),
  (119, 1),
  (120, 1),
  (131, 1),
  (176, 1),
  (184, 1),
  (192, 1),
  (267, 1),
  (326, 2),
  (335, 2),
  (462, 1),
  (468, 1),
  (490, 2),
  (595, 1),
  (603, 1),
  (614, 1),
  (615, 1),
  (644, 5),
  (660, 4),
  (797, 1),
  (799, 1),
  (805, 1),
  (841, 1),
  (8

  (763, 1),
  (816, 1),
  (849, 6),
  (853, 1),
  (858, 3),
  (868, 1),
  (1027, 1),
  (1029, 1),
  (1104, 1),
  (1530, 1),
  (1553, 1),
  (1604, 1),
  (1839, 2),
  (2028, 1),
  (2111, 1),
  (2163, 1),
  (2202, 1),
  (2326, 1),
  (2395, 1),
  (2422, 1),
  (2555, 1),
  (2594, 1),
  (2735, 1),
  (2736, 1),
  (2737, 1),
  (2738, 2),
  (2739, 1)],
 [(22, 1),
  (67, 1),
  (70, 1),
  (99, 1),
  (140, 1),
  (160, 1),
  (253, 1),
  (259, 1),
  (284, 1),
  (332, 1),
  (401, 1),
  (455, 1),
  (506, 1),
  (532, 1),
  (554, 1),
  (555, 1),
  (563, 1),
  (583, 1),
  (609, 2),
  (727, 1),
  (749, 1),
  (993, 1),
  (1271, 1),
  (1609, 1),
  (1622, 2),
  (1880, 1),
  (2418, 1),
  (2740, 1),
  (2741, 1),
  (2742, 1)],
 [(22, 1),
  (27, 1),
  (32, 1),
  (52, 1),
  (66, 2),
  (70, 1),
  (80, 1),
  (81, 1),
  (83, 1),
  (85, 1),
  (93, 1),
  (111, 1),
  (123, 1),
  (131, 1),
  (133, 1),
  (199, 1),
  (221, 1),
  (230, 1),
  (280, 1),
  (281, 1),
  (290, 1),
  (291, 1),
  (301, 1),
  (324, 1),
  (330, 2),


  (2931, 1),
  (2932, 1),
  (2933, 1),
  (2934, 3),
  (2935, 2),
  (2936, 1)],
 [(9, 1),
  (13, 2),
  (18, 1),
  (22, 1),
  (49, 2),
  (50, 1),
  (57, 1),
  (66, 1),
  (100, 1),
  (120, 2),
  (122, 1),
  (136, 1),
  (140, 1),
  (141, 1),
  (157, 1),
  (160, 1),
  (166, 1),
  (196, 2),
  (210, 1),
  (221, 2),
  (234, 9),
  (256, 1),
  (263, 1),
  (267, 1),
  (300, 1),
  (378, 1),
  (406, 1),
  (488, 1),
  (513, 1),
  (557, 1),
  (593, 2),
  (603, 1),
  (613, 2),
  (632, 1),
  (688, 1),
  (702, 1),
  (717, 1),
  (812, 2),
  (837, 1),
  (862, 1),
  (905, 3),
  (983, 1),
  (997, 2),
  (1025, 1),
  (1029, 1),
  (1218, 3),
  (1242, 3),
  (1364, 1),
  (1475, 2),
  (2208, 3),
  (2312, 1),
  (2482, 1),
  (2595, 1),
  (2748, 4),
  (2855, 2),
  (2890, 3),
  (2931, 1),
  (2937, 1),
  (2938, 1),
  (2939, 1),
  (2940, 2),
  (2941, 2),
  (2942, 2),
  (2943, 1),
  (2944, 1),
  (2945, 4),
  (2946, 1)],
 [(4, 1),
  (14, 2),
  (22, 1),
  (27, 1),
  (30, 2),
  (46, 1),
  (49, 1),
  (50, 1),
  (55, 1),
  (

  (3070, 3),
  (3118, 1),
  (3119, 1),
  (3120, 1),
  (3121, 1),
  (3122, 1),
  (3123, 1),
  (3124, 1),
  (3125, 1),
  (3126, 1),
  (3127, 1),
  (3128, 1)],
 [(4, 7),
  (42, 1),
  (62, 1),
  (99, 2),
  (160, 5),
  (180, 1),
  (208, 1),
  (221, 1),
  (335, 2),
  (360, 2),
  (473, 2),
  (503, 6),
  (612, 2),
  (623, 1),
  (666, 1),
  (802, 1),
  (991, 1),
  (1305, 1),
  (1749, 2),
  (2130, 1),
  (2317, 1),
  (2392, 1),
  (2500, 1),
  (3129, 1),
  (3130, 1),
  (3131, 1),
  (3132, 5)],
 [(20, 1),
  (29, 1),
  (42, 1),
  (43, 2),
  (50, 1),
  (59, 7),
  (64, 1),
  (70, 1),
  (74, 1),
  (95, 4),
  (111, 3),
  (123, 1),
  (131, 1),
  (133, 2),
  (140, 1),
  (141, 1),
  (150, 1),
  (153, 1),
  (157, 3),
  (158, 1),
  (172, 1),
  (182, 2),
  (184, 1),
  (186, 2),
  (198, 2),
  (199, 1),
  (213, 1),
  (215, 1),
  (243, 1),
  (249, 1),
  (252, 1),
  (265, 1),
  (267, 2),
  (270, 1),
  (283, 1),
  (300, 2),
  (315, 1),
  (325, 1),
  (350, 2),
  (388, 2),
  (389, 1),
  (406, 1),
  (412, 2),
  (420,

TODO: You can furthur filter and clean your data by using functions such as filter_extremes (remove all tokens that are less frequent or more frequent than a number), filter_n_most_frequent(filter out the ‘remove_n’ most frequent tokens), merge_with (to merge multiple dictionaries)

In [None]:
for doc in corpus_tfidf:
    print(doc)

## Topics

In [None]:
lsi_tfidf.print_topics()

In [None]:
lsi_bow.print_topics()

In [None]:
lda_model.print_topics()

## Plot

In [None]:
def plot_2d_space(corpus, method, use_tsne=False):

  if isinstance(method, models.ldamodel.LdaModel):
    documents_2d_1=[x[0][0][1] for x in method[corpus] if x]
    documents_2d_2=[x[0][1][1] for x in list(method[corpus]) if x]
  else:
    documents_2d_1=[x[0][1] for x in method[corpus] if x]
    documents_2d_2=[x[1][1] for x in list(method[corpus]) if x]


  fig, ax = plt.subplots(figsize=(10,10))

  # Get topic weights
  topic_weights = []
  for i, row_list in enumerate(method[corpus]):
    if row_list:
      if isinstance(method, models.ldamodel.LdaModel):
        topic_weights.append([w for i, w in row_list[0]])
      else:
        topic_weights.append([w for i, w in row_list])

  # Array of topic weights    
  arr = pd.DataFrame(topic_weights).fillna(0).values

  # Dominant topic number in each doc
  topic_num = np.argmax(arr, axis=1)

  if use_tsne:
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99)
    tsne = tsne_model.fit_transform(arr)
    documents_2d_1 = tsne[:,0]
    documents_2d_2 = tsne[:,1]

  ax.scatter(documents_2d_1, documents_2d_2, c=topic_num, s=80 ,alpha=0.8)
  for i in range(len(documents_2d_1)):
      ax.annotate(i, (documents_2d_1[i], documents_2d_2[i]))

In [None]:
plot_2d_space(BoW_corpus, lsi_bow)

In [None]:
plot_2d_space(corpus_tfidf, lsi_tfidf)

In [None]:
plot_2d_space(corpus_tfidf, lda_model)

In [None]:
plot_2d_space(corpus_tfidf, lda_model, use_tsne=True)

In [None]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus_tfidf, dictionary=lda_model.id2word, mds='mmds')

In [None]:
vis