# Let's Plays comments topic extraction

In [1]:
import os
from collections import Counter, defaultdict
from pprint import pprint
from string import punctuation

import nltk
import numpy as np
import pandas as pd
from gensim import corpora, models
from nltk import word_tokenize
from nltk.corpus import stopwords
from scipy import spatial

## Loading data

In [2]:
%%time

df = pd.read_csv('data/cleaned_translated_comments2.csv')
df.fillna('', inplace=True)

CPU times: user 1.4 s, sys: 108 ms, total: 1.51 s
Wall time: 1.52 s


In [3]:
df.shape

(307450, 6)

In [4]:
df.head()

Unnamed: 0,channelId,community,text,language,translation,cleaned
0,UC5tp0GG3-cvlnflj5NuA9xQ,9,"i think it's the tree stump, could be a a slee...",en,"i think it's the tree stump, could be a a slee...",tree stump sleeping bird aswell
1,UC5tp0GG3-cvlnflj5NuA9xQ,9,Lothrazar. use the dream rod on the tree stump...,en,Lothrazar. use the dream rod on the tree stump...,lothrazar dream rod tree stump island deer tol...
2,UC5tp0GG3-cvlnflj5NuA9xQ,9,whoops sorry posted that half way through the ...,en,whoops sorry posted that half way through the ...,whoops sorry posted half figure
3,UC5tp0GG3-cvlnflj5NuA9xQ,9,lol kinda pointless to say so now but by the l...,en,lol kinda pointless to say so now but by the l...,kinda pointless glowing bridge turned wait sto...
4,UC5tp0GG3-cvlnflj5NuA9xQ,9,"Thanks, and it wasn't a spoiler, I usualy have...",en,"Thanks, and it wasn't a spoiler, I usualy have...",spoiler usualy recorded ahead uploaded


In [5]:
df2 = df[df.cleaned != ' ']

In [6]:
df2.shape

(295950, 6)

## Topics

In [7]:
%%time

texts_sr = df2.groupby('channelId').cleaned.sum()

CPU times: user 14.3 s, sys: 24 ms, total: 14.3 s
Wall time: 14.3 s


In [8]:
len(texts_sr)

182

In [9]:
for i, text in texts_sr.items():
    print(i, len(text.split()))

UC-Oq5kIPcYSzAwlbl9LH4tQ 638
UC-WA0qTCwuMCd418q_6xbEg 688
UC-uyvnv84IIzV3G_O4fPYgA 10219
UC08Gq2qWDvwa1uKZTVP4lAg 59368
UC0DZmkupLYwc0yDsfocLh0A 173
UC0DqUlcI5uKnkMBqqs9m5Ng 332
UC0ySlisUWtJMU6zHJSUzuVA 2619
UC1JGziY4knJ-w7BxyePqCfw 109
UC1OGV4dczjRz8wNLRaezJ5Q 226
UC1bmqfyYrLIpKEsqo2IsZOA 3172
UC1ieoHqKW-yYgDhLHIcx28w 86296
UC1sBOLUR6x_sTfaN-cvv4Og 405
UC22TOQWJue006Lp6DB5QhDA 12494
UC27iGXU0N4Nf04CgKGSGpXg 54
UC2A7bwBPDDuyI8jTVcAZqDw 135
UC2P4JJIaTh9LlV_7YCeWH_w 267
UC3LAmEliCwrdNXQIqA_NUWg 44538
UC3RduJK_c6vKtlh2o4cggSA 114
UC4DPdCyCaJ941cxbIPZfxfg 882
UC4GyPwwVPopV0qg1D_wOF_g 1835
UC4LHNX8d8RqnDX0OezgmCTg 190
UC4f1zAG2BTkfOQV4_nFbpBQ 268
UC4gVuK5z6pkivHozyLn016Q 1332
UC5tp0GG3-cvlnflj5NuA9xQ 5520
UC6C1dyHHOMVIBAze8dWfqCw 839
UC6HDPI4EsMzpUFFQTKAI_0A 837
UC6iKvdRCZjEq2LfYluavKsQ 250
UC71kKpUN9OQMl_HivC19A_g 21794
UC7H25PSfrzUf3bqHUReKYRA 901
UC7_YxT-KID8kRbqZo7MyscQ 25530
UC7eWS_9eKJDI_neh8GMVHpw 413
UC9BrZKhbxpOWMx9JuzWnYXA 287
UC9ql--q_LV7zhDI1CCO9Xdg 1511
UCAUgvzNgPUesKgDtA066zSQ

In [10]:
documents = list(texts_sr)

In [11]:
len(documents)

182

In [12]:
texts = [[word for word in document.split()] for document in documents]

In [13]:
len(texts)

182

In [14]:
def load_ini_dictionary(texts):
    filename = 'data/dictionaries/dictionary.dict'
    if os.path.exists(filename):
        dictionary = corpora.Dictionary.load(filename)
    else:
        dictionary = corpora.Dictionary(texts)
        dictionary.save(filename)
    return dictionary

In [15]:
def load_param_dictionary(dictionary, MIN_DFs, MAX_DFs, MAX_FTs):
    filename = 'data/dictionaries/dictionary_{}_{}_{}.dict'.format(MIN_DFs, MAX_DFs, MAX_FTs)
    if os.path.exists(filename):
        dictionary = corpora.Dictionary.load(filename)
    else:
        dictionary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF, keep_n=MAX_FT, keep_tokens=['picasso'])
        dictionary.save(filename)
    return dictionary

In [16]:
dictionary = load_ini_dictionary(texts)

In [17]:
len(dictionary)

116683

In [18]:
%%time

# lsi_topics_lists = []
lda_topics_lists = []
hdp_topics_lists = []

MIN_DFs = [2,3,4]
MAX_DFs = [0.7,0.8, 0.9]
MAX_FTs = [100, 1000, 10000, len(dictionary)]
N_TOPICSs = [5, len(texts), 10, 50]
N_WORDS = 15

total_iterations = len(MIN_DFs) * len(MAX_DFs) * len(MAX_FTs) * (len(N_TOPICSs) + 1)
iteration = 1
for MIN_DF in MIN_DFs:
    for MAX_DF in MAX_DFs: 
        for MAX_FT in MAX_FTs:
            dictionary = load_ini_dictionary(texts)
            dictionary = load_param_dictionary(dictionary, MIN_DF, MAX_DF, MAX_FT)
            corpus = [dictionary.doc2bow(text) for text in texts]
            tfidf = models.TfidfModel(corpus)
            corpus_tfidf = tfidf[corpus]
            
            # HDP
            hdp = models.HdpModel(corpus, id2word=dictionary, random_state=0)
            corpus_hdp = hdp[corpus]
            ts = hdp.show_topics(-1, num_words=N_WORDS, formatted=False)
            result = (ts, (MIN_DF, MAX_DF, MAX_FT, -1), corpus_hdp)
            hdp_topics_lists.append(result)
                        
            topics_set = set(topic for doc in corpus_hdp for topic, weight in doc)
            hdp_n_topics = len(topics_set)
            
            n_topics_aux = set(N_TOPICSs).union([hdp_n_topics])
            
            for N_TOPICS in n_topics_aux:
                print('Iteration: {} / ~{}'.format(iteration, total_iterations), \
                      '- MIN_DF:', MIN_DF, '- MAX_DF:', MAX_DF, '- MAX_FT:', MAX_FT, '- N_TOPICS:', N_TOPICS)
                iteration += 1
                
                # LSI
#                 lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
#                 corpus_lsi = lsi[corpus_tfidf]
#                 ts = lsi.show_topics(lsi.num_topics, formatted=False)
#                 result = (ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_lsi)
#                 lsi_topics_lists.append(result)
            
                # LDA
                lda = models.LdaModel(corpus, id2word=dictionary, num_topics=N_TOPICS, random_state=0)
                corpus_lda = lda[corpus]
                ts = sorted(lda.show_topics(lda.num_topics, formatted=False))
                result = (ts, (MIN_DF, MAX_DF, MAX_FT, N_TOPICS), corpus_lda)
                lda_topics_lists.append(result)

Iteration: 1 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 10
Iteration: 2 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 50
Iteration: 3 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 5
Iteration: 4 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 182
Iteration: 5 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 100 - N_TOPICS: 143
Iteration: 6 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 10
Iteration: 7 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 50
Iteration: 8 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 5
Iteration: 9 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 182
Iteration: 10 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 1000 - N_TOPICS: 98
Iteration: 11 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 10000 - N_TOPICS: 78
Iteration: 12 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 10000 - N_TOPICS: 10
Iteration: 13 / ~180 - MIN_DF: 2 - MAX_DF: 0.7 - MAX_FT: 10000 - N_TOPICS: 50
Iteration

Iteration: 107 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX_FT: 1000 - N_TOPICS: 83
Iteration: 108 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX_FT: 1000 - N_TOPICS: 50
Iteration: 109 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX_FT: 1000 - N_TOPICS: 5
Iteration: 110 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX_FT: 1000 - N_TOPICS: 182
Iteration: 111 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX_FT: 10000 - N_TOPICS: 10
Iteration: 112 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX_FT: 10000 - N_TOPICS: 69
Iteration: 113 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX_FT: 10000 - N_TOPICS: 50
Iteration: 114 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX_FT: 10000 - N_TOPICS: 5
Iteration: 115 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX_FT: 10000 - N_TOPICS: 182
Iteration: 116 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX_FT: 116683 - N_TOPICS: 81
Iteration: 117 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX_FT: 116683 - N_TOPICS: 10
Iteration: 118 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX_FT: 116683 - N_TOPICS: 50
Iteration: 119 / ~180 - MIN_DF: 3 - MAX_DF: 0.9 - MAX

In [19]:
def extract_vocabulary(ts):
    vocab = set()
    for t in ts:
        words = [word for word, weight in t]
        vocab = vocab.union(words)
    vocab = sorted(vocab)
    r = {w: i for i, w in enumerate(vocab)}
    return r

def get_weights(t):
    return dict(t)

def create_vector(vocab, weights):
    r = [0] * len(vocab)
    for wo in weights:
        r[vocab[wo]] = weights[wo]
    return r

def vectorize(ts):
    r = []
    ts = [t for i, j, t in ts]
    vocab = extract_vocabulary(ts)
    for t in ts:
        weights = get_weights(t)
        vector = create_vector(vocab, weights)
        r.append(vector)
    return r

In [20]:
def two_topics_distance(t1, t2):
    r = spatial.distance.cosine(t1, t2)
    return r

def multiple_topics_distance(ts):
    ds = []
    for i, t1 in enumerate(ts[:-1]):
        for j in range(i + 1, len(ts)):
            t2 = ts[j]
            d = two_topics_distance(t1, t2)
            ds.append(d)
    return np.mean(ds)

def transform(ts_list): # formerly top_n_used_topics
    total_weights = defaultdict(float)
    corpus = ts_list[2]
    for doc in corpus:
        for topic, weight in doc:
            total_weights[topic] += abs(weight)
    total_weights = dict(total_weights)
    counter = Counter(total_weights)
    topics_dct = dict(ts_list[0])
    r = ([(weight, topic_id, topics_dct[topic_id])
             for topic_id, weight in counter.most_common()], ts_list[1], ts_list[2])
    return r

def best_topics(ts_lists):
    r = []
    for ts_list in ts_lists:
        top_n_ts_list = transform(ts_list)
        ts_vector = vectorize(top_n_ts_list[0])
        d = multiple_topics_distance(ts_vector)
        r.append((d, top_n_ts_list))
    r = sorted(r, reverse=True)
    return r[0]

In [21]:
%%time

# print('LSI')
# lsi_best_topics_list = best_topics(lsi_topics_lists)
print('LDA')
lda_best_topics_list = best_topics(lda_topics_lists)
print('HDP')
hdp_best_topics_list = best_topics(hdp_topics_lists)

LDA
HDP
CPU times: user 28min 53s, sys: 23min 37s, total: 52min 31s
Wall time: 7min 36s


## Results

### LDA

In [22]:
for i, doc in enumerate(lda_best_topics_list[1][2]):
    print(texts_sr.index[i], doc)

UC-Oq5kIPcYSzAwlbl9LH4tQ [(92, 0.5957218), (128, 0.39438808)]
UC-WA0qTCwuMCd418q_6xbEg [(145, 0.40724078), (155, 0.5811238)]
UC-uyvnv84IIzV3G_O4fPYgA [(24, 0.031188242), (26, 0.038827408), (53, 0.04861756), (90, 0.010706735), (95, 0.011016759), (115, 0.014485334), (117, 0.19630887), (122, 0.123360366), (143, 0.031830348), (155, 0.40334654), (169, 0.018070249), (172, 0.020772506), (179, 0.022933323)]
UC08Gq2qWDvwa1uKZTVP4lAg [(10, 0.020433195), (14, 0.02473854), (17, 0.11020548), (19, 0.2680014), (66, 0.012412146), (73, 0.011022439), (80, 0.17759606), (82, 0.028781598), (126, 0.23051715), (143, 0.015447361), (146, 0.023593467), (166, 0.012365526)]
UC0DZmkupLYwc0yDsfocLh0A [(36, 0.18228184), (106, 0.7858146)]
UC0DqUlcI5uKnkMBqqs9m5Ng [(126, 0.982241)]
UC0ySlisUWtJMU6zHJSUzuVA [(34, 0.02654759), (40, 0.025546454), (59, 0.034915056), (60, 0.042367257), (62, 0.036420178), (90, 0.1654574), (95, 0.07842665), (97, 0.049480945), (99, 0.08610085), (106, 0.0427337), (114, 0.018886968), (125, 0.04

UCIaIBnLtbXIVD4w8_n2R9xA [(95, 0.25263247), (154, 0.09899464), (167, 0.6311182)]
UCItYlIgFQJpZ1pUG0dVwCIw [(51, 0.98599285)]
UCIzTwHf2bRCY42aYG3S9jVQ [(95, 0.9825525)]
UCJ2ZDzMRgSrxmwphstrm8Ww [(34, 0.7857157), (44, 0.016463263), (47, 0.01076211), (70, 0.02332453), (109, 0.03579177), (118, 0.011505791)]
UCJah7iJ1jLIhqCmgDkTHS_A [(149, 0.9289639)]
UCK3kaNXbB57CLcyhtccV_yw [(8, 0.027813958), (25, 0.039747167), (59, 0.011891417), (62, 0.3320447), (79, 0.2785878), (92, 0.010934214), (95, 0.017303172), (106, 0.13382812), (108, 0.010280219), (154, 0.010329476), (160, 0.026342075), (172, 0.08968725)]
UCKlhpmbHGxBE6uw9B_uLeqQ [(2, 0.012791575), (27, 0.058072772), (34, 0.08908822), (38, 0.015233158), (47, 0.02598875), (49, 0.07028875), (56, 0.05982563), (59, 0.21109802), (60, 0.055246074), (62, 0.027121088), (70, 0.023313353), (71, 0.010567971), (79, 0.038377214), (91, 0.010362103), (102, 0.05517106), (109, 0.08846488), (116, 0.018386712), (123, 0.011454483)]
UCKrFXqpQj3gM98LF22Yq8Kg [(56, 0.06

UCfWhXCFkZ0x18xRfsbYPtog [(41, 0.9526421)]
UCfZFwPGOpbWk0z0Fk7tU6yA [(47, 0.017862406), (78, 0.010189893), (129, 0.03284772), (151, 0.8592249), (166, 0.038567465), (173, 0.030127544)]
UCfgZQq_S_PP1CFZgyK40viQ [(27, 0.079772055), (75, 0.33829457), (102, 0.2123757), (141, 0.024814261), (169, 0.14166602), (174, 0.18666203)]
UCfznY5SlSoZoXN0-kBPtCdg [(11, 0.55450976), (37, 0.041028123), (124, 0.16795798), (126, 0.13544592), (128, 0.069077365), (131, 0.02748148)]
UCg-Vit0kkfLl_46UcQS5N2g [(24, 0.92896384)]
UCgSFaz1ARqNiA20jVKRfCgg [(24, 0.016711846), (25, 0.15676971), (59, 0.023962453), (106, 0.013442464), (117, 0.19967766), (172, 0.45957282), (179, 0.1263267)]
UCgSHGbs2oGoLItc-8y5hJ9g [(1, 0.06918771), (20, 0.063102625), (27, 0.024712708), (72, 0.5585956), (98, 0.046606883), (118, 0.08828287), (130, 0.04676359), (141, 0.08922847)]
UCgnvOjPGMjBWUVWq9M5P5eA [(30, 0.57815564), (155, 0.39148074), (181, 0.017250132)]
UCgs2lfXSk6y55R0dJVKxmxA [(63, 0.9723748)]
UChAwghFwLpEhld1DeeRA-vA [(179, 0.9

In [23]:
lda_best_topics_list[1][1]

(4, 0.7, 100, 182)

In [24]:
lda_best_topics_list[1][0]

[(8.310153504833579,
  155,
  [('always', 0.028499505),
   ('world', 0.024148675),
   ('every', 0.019753363),
   ('dude', 0.018133733),
   ('anyone', 0.017570974),
   ('times', 0.016811581),
   ('bro', 0.016574765),
   ('long', 0.016462369),
   ('since', 0.016262962),
   ('someone', 0.016073354)]),
 (7.262525064870715,
  102,
  [('ago', 0.037285734),
   ('years', 0.030609077),
   ('series', 0.02663901),
   ('anyone', 0.021831349),
   ('voice', 0.021293689),
   ('every', 0.020553539),
   ('ps', 0.020055866),
   ('find', 0.01834371),
   ('guy', 0.017486228),
   ('remember', 0.01730614)]),
 (5.201197263784707,
  126,
  [('series', 0.060181938),
   ('guy', 0.028669614),
   ('ps', 0.022914188),
   ('someone', 0.02270289),
   ('last', 0.020215843),
   ('fucking', 0.019915465),
   ('little', 0.01943092),
   ('black', 0.01935934),
   ('kill', 0.019127533),
   ('without', 0.01772512)]),
 (5.123905337415636,
  24,
  [('ps', 0.033788748),
   ('always', 0.024286395),
   ('guys', 0.023067808),
   (

### HDP

In [25]:
for i, doc in enumerate(hdp_best_topics_list[1][2]):
    print(texts_sr.index[i], doc)
#     print(doc)

UC-Oq5kIPcYSzAwlbl9LH4tQ [(5, 0.9159547750121582), (39, 0.0808541904440573)]
UC-WA0qTCwuMCd418q_6xbEg [(3, 0.05714339005495345), (4, 0.23144399565057616), (5, 0.07073161743985955), (6, 0.15976494088740148), (8, 0.15985308827147307), (62, 0.3186428668865656)]
UC-uyvnv84IIzV3G_O4fPYgA [(0, 0.13299586103697766), (1, 0.3556859688142117), (3, 0.4680789386539054), (8, 0.013979742540079717), (42, 0.01973621905487818)]
UC08Gq2qWDvwa1uKZTVP4lAg [(0, 0.05667048446727923), (2, 0.8007856361424561), (3, 0.03891796357923592), (5, 0.03708970023962515), (9, 0.021967348505399147), (12, 0.018986399746061096), (23, 0.018065346974975968)]
UC0DZmkupLYwc0yDsfocLh0A [(2, 0.8452841556673296), (8, 0.13840199431552608)]
UC0DqUlcI5uKnkMBqqs9m5Ng [(0, 0.20391397507293874), (1, 0.5401956926326967), (111, 0.2515543881517591)]
UC0ySlisUWtJMU6zHJSUzuVA [(0, 0.16164480741802184), (1, 0.6978356956493537), (8, 0.11373935444525529), (30, 0.02621463027554831)]
UC1JGziY4knJ-w7BxyePqCfw [(6, 0.9798052588873555)]
UC1OGV4dczj

UCLCmJiSbIoa_ZFiBOBDf6ZA [(1, 0.22494632218790023), (2, 0.11294334124885883), (6, 0.5918121671155304), (7, 0.05948525501538704)]
UCMR4c29nBjseYCz2IxrIC4Q [(2, 0.7104362878362286), (7, 0.2880857283454767)]
UCMWHhRY7Iuxwm1hF1Fv-btw [(0, 0.04384106277978634), (1, 0.3659523667753076), (2, 0.06359653262875513), (7, 0.525668359329434)]
UCMdbHJWpwR9FMsfcQDnCcSA [(0, 0.09020682984423581), (1, 0.06120364931332294), (3, 0.16499605811998744), (6, 0.025513013317679068), (7, 0.15352459883636227), (9, 0.4886390992891642)]
UCN-v-Xn9S7oYk0X2v1jx1Qg [(2, 0.04256185378309892), (3, 0.5547414088940061), (17, 0.4015742783391738)]
UCNOlCuq2WFe1juN39kwnYGw [(0, 0.05285150517983061), (1, 0.5162861071120612), (2, 0.020643473585672763), (6, 0.08309284658298556), (8, 0.31613464884443004)]
UCNuXTAYi1m4x_EszsSw21Nw [(0, 0.6115801851404279), (5, 0.061714449303663786), (6, 0.11970882946900414), (7, 0.03295208104744297), (12, 0.14206155875315493), (59, 0.0315446278086788)]
UCOgaIuQYGr6ow_jbote4BKA [(2, 0.383171928509

UCit6YyWQKcvvJMZJiNA8giw [(0, 0.0983137643316698), (1, 0.10355428194074809), (2, 0.13572209424437948), (7, 0.03955998127482012), (8, 0.010972511317474449), (16, 0.6054886535227941)]
UCj5i58mCkAREDqFWlhaQbOw [(1, 0.23898605313281068), (2, 0.36245745680320224), (3, 0.12002734323335915), (4, 0.024814470465764765), (8, 0.2356348545854764), (9, 0.017829840683465478)]
UCj6XZFBQIua4K1X1YsHkYTg [(6, 0.7789736113250647), (27, 0.20107103648839159)]
UCk4mCvmZcXB4DqlcVpI2hpg [(0, 0.9089046282957376), (5, 0.02883285797551134), (11, 0.023439355520054037), (73, 0.021274905649447246)]
UCke6I9N4KfC968-yRcd5YRg [(1, 0.48709578783038704), (2, 0.0821871825083791), (3, 0.4305780375489706)]
UClJlcmhUkErNMYGAbzoGd8A [(4, 0.9983534769319045)]
UCm1F9GekuQVNf93XU7-i9kw [(2, 0.23674756429458732), (6, 0.5740299199817807), (8, 0.18710265827578462)]
UCmSwqv2aPbuOGiuii2TeaLQ [(2, 0.015511327849414716), (3, 0.5134207814291294), (4, 0.0608199781736121), (5, 0.40123322431147357)]
UCmp5y07YIV6i2jPX4x82hVQ [(0, 0.0168202

In [26]:
hdp_best_topics_list[1][1]

(4, 0.7, 1000, -1)

In [27]:
# (4, 0.75, 1000, -1)

In [28]:
len(hdp_best_topics_list[1][0])

98

In [29]:
hdp_best_topics_list[1][0]

[(30.433812037279814,
  1,
  [('jack', 0.01726522581452443),
   ('guys', 0.006055236248557141),
   ('anyone', 0.005840948708681616),
   ('series', 0.005756059957338076),
   ('else', 0.005684474045451678),
   ('sky', 0.005520795623085178),
   ('funny', 0.005008608792183373),
   ('guy', 0.004840498066963087),
   ('day', 0.004495885579409968),
   ('every', 0.0044348279102708905),
   ('always', 0.0042942078627986975),
   ('dude', 0.0041799142027722875),
   ('life', 0.004115819477196298),
   ('voice', 0.003990045934080139),
   ('someone', 0.003810169087559524)]),
 (22.48618943855038,
  0,
  [('always', 0.012904607647303662),
   ('let', 0.00938158263914936),
   ('episode', 0.007849074755006161),
   ('find', 0.006683626686459636),
   ('fun', 0.005916524501436917),
   ('ps', 0.0056418352152068875),
   ('little', 0.005560832591591013),
   ('cod', 0.005248172700101099),
   ('everything', 0.005205897782068366),
   ('part', 0.005135178754285774),
   ('voice', 0.005043744216377767),
   ('since', 0.

### Channel grouping

In [30]:
# ch=df2.groupby('community').channelId

In [31]:
# channel_bycomm=[]
# total=0

# for i, chan in ch:
#     channels=set()
#     channels=channels.union(chan)
#     total+=len(channels)
#     x=(i,channels)
#     channel_bycomm.append(x)
# total
# #     print(len(channels.union(chan)))

In [32]:
# topics_grouped=[]

# for comm in channel_bycomm:
#     all_topics=[]
#     print(comm[0])
#     print(len(comm[1]))
#     for i, doc in enumerate(hdp_best_topics_list[1][2]):
#         if texts_sr.index[i] in comm[1]:
#             all_topics=all_topics+doc
#     print( sorted(all_topics, key=lambda tup: tup[0]))

# #             comm_temp=comm+(doc,)
# #             topics_grouped.append(comm_temp)
# #             print(topics_grouped)

In [33]:
# # record comments_by_language
# lan=df.groupby('community')
# for i, lang in lan:
#     lan_list=[]
#     print(str(i))
#     for l in lang.language:
#         lan_list.append(l)
#     counter=Counter(lan_list)
#     print(counter.most_common())
#     print(len(lan_list))
#     print('')

In [34]:
# 9
# [('en', 49856), ('de', 276), ('it', 97), ('es', 76), ('fr', 52), ('pt', 46), ('ru', 27), ('tr', 16), ('nl', 8)]
# 50454

# 22
# [('en', 50709), ('pt', 7963), ('it', 1222), ('de', 569), ('es', 368), ('fr', 354), ('ru', 257), ('nl', 76), ('tr', 36)]
# 61554

# 37
# [('en', 34778), ('fr', 185), ('de', 109), ('es', 85), ('pt', 71), ('it', 50), ('ru', 33), ('tr', 2)]
# 35313

# 71
# [('en', 37471), ('es', 5648), ('de', 2398), ('fr', 659), ('pt', 572), ('it', 124), ('ru', 111), ('tr', 23), ('nl', 11)]
# 47017

# 76
# [('es', 11155), ('en', 8493), ('pt', 131), ('it', 124), ('fr', 55), ('ru', 28), ('de', 20), ('tr', 16), ('nl', 1)]
# 20023

# 85
# [('de', 35865), ('es', 9491), ('en', 8818), ('pt', 2157), ('fr', 1049), ('it', 835), ('ru', 407), ('nl', 66), ('tr', 26)]
# 58714

# 87
# [('ru', 34050), ('en', 285), ('es', 22), ('de', 7), ('pt', 4), ('fr', 4), ('it', 3)]
# 34375