In [1]:
import json
import re
import gensim
import spacy
import tqdm
import logging

# Visualize LDA
import pyLDAvis.gensim
import pickle 
import pyLDAvis

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import gensim.corpora as corpora
import multiprocessing as mp

from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from pprint import pprint

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""


In [2]:
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt='%H:%M:%S', level=logging.INFO)
cores = mp.cpu_count()

In [3]:
depth = 12
df = pd.read_json(f"../data/wikipedia_depth_{depth}.json").rename(columns={'Category': 'Path'})
df['Category'] = df['Path'].apply(lambda x: x[-1])

In [4]:
df.head()

Unnamed: 0,Title,Path,Links,Text,Category
0,Climate emergency declaration,[Global_warming],"[Aachen, Acri, Adam Bandt, Adelaide, Alexandri...",A climate emergency declaration or declaring a...,Global_warming
1,Global warming,[Global_warming],"[Abrupt climate change, Absorption (electromag...",Global warming is the mainly human-caused rise...,Global_warming
2,Shared Socioeconomic Pathways,[Global_warming],"[Abrupt climate change, Albedo, Anoxic event, ...",Shared Socioeconomic Pathways (SSPs) are scena...,Global_warming
3,Glossary of climate change,[Global_warming],"[100,000-year problem, 20th parallel north, 37...",This glossary of climate change is a list of d...,Global_warming
4,Index of climate change articles,[Global_warming],"[100% renewable energy, 100,000-year problem, ...",This is a list of climate change topics.\n\n0-...,Global_warming


# Data cleaning and preparation

In [5]:
flatten = lambda l: [item for sublist in l for item in sublist]

df_unique = df.groupby('Title').agg(list).reset_index()
df_unique['Links'] = df_unique['Links'].apply(flatten).apply(np.unique)
df_unique['Text'] = df_unique['Text'].apply(lambda x: list(set(x))[0])

In [6]:
df_unique.head()

Unnamed: 0,Title,Path,Links,Text,Category
0,"1,1,1,2-Tetrafluoroethane","[[Global_warming, Greenhouse gases]]","[(-)-5-(1,3-Dimethylbutyl)-5-ethylbarbituric a...","1,1,1,2-Tetrafluoroethane (also known as norfl...",[Greenhouse gases]
1,"1,1,1-Trichloro-2,2,2-trifluoroethane","[[Global_warming, Greenhouse gases]]","[1,1,1-Trichloro-2,2,2-trifluoroethane (data p...","Trichlorotrifluoroethane, also called 1,1,1-Tr...",[Greenhouse gases]
2,"1,1,1-Trichloroethane","[[Global_warming, Greenhouse gases]]","[(-)-5-(1,3-Dimethylbutyl)-5-ethylbarbituric a...","The organic compound 1,1,1-trichloroethane, al...",[Greenhouse gases]
3,"1,1,2-Trichloro-1,2,2-trifluoroethane","[[Global_warming, Greenhouse gases]]","[1,1,1-Trichloro-2,2,2-trifluoroethane, 1,1,1-...","1,1,2-Trichloro-1,2,2-trifluoroethane, also ca...",[Greenhouse gases]
4,"1,1-Dichloro-1-fluoroethane","[[Global_warming, Greenhouse gases]]","[1,1-Dichloro-1-fluoroethane (data page), Aque...","1,1-Dichloro-1-fluoroethane is a haloalkane wi...",[Greenhouse gases]


In [7]:
print("Number of unique articles:", len(df_unique))

Number of unique articles: 25593


In [8]:
subset = ['Methane', 'Extinction Rebellion', 'Fuel taxes', 'Hydraulic fracturing', 'Exxonmobil', 'Gazprom', 
          'Self-sustainability', 'Industrial ecology', 'Ecovillages', 'Eco-towns', 'Wildlife smuggling', 
          'Urban forestry', 'Biofuels', 'Sustainable gardening', 'Animal waste products', 'Oil platform disasters',
          'Coal phase-out', 'Climate change denial', 'Building energy rating', 'Active fire protection', 
          'Industrial minerals', 'Composting', 'Reforestation']
subset = [w.lower() for w in subset]

In [9]:
df_subset = df[df['Category'].map(lambda x: x.lower()).isin(subset)].reset_index().drop(columns=['index'])
df_subset.shape

(937, 5)

In [10]:
df_subset.head()

Unnamed: 0,Title,Path,Links,Text,Category
0,Climate change denial,"[Global_warming, Climate change, Climate chang...","[2010 Russian wildfires, 2015 United Nations C...","Climate change denial, or global warming denia...",Climate change denial
1,Khabibullo Abdussamatov,"[Global_warming, Climate change, Climate chang...","[Astrophysics, Bibcode, Cambridge University P...",Habibullo Ismailovich Abdussamatov (Russian: Х...,Climate change denial
2,Accuracy in Media,"[Global_warming, Climate change, Climate chang...","[2012 Benghazi attack, Advocacy journalism, Ag...",Accuracy in Media (AIM) is an American non-pro...,Climate change denial
3,Robert Aderholt,"[Global_warming, Climate change, Climate chang...","[105th United States Congress, 106th United St...","Robert Brown Aderholt (born July 22, 1965) is ...",Climate change denial
4,Jerry Agar,"[Global_warming, Climate change, Climate chang...","[CFRB, Chicago, Disc jockey, G. Gordon Liddy, ...",Jerry Agar is a conservative talk radio person...,Climate change denial


In [11]:
df_subset['Text'] = df_subset['Text'].apply(lambda x: re.sub('\n', '. ', x))

In [12]:
df_subset['Text'].head()

0    Climate change denial, or global warming denia...
1    Habibullo Ismailovich Abdussamatov (Russian: Х...
2    Accuracy in Media (AIM) is an American non-pro...
3    Robert Brown Aderholt (born July 22, 1965) is ...
4    Jerry Agar is a conservative talk radio person...
Name: Text, dtype: object

### Remove punctuation and lowercase

In [13]:
# Remove punctuation
df_subset['Text_processed'] = df_subset['Text'].map(lambda x: re.sub('[,\.!?]', '', x))
df_subset['Text_processed'] = df_subset['Text'].map(lambda x: x.lower())
df_subset['Text_processed'].head()

  df_subset['Text_processed'] = df_subset['Text'].map(lambda x: re.sub('[,\.!?]', '', x))


0    climate change denial, or global warming denia...
1    habibullo ismailovich abdussamatov (russian: х...
2    accuracy in media (aim) is an american non-pro...
3    robert brown aderholt (born july 22, 1965) is ...
4    jerry agar is a conservative talk radio person...
Name: Text_processed, dtype: object

### Tokenize words and further clean-up text

In [14]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

data = df_subset.Text_processed.values.tolist()
data_words = list(sent_to_words(data))
# print(data_words[:1])

### Phrase Modeling: Bi-grams and Tri-grams

In [15]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=50)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

INFO - 12:26:41: collecting all words and their counts
INFO - 12:26:41: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 12:26:43: collected 671735 word types from a corpus of 1609166 words (unigram + bigrams) and 937 sentences
INFO - 12:26:43: using 671735 counts as vocab in Phrases<0 vocab, min_count=5, threshold=50, max_vocab_size=40000000>
INFO - 12:26:43: collecting all words and their counts
INFO - 12:26:43: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 12:26:50: collected 696873 word types from a corpus of 1535548 words (unigram + bigrams) and 937 sentences
INFO - 12:26:50: using 696873 counts as vocab in Phrases<0 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
INFO - 12:26:50: source_vocab length 671735
INFO - 12:26:55: Phraser built with 3579 phrasegrams
INFO - 12:26:55: source_vocab length 696873
INFO - 12:27:00: Phraser built with 5708 phrasegrams


### Remove Stopwords, Make Bigrams and Lemmatize

In [16]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        if len(allowed_postags) < 1:
            texts_out.append([token.lemma_ for token in doc])
        else:
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [17]:
data_words_nostops = remove_stopwords(data_words)
data_words_trigrams = make_trigrams(data_words_nostops)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
#print(data_lemmatized[:1])

In [22]:
print(len(data_lemmatized[1]))

171


### Data transformation: Corpus and Dictionary

In [47]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
# print(corpus[:1])

# LDA

## Baseline

In [22]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=50, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       workers=cores-1,
                                       per_word_topics=True)

INFO - 00:42:33: using symmetric alpha at 0.02
INFO - 00:42:33: using symmetric eta at 0.02
INFO - 00:42:33: using serial LDA version on this node
INFO - 00:42:33: running online LDA training, 50 topics, 10 passes over the supplied corpus of 937 documents, updating every 1500 documents, evaluating every ~937 documents, iterating 50x with a convergence threshold of 0.001000
INFO - 00:42:33: training LDA model using 15 processes
INFO - 00:42:33: PROGRESS: pass 0, dispatched chunk #0 = documents up to #100/937, outstanding queue size 1
INFO - 00:42:33: PROGRESS: pass 0, dispatched chunk #1 = documents up to #200/937, outstanding queue size 2
INFO - 00:42:33: PROGRESS: pass 0, dispatched chunk #2 = documents up to #300/937, outstanding queue size 3
INFO - 00:42:34: PROGRESS: pass 0, dispatched chunk #3 = documents up to #400/937, outstanding queue size 4
INFO - 00:42:34: PROGRESS: pass 0, dispatched chunk #4 = documents up to #500/937, outstanding queue size 5
INFO - 00:42:34: PROGRESS: pa

INFO - 00:42:50: topic #47 (0.020): 0.009*"community" + 0.008*"member" + 0.007*"also" + 0.005*"include" + 0.005*"ecovillage" + 0.005*"first" + 0.004*"people" + 0.004*"may" + 0.003*"home" + 0.003*"would"
INFO - 00:42:50: topic #14 (0.020): 0.010*"building" + 0.009*"asbestos" + 0.009*"use" + 0.006*"include" + 0.006*"design" + 0.006*"also" + 0.005*"build" + 0.005*"sustainable" + 0.005*"system" + 0.004*"project"
INFO - 00:42:50: topic #26 (0.020): 0.007*"use" + 0.006*"tree" + 0.006*"also" + 0.006*"plant" + 0.005*"water" + 0.005*"compost" + 0.004*"limestone" + 0.004*"granite" + 0.004*"include" + 0.004*"project"
INFO - 00:42:50: topic #6 (0.020): 0.008*"use" + 0.006*"also" + 0.004*"project" + 0.004*"system" + 0.004*"community" + 0.004*"building" + 0.004*"say" + 0.004*"fluorite" + 0.004*"work" + 0.003*"vote"
INFO - 00:42:50: topic #38 (0.020): 0.010*"water" + 0.010*"use" + 0.009*"fire" + 0.006*"well" + 0.006*"gas" + 0.005*"system" + 0.005*"also" + 0.005*"include" + 0.005*"pressure" + 0.004*"s

INFO - 00:43:02: PROGRESS: pass 7, dispatched chunk #3 = documents up to #400/937, outstanding queue size 4
INFO - 00:43:02: PROGRESS: pass 7, dispatched chunk #4 = documents up to #500/937, outstanding queue size 5
INFO - 00:43:02: PROGRESS: pass 7, dispatched chunk #5 = documents up to #600/937, outstanding queue size 6
INFO - 00:43:02: PROGRESS: pass 7, dispatched chunk #6 = documents up to #700/937, outstanding queue size 7
INFO - 00:43:02: PROGRESS: pass 7, dispatched chunk #7 = documents up to #800/937, outstanding queue size 8
INFO - 00:43:02: PROGRESS: pass 7, dispatched chunk #8 = documents up to #900/937, outstanding queue size 9
INFO - 00:43:02: PROGRESS: pass 7, dispatched chunk #9 = documents up to #937/937, outstanding queue size 10
INFO - 00:43:06: topic #47 (0.020): 0.017*"community" + 0.010*"member" + 0.009*"ecovillage" + 0.007*"also" + 0.007*"people" + 0.006*"village" + 0.006*"include" + 0.006*"arrest" + 0.005*"action" + 0.005*"activist"
INFO - 00:43:06: topic #30 (0.

In [23]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

INFO - 00:43:15: topic #37 (0.020): 0.009*"sandstone" + 0.009*"farm" + 0.007*"mineral" + 0.007*"use" + 0.006*"also" + 0.006*"include" + 0.005*"grain" + 0.004*"quartz" + 0.004*"rock" + 0.004*"form"
INFO - 00:43:15: topic #14 (0.020): 0.015*"building" + 0.013*"asbestos" + 0.010*"design" + 0.009*"use" + 0.008*"sustainable" + 0.008*"project" + 0.008*"build" + 0.007*"include" + 0.007*"site" + 0.007*"system"
INFO - 00:43:15: topic #22 (0.020): 0.007*"hide" + 0.006*"also" + 0.005*"election" + 0.005*"include" + 0.005*"government" + 0.005*"vote" + 0.004*"receive" + 0.004*"work" + 0.003*"show" + 0.003*"year"
INFO - 00:43:15: topic #1 (0.020): 0.020*"fuel" + 0.018*"increase" + 0.012*"use" + 0.010*"ivory" + 0.008*"energy" + 0.008*"consumption" + 0.007*"effect" + 0.007*"tree" + 0.007*"trade" + 0.006*"efficiency"
INFO - 00:43:15: topic #29 (0.020): 0.020*"energy" + 0.013*"economic" + 0.007*"quality" + 0.007*"ecological_economic" + 0.006*"mcconnell" + 0.006*"form" + 0.005*"member" + 0.005*"economy" +

[(37,
  '0.009*"sandstone" + 0.009*"farm" + 0.007*"mineral" + 0.007*"use" + '
  '0.006*"also" + 0.006*"include" + 0.005*"grain" + 0.004*"quartz" + '
  '0.004*"rock" + 0.004*"form"'),
 (14,
  '0.015*"building" + 0.013*"asbestos" + 0.010*"design" + 0.009*"use" + '
  '0.008*"sustainable" + 0.008*"project" + 0.008*"build" + 0.007*"include" + '
  '0.007*"site" + 0.007*"system"'),
 (22,
  '0.007*"hide" + 0.006*"also" + 0.005*"election" + 0.005*"include" + '
  '0.005*"government" + 0.005*"vote" + 0.004*"receive" + 0.004*"work" + '
  '0.003*"show" + 0.003*"year"'),
 (1,
  '0.020*"fuel" + 0.018*"increase" + 0.012*"use" + 0.010*"ivory" + '
  '0.008*"energy" + 0.008*"consumption" + 0.007*"effect" + 0.007*"tree" + '
  '0.007*"trade" + 0.006*"efficiency"'),
 (29,
  '0.020*"energy" + 0.013*"economic" + 0.007*"quality" + '
  '0.007*"ecological_economic" + 0.006*"mcconnell" + 0.006*"form" + '
  '0.005*"member" + 0.005*"economy" + 0.005*"include" + 0.004*"policy"'),
 (7,
  '0.031*"company" + 0.029*"oil

In [24]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

INFO - 00:43:15: using ParallelWordOccurrenceAccumulator(processes=15, batch_size=64) to estimate probabilities from sliding windows
INFO - 00:43:15: 1 batches submitted to accumulate stats from 64 documents (58070 virtual)
INFO - 00:43:15: 2 batches submitted to accumulate stats from 128 documents (102630 virtual)
INFO - 00:43:15: 3 batches submitted to accumulate stats from 192 documents (143284 virtual)
INFO - 00:43:15: 4 batches submitted to accumulate stats from 256 documents (192077 virtual)
INFO - 00:43:15: 5 batches submitted to accumulate stats from 320 documents (242121 virtual)
INFO - 00:43:15: 6 batches submitted to accumulate stats from 384 documents (264546 virtual)
INFO - 00:43:15: 7 batches submitted to accumulate stats from 448 documents (292443 virtual)
INFO - 00:43:15: 8 batches submitted to accumulate stats from 512 documents (332320 virtual)
INFO - 00:43:15: 9 batches submitted to accumulate stats from 576 documents (357828 virtual)
INFO - 00:43:15: 10 batches subm


Coherence Score:  0.36833137672811433


## Hyperparameter tuning

In [None]:
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=dictionary,
                                       num_topics=k, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       alpha=a,
                                       eta=b,
                                       per_word_topics=True)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [None]:
grid = {}
grid['Validation_Set'] = {}

min_topics = 20
max_topics = 100

step_size = 10
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [corpus]
corpus_title = ['Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('data/lda_tuning_results.csv', index=False)
    pbar.close()

In [31]:
df_tuning = pd.read_csv('../data/lda_tuning_results.csv')

In [32]:
df_tuning.iloc[df_tuning['Coherence'].argmax()]

Validation_Set                Corpus
Topics                            90
Alpha             0.9099999999999999
Beta              0.9099999999999999
Coherence                   0.571275
Name: 228, dtype: object

In [None]:
test = df_tuning[df_tuning['Topics'] == 60]
test[test['Coherence'] == test['Coherence'].max()]

## Final model

In [48]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=90, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       alpha='symmetric',
                                       eta='symmetric',
                                       per_word_topics=True)

In [None]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

## Visualize LDA

In [22]:
pyLDAvis.enable_notebook()

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared

INFO - 11:58:00: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO - 11:58:00: NumExpr defaulting to 8 threads.


In [23]:
df_subset['LDA'] = doc_lda

In [24]:
df_subset.head()

Unnamed: 0,Title,Path,Links,Text,Category,Text_processed,LDA
0,Climate change denial,"[Global_warming, Climate change, Climate chang...","[2010 Russian wildfires, 2015 United Nations C...","Climate change denial, or global warming denia...",Climate change denial,"climate change denial, or global warming denia...","([(73, 0.977133)], [(0, [73]), (1, [73]), (2, ..."
1,Khabibullo Abdussamatov,"[Global_warming, Climate change, Climate chang...","[Astrophysics, Bibcode, Cambridge University P...",Habibullo Ismailovich Abdussamatov (Russian: Х...,Climate change denial,habibullo ismailovich abdussamatov (russian: х...,"([(71, 0.010203324), (73, 0.5817165)], [(7, [7..."
2,Accuracy in Media,"[Global_warming, Climate change, Climate chang...","[2012 Benghazi attack, Advocacy journalism, Ag...",Accuracy in Media (AIM) is an American non-pro...,Climate change denial,accuracy in media (aim) is an american non-pro...,"([(59, 0.29951504), (73, 0.5473947)], [(7, [73..."
3,Robert Aderholt,"[Global_warming, Climate change, Climate chang...","[105th United States Congress, 106th United St...","Robert Brown Aderholt (born July 22, 1965) is ...",Climate change denial,"robert brown aderholt (born july 22, 1965) is ...","([(59, 0.74965984), (71, 0.013749702), (73, 0...."
4,Jerry Agar,"[Global_warming, Climate change, Climate chang...","[CFRB, Chicago, Disc jockey, G. Gordon Liddy, ...",Jerry Agar is a conservative talk radio person...,Climate change denial,jerry agar is a conservative talk radio person...,"([(59, 0.1593177), (71, 0.02215766), (73, 0.21..."


In [25]:
df_subset['LDA'].iloc[0]

([(73, 0.977133)],
 [(0, [73]),
  (1, [73]),
  (2, [73]),
  (3, [73]),
  (4, [73]),
  (5, [73]),
  (6, [73]),
  (7, [73]),
  (8, [73]),
  (9, [73]),
  (10, [73]),
  (11, [73]),
  (12, [73]),
  (13, [73]),
  (14, [73]),
  (15, [73]),
  (16, [73]),
  (17, [73]),
  (18, [73]),
  (19, [73]),
  (20, [73]),
  (21, [73]),
  (22, [73]),
  (23, [73]),
  (24, [73]),
  (25, [73]),
  (26, [73]),
  (27, [73]),
  (28, [73]),
  (29, [73]),
  (30, [73]),
  (31, [73]),
  (32, [73]),
  (33, [73]),
  (34, [73]),
  (35, [73]),
  (36, [73]),
  (37, [73]),
  (38, [73]),
  (39, [73]),
  (40, [73]),
  (41, [73]),
  (42, [73]),
  (43, [73]),
  (44, [73]),
  (45, [73]),
  (46, [73]),
  (47, [73]),
  (48, [73]),
  (49, [73]),
  (50, [73]),
  (51, [73]),
  (52, [73]),
  (53, [73]),
  (54, [73]),
  (55, [73]),
  (56, [73]),
  (57, [73]),
  (58, [73]),
  (59, [73]),
  (60, [73]),
  (61, [73]),
  (62, [73]),
  (63, [73]),
  (64, [73]),
  (65, [73]),
  (66, [73]),
  (67, [73]),
  (68, [73]),
  (69, [73]),
  (70, [73]

In [28]:
sims = gensim.similarities.MatrixSimilarity(doc_lda)



ValueError: not enough values to unpack (expected 2, got 1)

In [27]:
df_subset['LDA'].apply(lambda x: gensim.matutils.cossim(df_subset['LDA'].iloc[0], x))

ValueError: dictionary update sequence element #0 has length 1; 2 is required