In [1]:
# --- gensim: tokenisation, stop words, vocabulary and topic models ---
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.matutils import kullback_leibler, jaccard, hellinger, sparse2full

# --- nltk: lemmatiser and stemmer used by the text preprocessing helpers ---
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# Module-level English Snowball stemmer, shared by `lemmatize_stemming`.
stemmer = SnowballStemmer("english")
# NOTE(review): the next two imports repeat the ones above and are redundant.
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *

import numpy as np
# Seed NumPy's global RNG; `np.random.permutation` is used later when sampling papers per group.
np.random.seed(2018)
import nltk
# Fetch the WordNet corpus used by WordNetLemmatizer (no-op when already present).
nltk.download('wordnet')

import pandas as pd
import numpy as np
from tqdm import tqdm

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/maria.selezniova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Generate bag-of-words representations for groups

In [2]:
# Build {first column -> second column} as integers from the mapping CSV.
# Every line is parsed, including the first one.
# NOTE(review): unlike publications.csv no header row is skipped here — confirm the file has none.
with open("../csv/user_id_mapping.csv") as mapping_file:
    user_id_mapping = {
        int(fields[0]): int(fields[1])
        for fields in (row.split(',') for row in mapping_file)
    }


# Collect the text (second CSV column) of every publication whose first-column id
# is a key of `user_id_mapping`. `ids` holds the running position of each kept
# document, i.e. 0, 1, 2, ... in file order.
# NOTE(review): a plain comma split keeps only the text up to the first comma of the
# second field — confirm the file has no commas inside that column.
docs, ids = [], []
with open('../csv/publications.csv') as publications:
    publications.readline()  # skip the header row
    for row in tqdm(publications):
        fields = row.split(',')
        if int(fields[0]) not in user_id_mapping:
            continue
        ids.append(len(docs))
        docs.append(fields[1])

744515it [00:02, 366223.63it/s]


In [3]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma.

    Uses the module-level ``stemmer`` for the stemming step and returns the
    resulting string.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Tokenise ``text`` and return its normalised tokens as a list.

    Tokens come from ``gensim.utils.simple_preprocess``; gensim stop words and
    tokens of three characters or fewer are dropped, and every surviving token
    is passed through ``lemmatize_stemming``.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stop_words
    ]

In [4]:
%%time
# Normalise every publication text into a list of stemmed tokens.
processed_docs = [preprocess(doc) for doc in docs]

CPU times: user 48.2 s, sys: 791 ms, total: 49 s
Wall time: 56.5 s


In [5]:
# Cache the tokenised documents on disk: one "<position>;<tok1>,<tok2>,..." line per
# document, where <position> is the document's index in `processed_docs`.
with open('paperid_text.csv', 'w') as fout:
    fout.write('paper_id;processed_docs\n')
    for paper_idx, tokens in tqdm(enumerate(processed_docs)):
        fout.write('{};{}\n'.format(paper_idx, ','.join(tokens)))

231002it [00:00, 601371.58it/s]


In [6]:
# Reload the tokenised documents cached in paperid_text.csv.
# Each data line is "<paper_id>;<tok1>,<tok2>,..." and ids must be consecutive
# starting at 0; on the first mismatch a message is printed and loading stops.
processed_docs = []
with open('paperid_text.csv') as fin:
    fin.readline()  # skip the 'paper_id;processed_docs' header
    id_ = 0
    for line in tqdm(fin):
        # rstrip('\n') rather than [:-1]: a final line without a trailing newline
        # must not lose its last character.
        parts = line.rstrip('\n').split(';')
        id_in_file = int(parts[0])
        if id_in_file != id_:
            print('wrong id', id_, id_in_file)
            break
        # ''.split(',') would yield [''] and inject an empty-string token into the
        # vocabulary; a document with no tokens must come back as an empty list.
        processed_docs.append(parts[1].split(',') if parts[1] else [])
        id_ += 1

231002it [00:00, 306978.70it/s]


In [77]:
%%time
# Build the token <-> integer-id vocabulary from all tokenised documents.
dictionary = gensim.corpora.Dictionary(processed_docs)

CPU times: user 3.55 s, sys: 40.1 ms, total: 3.59 s
Wall time: 4.21 s


In [78]:
# Prune the vocabulary in place (per the gensim API): drop tokens that occur in fewer
# than 1000 documents or in more than 50% of them, then keep at most 100000 tokens.
dictionary.filter_extremes(no_below=1000, no_above=0.5, keep_n=100000)

In [79]:
%%time
# Convert every tokenised document to a sparse list of (token_id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

CPU times: user 2.52 s, sys: 94.2 ms, total: 2.62 s
Wall time: 3.09 s


In [80]:
# Fit TF-IDF weights on the bag-of-words corpus ...
tfidf = models.TfidfModel(bow_corpus)
# ... and apply the fitted transformation to that same corpus.
corpus_tfidf = tfidf[bow_corpus]

In [81]:
%%time
# Train a 10-topic LDA model on the TF-IDF weighted corpus (2 passes over the data).
# NOTE(review): no `id2word` is passed, so the model reports terms as integer ids;
# the topic-printing cell maps them back to tokens through `dictionary`.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, passes=2)

CPU times: user 1min 27s, sys: 16.9 s, total: 1min 44s
Wall time: 2min 50s


In [82]:
# Print every topic as "weight*token + weight*token + ..." with readable tokens.
# The model only knows integer term ids, so the ids are taken straight from
# get_topic_terms() and looked up in `dictionary`. This avoids re-parsing the
# formatted string from print_topics() (splitting on '"' and calling int() on the
# pieces), which breaks as soon as the model is given an id2word mapping.
for idx in range(lda_model_tfidf.num_topics):
    words = ' + '.join(
        '%.3f*%s' % (weight, dictionary[int(term_id)])
        for term_id, weight in lda_model_tfidf.get_topic_terms(idx, topn=10)
    )
    print('Topic {}: {}'.format(idx, words))

Topic 0: 0.055*queri + 0.032*search + 0.031*process + 0.031*rank + 0.030*constraint + 0.027*estim + 0.026*graph + 0.025*data + 0.024*entiti + 0.024*function
Topic 1: 0.047*learn + 0.046*classif + 0.034*featur + 0.033*cluster + 0.032*social + 0.030*larg + 0.030*scale + 0.030*network + 0.028*effect + 0.027*task
Topic 2: 0.036*interact + 0.034*time + 0.034*generat + 0.033*model + 0.031*automat + 0.030*domain + 0.029*user + 0.028*rule + 0.027*represent + 0.026*comput
Topic 3: 0.046*mobil + 0.043*onlin + 0.040*context + 0.040*track + 0.037*awar + 0.036*activ + 0.034*person + 0.032*privaci + 0.027*group + 0.025*strategi
Topic 4: 0.066*inform + 0.059*retriev + 0.047*workshop + 0.047*proceed + 0.044*intern + 0.040*confer + 0.028*technolog + 0.027*answer + 0.026*extract + 0.026*view
Topic 5: 0.055*semant + 0.051*base + 0.045*content + 0.044*studi + 0.041*discoveri + 0.041*case + 0.039*measur + 0.036*probabilist + 0.035*agent + 0.028*topic
Topic 6: 0.070*recommend + 0.039*program + 0.034*collabo

In [83]:
# Expand every sparse bag-of-words into a dense count list of vocabulary length,
# keyed by the document's position in `bow_corpus`.
vocab_size = len(dictionary)
BoW_vectors = {}
for paper_idx, sparse_doc in tqdm(enumerate(bow_corpus)):
    dense_counts = [0] * vocab_size
    for term_id, count in sparse_doc:
        dense_counts[term_id] = count
    BoW_vectors[paper_idx] = dense_counts

231002it [00:04, 46437.04it/s]


In [84]:
# Aggregate paper-level bag-of-words vectors into one vector per group.
# Input  (groupid_paperid.csv): "<group_id>;...;<space-separated paper ids>" per line.
# Output (groupid_bow.csv):     "<group_id>;[(token_id, count), ...]" per line,
#                               listing only tokens with a positive count.
# At most MAX_PAPERS_PER_GROUP randomly chosen papers contribute to each group.
MAX_PAPERS_PER_GROUP = 500

with open('groupid_paperid.csv') as fin, open('groupid_bow.csv', 'w') as fout:
    fin.readline()  # skip the header row of the input file
    fout.write('group_id;bag_of_words\n')
    for line in tqdm(fin):
        # rstrip('\n') rather than line[:-1]: a final line without a trailing newline
        # would otherwise lose the last digit of its last paper id.
        parts = line.rstrip('\n').split(';')
        g = int(parts[0])
        paper_ids_g = [int(p) for p in parts[-1].split()]

        tokens_g = np.zeros(len(dictionary), dtype=int)
        for p in np.random.permutation(paper_ids_g)[:MAX_PAPERS_PER_GROUP]:
            tokens_g += BoW_vectors[p]

        # Cast counts to plain int so the serialised list never depends on how
        # NumPy scalars are rendered by repr() (e.g. "np.int64(3)" in NumPy >= 2).
        nonzero_counts = [(i, int(val)) for i, val in enumerate(tokens_g) if val > 0]
        fout.write(str(g) + ';' + str(nonzero_counts) + '\n')

33302it [05:42, 97.21it/s] 


In [85]:
# Release the dense per-paper vectors; they were only needed to write groupid_bow.csv.
# Re-running this cell without rebuilding BoW_vectors raises NameError.
del BoW_vectors

# Generate topics from the `groupid_bow.csv` file

In [87]:
%%time
import ast  # stdlib; imported here so this cell stays self-contained

# Load the per-group bags-of-words written to groupid_bow.csv and turn the
# serialised "[(token_id, count), ...]" strings back into Python lists.
# NOTE(review): `error_bad_lines` was removed in pandas 2.0 (use on_bad_lines='skip'
# there); it is kept so the cell still runs on the older pandas it was written for.
groups_raw = pd.read_csv('groupid_bow.csv', error_bad_lines=False, delimiter=';', skipinitialspace=True)

# ast.literal_eval only accepts Python literals, so a malformed or tampered file
# cannot execute code the way eval() would. The comprehension also avoids the
# quadratic cost of growing a list with `a = a + [b]`.
bags_of_words = [ast.literal_eval(cell) for cell in tqdm(groups_raw['bag_of_words'])]

df = pd.DataFrame({"id": groups_raw['group_id'], "bag_of_words": bags_of_words})

100%|██████████| 33302/33302 [01:07<00:00, 490.51it/s]

CPU times: user 1min 1s, sys: 1.74 s, total: 1min 3s
Wall time: 1min 8s





In [92]:
# Infer a topic distribution for every group from its aggregated bag-of-words.
# Built with a comprehension: `topics = topics + [...]` copies the whole list on
# every iteration and is quadratic in the number of groups.
# NOTE(review): the model was fitted on TF-IDF vectors (`corpus_tfidf`) but is queried
# here with raw counts — confirm this is intended, or transform with `tfidf[...]` first.
topics = [lda_model_tfidf.get_document_topics(bow) for bow in tqdm(df['bag_of_words'])]

100%|██████████| 33302/33302 [00:57<00:00, 581.09it/s]


In [93]:
# Pair each group id with its inferred topic distribution.
# NOTE: `topics` is rebound from a list to a DataFrame here, so this cell must not be
# re-run without first re-running the cell that builds the list.
topics = pd.DataFrame({"id": df['id'] , "topics":topics})

In [94]:
# Persist the group -> topics table; index=False omits the DataFrame index column.
topics.to_csv('../csv/topics.csv', index = False)

In [105]:
# Spot check: display the topic distribution inferred for the group at row label 6543.
lda_model_tfidf.get_document_topics(df['bag_of_words'][6543])

[(0, 0.21478636170952467),
 (1, 0.16209490377891622),
 (2, 0.061621287828714733),
 (3, 0.082658704277063372),
 (4, 0.047637520078682709),
 (5, 0.051632862813726296),
 (6, 0.044930853403263436),
 (7, 0.091755877927359675),
 (8, 0.19462335394248678),
 (9, 0.048258274240262203)]