# <center>Generative Probabilistic Models for collections of text corpora: Latent Dirichlet Allocation on the Reuters dataset.</center>

<center>Eloi Garcia Climent</center>

<center>Barcelona School of Economics</center>

## Importing the Required Libraries

In [None]:
import numpy as np
import pandas as pd
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy

#vis
import pyLDAvis

#nltk
import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Preparing the Data



1.   Load the dataset



In [None]:
reuters = pd.read_csv("reutersCSV.csv",encoding= 'unicode_escape')



2.   Punctuation and stopwords download



In [None]:
nltk.download('stopwords')
nltk.download('punkt')
stopwords = stopwords.words("english")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eloid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eloid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


3. Cleaning and tokenizing the data

In [None]:
reuters = reuters.dropna()
reuters['doc.text'] = reuters['doc.text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
reuters["doc.text"] = reuters["doc.text"].str.lower()
reuters["doc.text"] = reuters["doc.text"].str.replace('[^\w\s]','')
reuters["doc.text.token"] = reuters["doc.text"].apply(nltk.word_tokenize)
reuters["doc.text"][1]

  reuters["doc.text"] = reuters["doc.text"].str.replace('[^\w\s]','')


'standard oil co bp north america inc said plan form venture manage money market borrowing investment activities companies bp north america subsidiary british petroleum co plc ltbp also owns 55 pct interest standard oil the venture called bpstandard financial trading operated standard oil oversight joint management committee reuter'

4. Topic filtering (We're going to classify the 30 most common topics)

In [None]:
import math
#Getting 30 most common topics
reuters.loc['total'] = reuters.select_dtypes(pd.np.number).sum()

#Remove news from non-common topics
common_topics = reuters.iloc[-1].sort_values(ascending=False)[2:31].index
filter = np.any(reuters[common_topics]>0,axis=1)
reuters = reuters[filter]
reuters = reuters.drop(index="total")

#remove topics columns
reuters.loc['total'] = reuters.select_dtypes(pd.np.number).sum()
reuters = reuters.loc[:, (reuters!=0).any(0)]
#reuters = reuters.drop(index="total")
part1 = reuters.iloc[:,np.array(reuters.loc["total"]>66)]
part2 = reuters.iloc[:,np.array(reuters.loc["total"]) != np.array(reuters.loc["total"])]
reuters = pd.merge(part1, part2, left_index=True, right_index=True)
reuters = reuters.drop(index="total")

#Everything in one column
reuters["real_topic"] = 0
reuters = reuters.drop(["pid","fileName"],axis=1)
count = 1
for topic in common_topics:
    #reuters["real_topic"] = [count if x == 1 else reuters["real_topic"] for x in reuters[topic]]
    reuters[topic] = reuters[topic].replace(1.0,count)
    count = count +1
reuters
reuters["real_topic"]= reuters.max(axis=1)
reuters["predicted_topic"] = 0
reuters = reuters.drop(common_topics,axis=1)
reuters

  reuters.loc['total'] = reuters.select_dtypes(pd.np.number).sum()
  reuters.loc['total'] = reuters.select_dtypes(pd.np.number).sum()
  reuters["real_topic"]= reuters.max(axis=1)


Unnamed: 0,purpose,doc.title,doc.text,doc.text.token,real_topic,predicted_topic
0,train,BAHIA COCOA REVIEW,showers continued throughout week bahia cocoa ...,"[showers, continued, throughout, week, bahia, ...",28.0,0
4,train,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE,the us agriculture department reported farmero...,"[the, us, agriculture, department, reported, f...",10.0,0
5,train,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS,argentine grain board figures show crop regist...,"[argentine, grain, board, figures, show, crop,...",21.0,0
8,train,CHAMPION PRODUCTS &lt;CH> APPROVES STOCK SPLIT,champion products inc said board directors app...,"[champion, products, inc, said, board, directo...",1.0,0
9,train,COMPUTER TERMINAL SYSTEMS &lt;CPML> COMPLETES ...,computer terminal systems inc said completed s...,"[computer, terminal, systems, inc, said, compl...",2.0,0
...,...,...,...,...,...,...
21569,test,THAI RICE EXPORTS RISE IN WEEK TO OCTOBER 13,thai rice exports rose 72987 tonnes week ended...,"[thai, rice, exports, rose, 72987, tonnes, wee...",29.0,0
21570,test,N.Z.'S CHASE CORP MAKES OFFER FOR ENTREGROWTH,chase corp ltd ltchcawe said make offer fullyp...,"[chase, corp, ltd, ltchcawe, said, make, offer...",2.0,0
21572,test,TOKYO DEALERS SEE DOLLAR POISED TO BREACH 140 YEN,tokyos foreign exchange market watching nervou...,"[tokyos, foreign, exchange, market, watching, ...",13.0,0
21573,test,JAPAN/INDIA CONFERENCE CUTS GULF WAR RISK CHARGES,the japanindiapakistangulfjapan shipping confe...,"[the, japanindiapakistangulfjapan, shipping, c...",8.0,0


5. Obtaining the words that will be included in the model

In [None]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(reuters[reuters["purpose"]=="train"]["doc.text"])

print (data_words[0][0:20])

['showers', 'continued', 'throughout', 'week', 'bahia', 'cocoa', 'zone', 'alleviating', 'drought', 'since', 'early', 'january', 'improving', 'prospects', 'coming', 'temporao', 'although', 'normal', 'humidity', 'levels']


6. Generating bigrams and trigrams

In [None]:
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=100)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return([bigram[doc] for doc in texts])

def make_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

data_bigrams = make_bigrams(data_words)
data_bigrams_and_trigrams = make_trigrams(data_bigrams)

print(data_bigrams_and_trigrams[0])

['showers', 'continued', 'throughout', 'week', 'bahia', 'cocoa', 'zone', 'alleviating', 'drought', 'since', 'early', 'january', 'improving', 'prospects', 'coming', 'temporao', 'although', 'normal', 'humidity', 'levels', 'restored', 'comissaria', 'smith', 'said', 'weekly', 'review', 'the', 'dry', 'period', 'means', 'temporao', 'late', 'year', 'arrivals', 'week', 'ended', 'february', 'bags', 'kilos', 'making', 'cumulative', 'total', 'season', 'mln', 'stage', 'last', 'year', 'again', 'seems', 'cocoa', 'delivered', 'earlier', 'consignment', 'included', 'arrivals', 'figures', 'comissaria', 'smith', 'said', 'still', 'doubt', 'much', 'old', 'crop', 'cocoa', 'still', 'available', 'harvesting', 'practically', 'come', 'end', 'with', 'total', 'bahia', 'crop', 'estimates', 'around', 'mln', 'bags', 'sales', 'standing', 'almost', 'mln', 'hundred', 'thousand', 'bags', 'still', 'hands', 'farmers', 'middlemen', 'exporters', 'processors', 'there', 'doubts', 'much', 'cocoa', 'would', 'fit', 'export', 'sh

7. Processing the low-incidence words (below 3%) with a Term frequency - Inverse Document Frequency model and generating the train corpus.

In [None]:
from gensim.models import TfidfModel

id2word = corpora.Dictionary(data_bigrams_and_trigrams)
texts = data_bigrams_and_trigrams

corpus=[id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words = []
words_missing_in_tfidf = []
for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = []
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids]

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow


In [None]:
id2word = corpora.Dictionary(data_words)

corpus = []
for text in data_words:
    new = id2word.doc2bow(text)
    corpus.append(new)

print (corpus[0][0:20])

word = id2word[[0][:1][0]]
print (word)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 3), (13, 1), (14, 5), (15, 4), (16, 2), (17, 1), (18, 1), (19, 2)]
again


# LDA Model

In [None]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")


## Model topics generated

In [None]:
lda_model.print_topics(30,10)

[(0,
  '0.220*"billion" + 0.082*"february" + 0.051*"rose" + 0.050*"currency" + 0.047*"january" + 0.028*"surplus" + 0.022*"the" + 0.020*"figures" + 0.020*"account" + 0.019*"revised"'),
 (1,
  '0.255*"sugar" + 0.089*"short" + 0.079*"tons" + 0.034*"appreciation" + 0.029*"drilling" + 0.025*"television" + 0.024*"cited" + 0.017*"tighten" + 0.015*"former" + 0.014*"economics"'),
 (2,
  '0.122*"production" + 0.119*"prices" + 0.069*"june" + 0.056*"petroleum" + 0.049*"price" + 0.047*"bpd" + 0.034*"energy" + 0.028*"lbs" + 0.027*"barley" + 0.026*"output"'),
 (3,
  '0.102*"share" + 0.063*"per" + 0.062*"dividend" + 0.053*"cts" + 0.044*"nil" + 0.042*"stock" + 0.042*"income" + 0.037*"april" + 0.036*"tax" + 0.030*"gains"'),
 (4,
  '0.060*"department" + 0.053*"may" + 0.048*"tonnes" + 0.048*"us" + 0.047*"export" + 0.044*"corn" + 0.038*"the" + 0.037*"cocoa" + 0.036*"agriculture" + 0.036*"tonne"'),
 (5,
  '0.092*"coffee" + 0.067*"producers" + 0.064*"proposed" + 0.053*"change" + 0.050*"among" + 0.037*"produc

## Retrieve of the predicted topics

In [None]:
from operator import itemgetter
reuters_train = reuters[reuters["purpose"]=="train"]
topics_train = []
for text in reuters_train["doc.text.token"]:
    topics_train.append(max(lda_model[id2word.doc2bow(text)],key=itemgetter(1))[0])
reuters_train['predicted_topic']=topics_train
reuters_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reuters_train['predicted_topic']=topics_train


Unnamed: 0,purpose,doc.title,doc.text,doc.text.token,real_topic,predicted_topic
0,train,BAHIA COCOA REVIEW,showers continued throughout week bahia cocoa ...,"[showers, continued, throughout, week, bahia, ...",28.0,28
4,train,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE,the us agriculture department reported farmero...,"[the, us, agriculture, department, reported, f...",10.0,2
5,train,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS,argentine grain board figures show crop regist...,"[argentine, grain, board, figures, show, crop,...",21.0,8
8,train,CHAMPION PRODUCTS &lt;CH> APPROVES STOCK SPLIT,champion products inc said board directors app...,"[champion, products, inc, said, board, directo...",1.0,21
9,train,COMPUTER TERMINAL SYSTEMS &lt;CPML> COMPLETES ...,computer terminal systems inc said completed s...,"[computer, terminal, systems, inc, said, compl...",2.0,21
...,...,...,...,...,...,...
14770,train,N.Z. UNEMPLOYMENT RATE 3.9 PCT IN DECEMBER QUA...,new zealands unemployment rate 39 pct workforc...,"[new, zealands, unemployment, rate, 39, pct, w...",27.0,13
14778,train,BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPENING,the bank japan bought small amount dollars sho...,"[the, bank, japan, bought, small, amount, doll...",13.0,28
14784,train,SOUTH KOREAN WON FIXED AT 25-MONTH HIGH,the bank of korea said it fixed the midrate of...,"[the, bank, of, korea, said, it, fixed, the, m...",3.0,17
14804,train,NIPPON MINING LOWERS COPPER PRICE,nippon mining co ltd said lowered selling pric...,"[nippon, mining, co, ltd, said, lowered, selli...",24.0,21


## Performance metrics

In [2]:
#Clustering Performance
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import fowlkes_mallows_score
print(adjusted_rand_score(reuters_train["real_topic"],reuters_train["predicted_topic"]))
print(normalized_mutual_info_score(reuters_train["real_topic"],reuters_train["predicted_topic"]))
print(adjusted_mutual_info_score(reuters_train["real_topic"],reuters_train["predicted_topic"]))
print(fowlkes_mallows_score(reuters_train["real_topic"],reuters_train["predicted_topic"]))

0.7272164259337847
0.8048761522738317
0.7935778389395145
0.8574752671362691


## Visualization of the generated model topics

In [None]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  default_term_info  = pd.DataFrame({'saliency': saliency, 'Term': vocab, \


# Applying the LDA model to the test subset

In [None]:
reuters_test = reuters[reuters["purpose"]=="test"]
other_corpus = [id2word.doc2bow(text) for text in reuters_test["doc.text.token"]]
topics_test = []
for text in reuters_test["doc.text.token"]:
    topics_test.append(max(lda_model[id2word.doc2bow(text)],key=itemgetter(1))[0])
reuters_test['predicted_topic']=topics_test
reuters_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reuters_test['predicted_topic']=topics_test


Unnamed: 0,purpose,doc.title,doc.text,doc.text.token,real_topic,predicted_topic
14825,test,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT,mounting trade friction us and japan raised fe...,"[mounting, trade, friction, us, and, japan, ra...",6.0,28
14827,test,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STOCKS,a survey 19 provinces seven cities showed verm...,"[a, survey, 19, provinces, seven, cities, show...",4.0,17
14828,test,JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS,the ministry international trade industry miti...,"[the, ministry, international, trade, industry...",18.0,28
14831,test,THAI TRADE DEFICIT WIDENS IN FIRST QUARTER,thailands trade deficit widened 45 billion bah...,"[thailands, trade, deficit, widened, 45, billi...",29.0,17
14832,test,INDONESIA SEES CPO PRICE RISING SHARPLY,indonesia expects crude palm oil cpo prices ri...,"[indonesia, expects, crude, palm, oil, cpo, pr...",16.0,28
...,...,...,...,...,...,...
21569,test,THAI RICE EXPORTS RISE IN WEEK TO OCTOBER 13,thai rice exports rose 72987 tonnes week ended...,"[thai, rice, exports, rose, 72987, tonnes, wee...",29.0,19
21570,test,N.Z.'S CHASE CORP MAKES OFFER FOR ENTREGROWTH,chase corp ltd ltchcawe said make offer fullyp...,"[chase, corp, ltd, ltchcawe, said, make, offer...",2.0,21
21572,test,TOKYO DEALERS SEE DOLLAR POISED TO BREACH 140 YEN,tokyos foreign exchange market watching nervou...,"[tokyos, foreign, exchange, market, watching, ...",13.0,28
21573,test,JAPAN/INDIA CONFERENCE CUTS GULF WAR RISK CHARGES,the japanindiapakistangulfjapan shipping confe...,"[the, japanindiapakistangulfjapan, shipping, c...",8.0,28


## Performance metrics for the test subset

In [None]:
print(adjusted_rand_score(reuters_test["real_topic"],reuters_test["predicted_topic"]))
print(normalized_mutual_info_score(reuters_test["real_topic"],reuters_test["predicted_topic"]))
print(adjusted_mutual_info_score(reuters_test["real_topic"],reuters_test["predicted_topic"]))
print(fowlkes_mallows_score(reuters_test["real_topic"],reuters_test["predicted_topic"]))

0.5750176258614653
0.4881550145750465
0.46725396432190924
0.6713059354767479
