In [70]:
from os import walk
import pandas as pd
import spacy
from gensim import models
from gensim import corpora
import re

nlp = spacy.load("en_core_web_sm")

# Load Data

In [71]:
def read_path(path):
    f = []
    for (dirpath, dirnames, filenames) in walk(path):
        f.extend(filenames)
        break
    return f

In [72]:
names = read_path('news')
print(names)

['2020-11-11.csv', '2020-11-12.csv', '2020-11-13.csv', '2020-11-14.csv', '2020-11-15.csv', '2020-11-16.csv', '2020-11-17.csv', '2020-11-18.csv', '2020-11-19.csv', '2020-11-20.csv', '2020-11-21.csv', '2020-11-22.csv', '2020-11-23.csv', '2020-11-24.csv', '2020-11-25.csv', '2020-11-26.csv', '2020-11-27.csv', '2020-11-28.csv', '2020-11-29.csv', '2020-11-30.csv', '2020-12-1.csv', '2020-12-10.csv', '2020-12-11.csv', '2020-12-12.csv', '2020-12-13.csv', '2020-12-14.csv', '2020-12-15.csv', '2020-12-16.csv', '2020-12-17.csv', '2020-12-18.csv', '2020-12-19.csv', '2020-12-2.csv', '2020-12-20.csv', '2020-12-21.csv', '2020-12-22.csv', '2020-12-23.csv', '2020-12-24.csv', '2020-12-3.csv', '2020-12-4.csv', '2020-12-5.csv', '2020-12-6.csv', '2020-12-7.csv', '2020-12-8.csv', '2020-12-9.csv']


In [73]:
df_news = []

for file_name in names:
    df = pd.read_csv("news/"+file_name)
    df = df.loc[:, ['description']]
    df_news.append(df)

df_news = pd.concat(df_news)
df_news = df_news.reset_index()
del df_news['index']
print(len(df_news))
df_news.head()

2840


Unnamed: 0,description
0,The pro-democracy media billionaire is the mos...
1,The film is to be the 78-year-old actor's fift...
2,Trials of the Australian vaccine returned fals...
3,The US president-elect and his running mate be...
4,Volunteers have been asked to take and categor...


In [74]:
df_news['description'][25]

"Vietnam's information minister on Tuesday accused foreign streaming companies like Netflix and Apple of skirting their tax responsibilities, saying it would create unfair competition for domestic firms."

# word segmentation / lemma / tag remove

https://spacy.io/api/token

In [75]:
def strip(s):
    s = str(s)
    
    p = re.compile(r'<.*?>')
    s = p.sub('', s)
    
    p = re.compile(r'[\n\r\t]')
    s = p.sub('', s)
    
    return s

f = lambda x: [i.lemma_ for i in nlp(strip(x)) if not i.is_stop and i.is_alpha]
f("How are you! i'm fine thx <br>\t\n")

['fine', 'thx']

In [76]:
df_news['description'] = df_news['description'].apply(f)

In [77]:
print(df_news['description'][25])

['Vietnam', 'information', 'minister', 'Tuesday', 'accuse', 'foreign', 'stream', 'company', 'like', 'Netflix', 'Apple', 'skirt', 'tax', 'responsibility', 'say', 'create', 'unfair', 'competition', 'domestic', 'firm']


In [78]:
df_news.head()

Unnamed: 0,description
0,"[pro, democracy, medium, billionaire, high, pr..."
1,"[film, old, actor, fifth, final, instalment, f..."
2,"[trial, australian, vaccine, return, false, po..."
3,"[president, elect, run, mate, beat, finalist, ..."
4,"[volunteer, ask, categorise, thousand, photo, ..."


# bulid Dictionary

In [79]:
corpus = [t for t in df_news['description']]
dictionary = corpora.Dictionary(corpus)

In [80]:
print(dictionary)

Dictionary(11192 unique tokens: ['billionaire', 'controversial', 'democracy', 'fall', 'figure']...)


# to bag-of-ward

In [81]:
corpus_bow = [dictionary.doc2bow(text) for text in texts]

In [82]:
print(corpus_bow)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)], [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)], [(29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1)], [(38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1)], [(28, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1)], [(6, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1)], [(25, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1)], [(71, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1)], [(78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1)], [(88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (10

# to tf-idf

In [83]:
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

# NMF

https://radimrehurek.com/gensim/models/nmf.html

In [104]:
from gensim.models.nmf import Nmf
num_topics = 5

In [105]:
nmf = Nmf(corpus_tfidf, id2word=dictionary, num_topics=num_topics) #train
corpus_nmf = nmf[corpus_tfidf] #apply

In [106]:
for i,j in nmf.show_topics(num_topics=num_topics, num_words=10):
    j = re.sub("(\+ )?\-?\d*\.\d+\*?","",j)
    j = re.sub('"',"",j)
    print("Topic",i,":",j.split(" "))

Topic 0 : ['despite', 'plea', 'Kardashian', 'Kim', 'execution', 'Bernard', 'Indiana', 'Brandon', 'federal', 'EU']
Topic 1 : ['Trump', 'president', 'elect', 'run', 'mate', 'finalist', 'Donald', 'beat', 'include', 'safe']
Topic 2 : ['阅读全文', 'fall', 'foul', 'billionaire', 'profile', 'controversial', 'law', 'democracy', 'medium', 'figure']
Topic 3 : ['home', 'family', 'shoot', 'dead', 'Mr', 'dentist', 'Goodson', 'appointment', 'de', 'say']
Topic 4 : ['Apple', 'new', 'year', 'Google', 'event', 'vaccine', 'rise', 'smart', 'find', 'daily']


  j = re.sub("(\+ )?\-?\d*\.\d+\*?","",j)


# LSI

https://radimrehurek.com/gensim/models/lsimodel.html?highlight=lsa

In [107]:
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
corpus_lsi = lsi_model[corpus_tfidf]

In [108]:
for i,j in lsi_model.show_topics(num_topics=num_topics, num_words=10):
    j = re.sub("(\+ )?\-?\d*\.\d+\*?","",j)
    j = re.sub('"',"",j)
    print("Topic",i,":",j.split(" "))

Topic 0 : ['return', 'say', 'vaccine', 'appointment', 'Mr', 'Goodson', 'dentist', 'shoot', 'dead', 'family']
Topic 1 : ['high', 'White', 'fatality', 'House', 'daily', 'hold', 'coronavirus', 'relate', 'rise', 'record']
Topic 2 : ['vaccine', 'Johnson', 'Boris', 'negotiation', 'EU', 'continue', 'Drug', 'deem', 'Food', 'adviser']
Topic 3 : ['finalist', 'mate', 'Donald', 'elect', 'Trump', 'beat', 'president', 'run', 'include', 'Boris']
Topic 4 : ['Indiana', 'Bernard', 'execution', 'Kim', 'Kardashian', 'Brandon', 'plea', 'federal', 'despite', 'set']


  j = re.sub("(\+ )?\-?\d*\.\d+\*?","",j)


# LDA

In [109]:
lda_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
corpus_lda = lda_model[corpus_tfidf]

In [110]:
for i,j in lda_model.show_topics(num_topics=num_topics, num_words=10):
    j = re.sub("(\+ )?\-?\d*\.\d+\*?","",j)
    j = re.sub('"',"",j)
    print("Topic",i,":",j.split(" "))

Topic 0 : ['vaccine', 'Administration', 'deem', 'Drug', 'Food', 'adviser', 'panel', 'safe', 'effective', 'Apple']
Topic 1 : ['Apple', 'Google', 'Black', 'deal', 'new', 'Facebook', 'Friday', 'return', 'good', 'app']
Topic 2 : ['Apple', 'figure', 'profile', 'billionaire', 'foul', 'democracy', 'law', 'controversial', 'fall', 'medium']
Topic 3 : ['Apple', 'continue', 'photo', 'thousand', 'ask', 'categorise', 'Great', 'Barrier', 'volunteer', 'Reef']
Topic 4 : ['Apple', 'rise', 'event', 'say', 'Google', 'relate', 'hold', 'fatality', 'White', 'House']


  j = re.sub("(\+ )?\-?\d*\.\d+\*?","",j)


In [111]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

In [112]:
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus_bow, dictionary)
LDAvis_prepared