In [1]:
from google.colab import drive
drive.mount("/content/gdrive")
%cd /content/gdrive/MyDrive/Colab Notebooks/13 - NLP/

Mounted at /content/gdrive
/content/gdrive/MyDrive/Colab Notebooks/13 - NLP


In [None]:
pip install pyLDAvis

In [None]:
pip install umap

In [None]:
pip install unidecode

In [19]:
import sys
import umap, re, string
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lib.plot_helper as phelper
import unidecode
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
import spacy, nltk
from nltk.corpus import stopwords
nlp = spacy.load('en_core_web_sm')
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Latent Dirichlet Allocation

In [9]:
def text_replace_series(series):
    series = series.str.replace("(<br/>)", "")
    series = series.str.replace('(<a).*(>).*(</a>)', '')
    series = series.str.replace('(&amp)', '')
    series = series.str.replace('(&gt)', '')
    series = series.str.replace('(&lt)', '')
    series = series.str.replace('(\xa0)', ' ')
    return series

def preprocess_text(text, token=True):
    text = re.sub('[^a-zA-Z#]', ' ', text)
    text = unidecode.unidecode(text)
    text = [t.lower() for t in text.split() if len(t)>3]
    text = [t for t in text if t not in stop_words]
    # sent = nlp(' '.join(text)
    # text = [t.lemma_ for t in sent]
    
    # Join the characters again to form the string.
    if token == False:
        text = ' '.join(text)

    return text

## Model Training

In [10]:
npr = pd.read_csv('/data/npr.csv')
npr['Article'] = npr['Article'].apply(lambda x: preprocess_text(x, token=False))
npr.head()

Unnamed: 0,Article
0,washington even policy bipartisan politics can...
1,donald trump used twitter preferred means comm...
2,donald trump unabashedly praising russian pres...
3,updated russian president vladimir putin says ...
4,photography illustration video data visualizat...


In [11]:
# add new stopwords
# max_df ignore terms that have a document frequency strictly higher than the given threshold, show up too frequent
# min_df ignore terms that have a document frequency strictly lower than the given threshold, too rare
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words=nlp.Defaults.stop_words)
dtm = cv.fit_transform(npr['Article'])

# must give the number of topics(n_components)
# doc_topic_prior, Higher the alpha, documents are composed of more topics
# topic_word_prior, Higher the beta, topics are composed of a large number of words
LDA = LatentDirichletAllocation(n_components=10, random_state=42, max_iter=20, n_jobs=-1)
# LDA.fit(dtm)
lda_topic_matrix = LDA.fit_transform(dtm)


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ll', 've'] not in stop_words.



In [12]:
# lda_topic_matrix comprise of 6 probabilties (of 10 topics) for 22641 documents
# lda_keys contains the topic each document belong to
# lda_dict stores the total number of documents of each topics
lda_keys = lda_topic_matrix.argmax(axis=1).tolist()
lda_dicts = {a : b for a, b in Counter(lda_keys).items()}
lda_dicts

{0: 2816,
 1: 757,
 2: 925,
 3: 939,
 4: 1018,
 5: 1207,
 6: 1051,
 7: 331,
 8: 1604,
 9: 1344}

In [13]:
# get the number of vocab
print(len(cv.get_feature_names()))
print(cv.get_feature_names()[6000])

51763
bumbling


In [14]:
# 10 topic, with assigned probabilities to each vocab
print(lda_topic_matrix.shape)
print(LDA.components_.shape)

(11992, 10)
(10, 51763)


In [15]:
single_topic = LDA.components_[0]
# returns the indices of that Top 5 words for this topic with highest probabilities.
[cv.get_feature_names()[index] for index in single_topic.argsort()[-10:]]

['going',
 'world',
 'years',
 'life',
 'know',
 'think',
 'time',
 'people',
 'says',
 'like']

In [16]:
top_words_dict = {}
# users got to dedice the topic name
for index, topic in enumerate(LDA.components_):
    top_words = [cv.get_feature_names()[i] for i in topic.argsort()[-10:]]
    print(top_words)
    top_words_dict[index] = ' '.join(top_words)

['going', 'world', 'years', 'life', 'know', 'think', 'time', 'people', 'says', 'like']
['year', 'national', 'farmers', 'land', 'climate', 'years', 'like', 'water', 'food', 'says']
['political', 'campaign', 'country', 'women', 'party', 'people', 'president', 'trump', 'clinton', 'said']
['state', 'told', 'justice', 'case', 'court', 'says', 'black', 'people', 'police', 'said']
['money', 'court', 'million', 'companies', 'said', 'state', 'government', 'federal', 'company', 'says']
['russia', 'news', 'administration', 'campaign', 'white', 'obama', 'house', 'president', 'said', 'trump']
['parents', 'album', 'education', 'song', 'schools', 'says', 'like', 'music', 'students', 'school']
['election', 'sanders', 'cruz', 'states', 'vote', 'clinton', 'voters', 'state', 'percent', 'trump']
['patients', 'medical', 'research', 'like', 'study', 'percent', 'care', 'people', 'health', 'says']
['told', 'attack', 'reports', 'country', 'government', 'military', 'city', 'people', 'said', 'says']


In [17]:
npr['Topic'] = lda_topic_matrix.argmax(axis=1)
npr.head()

Unnamed: 0,Article,Topic
0,washington even policy bipartisan politics can...,5
1,donald trump used twitter preferred means comm...,5
2,donald trump unabashedly praising russian pres...,5
3,updated russian president vladimir putin says ...,5
4,photography illustration video data visualizat...,4


In [20]:
phelper.get_bar(
    list(lda_dicts.keys()), 
    list(lda_dicts.values()), 
    ytitle='Number of Topics',
    title='LDA topic counts')

Filter based on Part of Speech Tag to improve Accuracy <br/>
In order to retrieve most important topic terms, a corpus can be divided into batches of fixed sizes. 

# Latent Semantic Analysis

In [21]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
news_df = pd.DataFrame({'document':documents})
news_df['clean_doc'] = news_df['document'].apply(lambda x: preprocess_text(x, token=False))
news_df.head()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure story seem biased disagree statement...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize principle strongest points wo...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss proposal much ...
4,"Well, I will have to change the scoring on my ...",well change scoring playoff pool unfortunately...


In [22]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, smooth_idf=True)
X = vectorizer.fit_transform(news_df['clean_doc'])
X.shape

(11314, 64741)

In [23]:
# doc_topic_prior, Higher the alpha, documents are composed of more topics
# topic_word_prior, Higher the beta, topics are composed of a large number of words

lsa_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)
lsa_topic_matrix = lsa_model.fit_transform(X)
len(lsa_model.components_)

20

In [24]:
lsa_keys = lsa_topic_matrix.argmax(axis=1).tolist()
lsa_dicts = {a : b for a, b in Counter(lsa_keys).items()}
lsa_dicts

{0: 7457,
 1: 323,
 2: 267,
 3: 788,
 4: 272,
 5: 110,
 6: 407,
 7: 250,
 8: 7,
 9: 190,
 10: 124,
 11: 202,
 12: 210,
 13: 69,
 14: 128,
 15: 91,
 16: 98,
 17: 40,
 18: 192,
 19: 89}

In [25]:
# 10 topic, with assigned probabilities to each vocab
print(lsa_topic_matrix.shape)
print(lsa_model.components_.shape)

(11314, 20)
(20, 64741)


In [26]:
top_words_dict = {}
# users got to dedice the topic name
for index, topic in enumerate(lsa_model.components_):
    top_words = [vectorizer.get_feature_names()[i] for i in topic.argsort()[-10:]]
    print(top_words)

['want', 'need', 'windows', 'thanks', 'time', 'good', 'think', 'people', 'like', 'know']
['files', 'mail', 'disk', 'video', 'scsi', 'file', 'card', 'drive', 'thanks', 'windows']
['cable', 'power', 'card', 'floppy', 'disk', 'controller', 'hard', 'drives', 'scsi', 'drive']
['league', 'play', 'hockey', 'thanks', 'season', 'players', 'year', 'games', 'team', 'game']
['number', 'algorithm', 'phone', 'data', 'escrow', 'keys', 'government', 'clipper', 'encryption', 'chip']
['soon', 'banks', 'gordon', 'surrender', 'skepticism', 'intellect', 'shameful', 'chastity', 'cadre', 'pitt']
['post', 'information', 'looking', 'address', 'email', 'info', 'advance', 'know', 'mail', 'thanks']
['modem', 'sale', 'price', 'driver', 'chip', 'drivers', 'cards', 'monitor', 'video', 'card']
['team', 'government', 'clipper', 'game', 'encryption', 'scsi', 'chip', 'windows', 'thanks', 'know']
['state', 'arab', 'government', 'jews', 'people', 'turkish', 'israeli', 'armenians', 'armenian', 'israel']
['really', 'right',

# Non-negative Matrix Factorization

Use NMF(highest coefficients) instead of LDA (highest probabilites).

In [27]:
npr = pd.read_csv('./data/npr.csv')
npr['Article'] = npr['Article'].apply(lambda x: preprocess_text(x, token=False))
npr.head()

Unnamed: 0,Article
0,washington even policy bipartisan politics can...
1,donald trump used twitter preferred means comm...
2,donald trump unabashedly praising russian pres...
3,updated russian president vladimir putin says ...
4,photography illustration video data visualizat...


In [28]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = tfidf.fit_transform(npr['Article'])
nmf_model = NMF(n_components=10, random_state=42)
nmf_topic_matrix = nmf_model.fit_transform(dtm)

In [29]:
print(len(tfidf.get_feature_names()))
print(nmf_model.components_.shape)
print(nmf_topic_matrix.shape)

51757
(10, 51757)
(11992, 10)


In [30]:
nmf_keys = nmf_topic_matrix.argmax(axis=1).tolist()
nmf_dicts = {a : b for a, b in Counter(nmf_keys).items()}
nmf_dicts

{0: 2567,
 1: 1171,
 2: 654,
 3: 573,
 4: 1402,
 5: 3116,
 6: 667,
 7: 355,
 8: 679,
 9: 808}

In [31]:
top_words_dict = {}
# users got to dedice the topic name
for index, topic in enumerate(nmf_model.components_):
    top_words = [tfidf.get_feature_names()[i] for i in topic.argsort()[-10:]]
    print(top_words)
    top_words_dict[index] = ' '.join(top_words)

['study', 'company', 'percent', 'china', 'years', 'like', 'water', 'food', 'people', 'says']
['presidential', 'obama', 'house', 'white', 'republican', 'campaign', 'donald', 'said', 'president', 'trump']
['republicans', 'plan', 'people', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']
['vote', 'party', 'state', 'delegates', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']
['forces', 'people', 'killed', 'reports', 'city', 'isis', 'attack', 'officers', 'said', 'police']
['women', 'life', 'people', 'album', 'know', 'really', 'song', 'think', 'like', 'music']
['parents', 'children', 'college', 'kids', 'teachers', 'student', 'education', 'schools', 'school', 'students']
['pregnant', 'microcephaly', 'cases', 'health', 'mosquitoes', 'disease', 'mosquito', 'women', 'virus', 'zika']
['putin', 'said', 'investigation', 'committee', 'president', 'flynn', 'intelligence', 'russian', 'comey', 'russia']
['case', 'order', 'president', 'state', 'feder

In [32]:
topic_results = nmf_model.transform(dtm)
npr['Topic'] = topic_results.argmax(axis=1)
npr.head(10)

Unnamed: 0,Article,Topic
0,washington even policy bipartisan politics can...,8
1,donald trump used twitter preferred means comm...,1
2,donald trump unabashedly praising russian pres...,8
3,updated russian president vladimir putin says ...,8
4,photography illustration video data visualizat...,6
5,want join yoga class hated beatific instructor...,5
6,publicly supported debunked claim vaccines cau...,0
7,standing airport exit debating whether snack y...,0
8,movies trying realistic perhaps summon batman ...,0
9,eighteen years year david fisher visited farm ...,0


In [34]:
phelper.get_bar(
    list(nmf_dicts.keys()), 
    list(nmf_dicts.values()), 
    ytitle='Number of Topics',
    title='NMF topic counts')