In [1]:
"""
initial exploration of topic modeling of 
r/technews posts pulled through Reddit API
which is limited to max 1k posts
"""

'\ninitial exploration of r/technews posts pulled through Reddit API\nwhich is limited to max 1k posts\n'

In [2]:
import pandas as pd
import pickle
import datetime

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


In [3]:
with open('post_data_top.pickle', 'rb') as read_file:
    posts_top = pickle.load(read_file)
    
posts_top.shape

(962, 7)

In [5]:
##  make the created time stamp from epoch time into date
posts_top['created'] = posts_top.created.astype(int)
posts_top['date_time'] = pd.to_datetime(posts_top['created'], unit='s')

In [6]:
posts_top.head()

Unnamed: 0,title,score,id,url,num_comms,created,body,date_time
0,"Amazon VP Resigns, Calls Company ‘Chickenshit’...",56834,gdd9d8,https://www.vice.com/en_us/article/z3bjpj/amaz...,1760,1588633651,,2020-05-04 23:07:31
1,Robinhood plummets back down to a one-star rat...,34487,lakvbj,https://www.theverge.com/2021/2/1/22261178/rob...,705,1612260619,,2021-02-02 10:10:19
2,Twitter hides Trump tweet attacking Supreme Co...,29023,jn69ys,https://techcrunch.com/2020/11/02/twitter-trum...,1267,1604418227,,2020-11-03 15:43:47
3,Parler CEO says even his lawyers are abandonin...,24305,kuuq6y,https://www.theverge.com/2021/1/10/22223956/pa...,1485,1610366505,,2021-01-11 12:01:45
4,Trump blocked by Twitter and Facebook,23597,ks63ds,https://www.bbc.co.uk/news/technology-55569604,667,1610024302,,2021-01-07 12:58:22


In [7]:
min(posts_top.date_time)

#Timestamp('2018-06-23 00:31:27')

Timestamp('2018-06-23 00:31:27')

# initial topic modeling

## try NMF

In [33]:
vectorizer = CountVectorizer(stop_words = 'english')
title_words = vectorizer.fit_transform(posts_top.title)
title_words.shape

(962, 3566)

In [58]:
nmf_model = NMF(12)  # tried 5, 7, 10, 12, 15, 20, 25
post_topic = nmf_model.fit_transform(title_words)
post_topic.shape

(962, 12)

In [59]:
topic_word = nmf_model.components_
topic_word.shape

(12, 3566)

In [60]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['facebook', 'ads', 'ban', 'news', 'bans', 'zuckerberg'],
 ['musk', 'tesla', 'elon', 'billion', 'fortune', 'company'],
 ['google', 'search', 'reportedly', 'users', 'tracking', 'france'],
 ['spacex', 'internet', 'starlink', 'service', 'satellite', 'starship'],
 ['new', 'used', 'ai', 'detect', 'plasma', 'wi'],
 ['data', 'caps', 'comcast', 'customers', 'broadband', 'million'],
 ['apple', 'app', 'iphone', 'security', 'watches', 'forced'],
 ['says', 'china', 'world', 'fcc', 'court', 'used'],
 ['twitter', 'trump', 'covid', '19', 'tweets', 'hackers'],
 ['000', 'korea', 'tesla', '10', 'train', 'cars'],
 ['cells', 'scientists', 'cancer', 'using', 'light', 'developed'],
 ['amazon', 'tech', 'news', 'big', 'science', 'ring']]

In [63]:
posts_top['post_topic_nmf'] = post_topic.argmax(axis=1)

In [64]:
posts_top.head()

Unnamed: 0,title,score,id,url,num_comms,created,body,date_time,post_topic_nmf
0,"Amazon VP Resigns, Calls Company ‘Chickenshit’...",56834,gdd9d8,https://www.vice.com/en_us/article/z3bjpj/amaz...,1760,1588634000.0,,2020-05-04 23:07:31,11
1,Robinhood plummets back down to a one-star rat...,34487,lakvbj,https://www.theverge.com/2021/2/1/22261178/rob...,705,1612261000.0,,2021-02-02 10:10:19,2
2,Twitter hides Trump tweet attacking Supreme Co...,29023,jn69ys,https://techcrunch.com/2020/11/02/twitter-trum...,1267,1604418000.0,,2020-11-03 15:43:47,8
3,Parler CEO says even his lawyers are abandonin...,24305,kuuq6y,https://www.theverge.com/2021/1/10/22223956/pa...,1485,1610367000.0,,2021-01-11 12:01:45,7
4,Trump blocked by Twitter and Facebook,23597,ks63ds,https://www.bbc.co.uk/news/technology-55569604,667,1610024000.0,,2021-01-07 12:58:22,8


In [65]:
# Export to Excel to further review posts by NMF topics
posts_top.to_excel('post_topics_sample.xlsx', index=False)

Reviewed the topics assigned to each post title

0. Facebook
1. Tesla, Elon Musk
2. Google
3. SpaceX, NASA
4. variety - commonly of "new"
5. fines, breaches, bad news
6. Apple,  but getting app too and 'apple' the fruit
7. variety
8. about half Twitter, also covid, and other
9. variety
10. Scientists, studies
11. less than half about Amazon


## try LDA

In [73]:
lda_model = LatentDirichletAllocation(n_components=20)
doc_topic = lda_model.fit_transform(topic_word)
doc_topic.shape

(12, 20)

In [74]:
words = vectorizer.get_feature_names()
t = lda_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['remote', 'decades', 'lyft', 'upgraded', 'counter', 'crack'],
 ['twitter', 'trump', 'covid', '19', 'tweets', 'accounts'],
 ['amazon', 'tech', 'big', 'science', 'ring', 'japan'],
 ['000', 'korea', 'cars', '10', 'train', 'electric'],
 ['says', 'china', 'thomas', 'wrong', 'sold', 'huawei'],
 ['remote', 'decades', 'lyft', 'upgraded', 'counter', 'crack'],
 ['spacex', 'new', 'internet', 'service', 'starlink', 'broadband'],
 ['remote', 'decades', 'lyft', 'upgraded', 'counter', 'crack'],
 ['remote', 'decades', 'lyft', 'upgraded', 'counter', 'crack'],
 ['remote', 'decades', 'lyft', 'upgraded', 'counter', 'crack'],
 ['apple', 'iphone', 'app', 'security', 'iphones', 'forced'],
 ['remote', 'decades', 'lyft', 'upgraded', 'counter', 'crack'],
 ['remote', 'decades', 'lyft', 'upgraded', 'counter', 'crack'],
 ['remote', 'decades', 'lyft', 'upgraded', 'counter', 'crack'],
 ['remote', 'decades', 'lyft', 'upgraded', 'counter', 'crack'],
 ['cells', 'scientists', 'cancer', 'using', 'light', 'developed'],


In [None]:
# this output is odd that several topics have identical top words
# though LDA is better with larger corpora and this is a sample of 1k posts/sentences

## try TF-IDF vectorizer with NMF

In [75]:
# create the document-term matrix with TF-IDF vectorizer 
cv_tfidf = TfidfVectorizer(stop_words="english")
X_tfidf = cv_tfidf.fit_transform(posts_top.title).toarray()

dt_tfidf = pd.DataFrame(X_tfidf,columns=cv_tfidf.get_feature_names()) 
dt_tfidf

Unnamed: 0,000,000th,060,10,100,100b,100m,100mbps,10pm,11,...,youtuber,zappos,zdnet,zero,zhang,zip,zone,zoom,zte,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
nmf_model = NMF(27)  
post_topic2 = nmf_model.fit_transform(X_tfidf)
post_topic2.shape



(962, 27)

In [89]:
words2 = vectorizer.get_feature_names()
t2 = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t2]
topic_words

[['musk', 'elon', 'tesla', 'company', 'fortune', 'boring'],
 ['facebook', 'ads', 'content', 'news', 'bans', 'zuckerberg'],
 ['spacex', 'starship', 'starlink', 'rocket', 'launches', 'test'],
 ['google', 'search', 'reportedly', 'france', 'billion', 'tracking'],
 ['twitter', 'tweets', 'trump', 'account', 'election', 'misinformation'],
 ['data', 'location', 'selling', 'customers', 'sued', 'carriers'],
 ['apple', 'iphone', 'fined', 'security', 'forced', 'iphones'],
 ['china', 'huawei', 'zoom', 'chipmaker', 'technology', 'censored'],
 ['new', 'detect', 'cell', 'used', '3d', 'feature'],
 ['tesla', 'car', 'electric', 'volkswagen', 'model', 'working'],
 ['covid', '19', 'patients', 'ventilators', 'hospitals', 'treat'],
 ['tech', 'big', 'antitrust', 'science', 'giants', 'news'],
 ['ban', 'trump', 'tiktok', 'app', 'youtube', 'ads'],
 ['says', 'ceo', 'parler', 'carbon', 'zero', 'report'],
 ['amazon', 'reportedly', 'alexa', 'plan', 'workers', 'service'],
 ['microsoft', 'office', 'windows', '10', 'fl

In [78]:
## these 12 topics with tf-idf vectorizer worked better than just count vectorizer
## tried 15 topics - looks pretty good
## 20 topics got bitcoin as separate topic
## 25 looks good too
# tried 27 -- it's putting bitcoin with cars

In [None]:
# Need to add more preprocessing after pull all of the data
# vectorizer did lowercase and removed stop words

# try bi-grams 
# entity recognition
# maybe remove the numbers
# stemming/lemmatization
# POS tagging
