In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx


In [467]:
# Load the Medium articles dataset from the CSV file in the working directory.
# Ending the cell with the bare name renders the frame as the cell output.
articles = pd.read_csv("Medium_articles.csv")
articles

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...
...,...,...,...,...,...,...
332,Daniel Simmons,3.4K,8,https://itnext.io/you-can-build-a-neural-netwo...,You can build a neural network in JavaScript e...,Click here to share this article on LinkedIn »...
333,Eugenio Culurciello,2.8K,13,https://towardsdatascience.com/artificial-inte...,"Artificial Intelligence, AI in 2018 and beyond...",These are my opinions on where deep neural net...
334,Devin Soni,5.8K,4,https://towardsdatascience.com/spiking-neural-...,"Spiking Neural Networks, the Next Generation o...",Everyone who has been remotely tuned in to rec...
335,Carlos E. Perez,3.9K,7,https://medium.com/intuitionmachine/neurons-ar...,Surprise! Neurons are Now More Complex than We...,One of the biggest misconceptions around is th...


In [468]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337 entries, 0 to 336
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   author        337 non-null    object
 1   claps         337 non-null    object
 2   reading_time  337 non-null    int64 
 3   link          337 non-null    object
 4   title         337 non-null    object
 5   text          337 non-null    object
dtypes: int64(1), object(5)
memory usage: 15.9+ KB


In [491]:
# Inspect a single article body (row label 1) to see what the "text" column looks like.
sample_text = articles.loc[1, "text"]
print(sample_text)

if you’ve ever found yourself looking up the same question, concept, or syntax over and over again when programming, you’re not alone.
i find myself doing this constantly.
while it’s not unnatural to look things up on stackoverflow or other resources, it does slow you down a good bit and raise questions as to your complete understanding of the language.
we live in a world where there is a seemingly infinite amount of accessible, free resources looming just one search away at all times. however, this can be both a blessing and a curse. when not managed effectively, an over-reliance on these resources can build poor habits that will set you back long-term.
personally, i find myself pulling code from similar discussion threads several times, rather than taking the time to learn and solidify the concept so that i can reproduce the code myself the next time.
this approach is lazy and while it may be the path of least resistance in the short-term, it will ultimately hurt your growth, product

In [472]:
# Step 1:
# Preprocess the data and obtain the most frequent words across all the articles.
# Repeat this at least three times; in each iteration, extend the set of stopwords
# with words that turned out to be frequent but are not particularly informative
# for understanding what the articles are about.

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords



In [474]:
# Normalise case: overwrite the "text" column with its lowercase version so that
# later token comparisons against the (lowercase) stopword list can match.
lowered_text = articles["text"].str.lower()
articles["text"] = lowered_text
articles

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"oh, how the headlines blared:\nchatbots were t..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,if you’ve ever found yourself looking up the s...
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,machine learning is increasingly moving from h...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,if your understanding of a.i. and machine lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,want to learn about applied artificial intelli...
...,...,...,...,...,...,...
332,Daniel Simmons,3.4K,8,https://itnext.io/you-can-build-a-neural-netwo...,You can build a neural network in JavaScript e...,click here to share this article on linkedin »...
333,Eugenio Culurciello,2.8K,13,https://towardsdatascience.com/artificial-inte...,"Artificial Intelligence, AI in 2018 and beyond...",these are my opinions on where deep neural net...
334,Devin Soni,5.8K,4,https://towardsdatascience.com/spiking-neural-...,"Spiking Neural Networks, the Next Generation o...",everyone who has been remotely tuned in to rec...
335,Carlos E. Perez,3.9K,7,https://medium.com/intuitionmachine/neurons-ar...,Surprise! Neurons are Now More Complex than We...,one of the biggest misconceptions around is th...


In [515]:
# Remove English stopwords from every article. The token list in "split" is an
# intermediate result; "split_final" holds each article as a single string again.

# Kept as a list because later cells extend it with `stop_words + [...]`.
stop_words = list(stopwords.words("english"))

# Set copy used only for membership tests: a set lookup is O(1), whereas testing
# every token of every article against the list is O(len(stop_words)).
stop_word_lookup = set(stop_words)

# Tokenise on whitespace and drop exact stopword matches. Punctuation stays
# attached to tokens (e.g. "oh,"), so such forms are not filtered at this stage.
articles['split'] = articles["text"].apply(
    lambda text: [token for token in text.split() if token not in stop_word_lookup]
)

# Get back the single string format.
articles['split_final'] = articles['split'].apply(lambda tokens: ' '.join(tokens))
articles.head()

Unnamed: 0,author,claps,reading_time,link,title,text,split,split_final
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"oh, how the headlines blared:\nchatbots were t...","[oh,, headlines, blared:, chatbots, next, big,...","oh, headlines blared: chatbots next big thing...."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,if you’ve ever found yourself looking up the s...,"[you’ve, ever, found, looking, question,, conc...","you’ve ever found looking question, concept, s..."
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,machine learning is increasingly moving from h...,"[machine, learning, increasingly, moving, hand...",machine learning increasingly moving hand-desi...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,if your understanding of a.i. and machine lear...,"[understanding, a.i., machine, learning, big, ...",understanding a.i. machine learning big questi...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,want to learn about applied artificial intelli...,"[want, learn, applied, artificial, intelligenc...",want learn applied artificial intelligence lea...


In [477]:
# Build a bag-of-words count matrix from the stopword-filtered article strings.

from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer with default settings.
vect = CountVectorizer()

# Learn the vocabulary and produce the sparse document-term count matrix.
article_counts = vect.fit_transform(articles['split_final'])

# Size of the result: one row per original article and one column per word
# that appears in at least one article (337 x 20393 on the recorded run).
article_counts.shape

(337, 20393)

In [478]:
# Show the sparse matrix summary (shape, dtype, number of stored elements, format).
article_counts

<337x20393 sparse matrix of type '<class 'numpy.int64'>'
	with 187415 stored elements in Compressed Sparse Row format>

In [479]:
# Convert the sparse count matrix to a dense DataFrame with one column per term.

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vect, "get_feature_names_out"):
    vocabulary_terms = vect.get_feature_names_out()
else:
    vocabulary_terms = vect.get_feature_names()

article_counts_df = pd.DataFrame(article_counts.toarray(), columns=vocabulary_terms)

article_counts_df.head()

Unnamed: 0,00,000,0000,00000,0001,0002which,00061,00078,000assuming,000s,...,记录一下,说实话,还是看代码比较有感觉,还是要多对照着代码看,这一阵为了工作上的关系,这个步骤并不适合各位读博士发论文的同学们,这样前前后后,都没读完而且得到到信息也很有限,除了集体智慧编程这本书之外基本没怎么看过机器学习的人来说,高久力
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [525]:
# Find which article(s) contain the long Chinese "word" seen in the vocabulary.
target_token = "除了集体智慧编程这本书之外基本没怎么看过机器学习的人来说"

# Literal (non-regex) substring search over every row. The previous
# `range(1, 337)` loop skipped row 0 and hard-coded the number of rows.
contains_token = articles["split_final"].str.contains(target_token, regex=False)

for i in articles.index[contains_token]:
    print (i, "article")

23 article


In [557]:
print(articles.loc[23].split)
# Interesting: there is an article in Chinese, and it was not split into words,
# because the tokens were produced by splitting on whitespace only.

['这一阵为了工作上的关系,花了点时间学习了一下lda算法,说实话,对于我这个学cs而非学数学的人来说,除了集体智慧编程这本书之外基本没怎么看过机器学习的人来说,一开始还真是摸不太到门道,前前后后快要四个月了,算是基本了解了这个算法的实现,记录一下,也供后来人快速入门做个参考。', '一开始直接就下了blei的原始的那篇论文来看,但是看了个开头就被dirichlet分布和几个数学公式打倒,然后因为专心在写项目中的具体的代码,也就先放下了。但是因为发现完全忘记了本科学的概率和统计的内容,只好回头去看大学时候概率论的教材,发现早不知道借给谁了,于是上网买了本,花了几天时间大致回顾了一遍概率论的知识,什么贝叶斯全概率公式,正态分布,二项分布之类的。后来晚上没事儿的时候,去水木的ai版转了转,了解到了machine', 'learning的圣经prml,考虑到反正也是要长期学习了,搞了电子版,同时上淘宝买了个打印胶装的版本。春节里每天晚上看一点儿,扫了一下前两章,再次回顾了一下基本数学知识,然后了解了下贝叶斯学派那种采用共轭先验来建模的方式。于是再次尝试回头去看blei的那篇论文,发现还是看不太懂,于是又放下了。然后某天tony让我准备准备给复旦的同学们share一下我们项目中lda的使用,为了不露怯,又去翻论文,正好看到science上这篇topic', 'models', 'vs.', 'unstructured', 'data的科普性质的文章,翻了一遍之后,再去prml里看了一遍graphic', 'models那一张,觉得对于lda想解决的问题和方法了解了更清楚了。之后从search', 'engine里搜到这篇文章,然后根据推荐读了一部分的gibbs', 'sampling', 'uninitiated。之后忘了怎么又搜到了mark', 'steyvers和tom', 'griffiths合著的probabilistic', 'topic', 'models,在某个周末往返北京的飞机上读完了,觉得基本上模型训练过程也明白了。再之后就是读了一下这个最简版的lda', 'gibbs', 'sampling的实现,再回过头读了一下plda的源码,基本上算是对lda有了个相对清楚的了解。', '这样前前后后,也过去了三个月,其实不少时间都是浪费掉的,比如blei的论文在没

In [501]:
# Total occurrences of each term across all articles, rarest terms first.
term_totals = article_counts_df.sum(axis=0)
word_count = term_totals.sort_values()
word_count.head(20)

高久力                1
programatically    1
diffe              1
programing         1
diferencias        1
diferenciar        1
progre             1
dies               1
dient              1
prohibitive        1
prohibitively      1
projections        1
profitable         1
projetos           1
dictation          1
dictated           1
promoted           1
promotes           1
diat               1
promotion          1
dtype: int64

In [505]:
# Count the terms that occur exactly once across all articles.
# A vectorised comparison replaces the manual loop, and the result is no longer
# stored in a variable named `sum`, which shadowed the built-in `sum()` for the
# rest of the kernel session.
n_single_occurrence = int((word_count == 1).sum())
n_single_occurrence

## On the recorded run, 6649 words occur only once across all the articles.

6649

In [506]:
# Re-rank the per-term totals with the most frequent terms first and show the top 30.
word_count = article_counts_df.sum(axis=0).sort_values(ascending=False)
word_count.head(30)

learning    2837
data        2646
one         1914
it          1904
network     1794
machine     1644
like        1623
neural      1529
time        1344
use         1283
would       1250
ai          1216
also        1178
model       1171
much        1062
deep        1061
using       1049
new          988
training     945
image        936
human        904
make         895
way          893
we           880
get          857
you          847
could        842
people       838
us           831
need         830
dtype: int64

In [564]:
# Second iteration: additionally remove "it", "could" and "would".

stop_words_new = stop_words + ["it","could","would"]

# The stop list is now applied by the vectorizer itself.
vect_new = CountVectorizer(stop_words = stop_words_new)

article_counts_new = vect_new.fit_transform(articles['split_final'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vect_new, "get_feature_names_out"):
    vocabulary_terms_new = vect_new.get_feature_names_out()
else:
    vocabulary_terms_new = vect_new.get_feature_names()

# Convert it to a dataframe with one column per remaining term.
article_counts_df_new = pd.DataFrame(article_counts_new.toarray(), columns=vocabulary_terms_new)

# Total occurrences per term, most frequent first.
word_count_new = article_counts_df_new.sum(axis=0).sort_values(ascending = False)
word_count_new[:30]

learning    2837
data        2646
one         1914
network     1794
machine     1644
like        1623
neural      1529
time        1344
use         1283
ai          1216
also        1178
model       1171
much        1062
deep        1061
using       1049
new          988
training     945
image        936
human        904
make         895
way          893
get          857
people       838
us           831
need         830
first        819
many         808
see          773
example      771
work         767
dtype: int64

In [568]:
# Sanity check on a few words that are in the stop list but still showed up in
# the first word count.
test_list = ["you","we","if"]

for word in test_list:
    if word in stop_words:
        print (word, "in stop words")

# All words in test_list are in stop_words, yet they appeared in the first
# word_count. Presumably they survived the whitespace-based filtering as part of
# tokens with attached punctuation (e.g. "you’ve") and were separated out later
# by CountVectorizer -- TODO confirm.

# Explicit membership test on the index instead of a bare `except:`, which
# would also hide unrelated errors (and even KeyboardInterrupt).
for word in test_list:
    if word not in word_count_new.index:
        print ("not in the word_count_new")


you in stop words
we in stop words
if in stop words
not in the word_count_new 
not in the word_count_new 
not in the word_count_new 


In [569]:
# Third iteration: extend the stop list with more frequent but uninformative words.

stop_words_3 = stop_words_new + ["many","us","also","people","much"]

vect_3 = CountVectorizer(stop_words = stop_words_3)

article_counts_3 = vect_3.fit_transform(articles['split_final'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vect_3, "get_feature_names_out"):
    vocabulary_terms_3 = vect_3.get_feature_names_out()
else:
    vocabulary_terms_3 = vect_3.get_feature_names()

# Convert it to a dataframe with one column per remaining term.
article_counts_df_3 = pd.DataFrame(article_counts_3.toarray(), columns=vocabulary_terms_3)

# Total occurrences per term, most frequent first.
word_count_3 = article_counts_df_3.sum(axis=0).sort_values(ascending = False)
word_count_3[:30]

learning     2837
data         2646
one          1914
network      1794
machine      1644
like         1623
neural       1529
time         1344
use          1283
ai           1216
model        1171
deep         1061
using        1049
new           988
training      945
image         936
human         904
make          895
way           893
get           857
need          830
first         819
see           773
example       771
networks      767
work          767
even          762
used          754
learn         743
different     731
dtype: int64

In [570]:
# Step 2: perform topic modeling on the data, specifying four topics to be extracted.
# Based on the top 15 words from each topic, can you differentiate the topics and
# explain how they are different?

In [571]:
from sklearn.decomposition import LatentDirichletAllocation

# Four topics, with a fixed seed so that repeated fits give the same result.
LDA = LatentDirichletAllocation(
    n_components=4,
    random_state=42,
)

# Fit on the counts from the third stopword iteration and keep the transformed output.
LDA_results = LDA.fit_transform(article_counts_3)

In [572]:
# Topic-word weights of the fitted model: one row per topic (four here) and one
# column per vocabulary term; a larger value means the term carries more weight
# in that topic.
LDA.components_

array([[ 0.25038791, 42.12387408,  0.25040504, ...,  0.25000181,
         0.25000181,  0.25329847],
       [ 0.25000144,  1.46722964,  0.25000068, ...,  0.25000869,
         0.25000869,  0.25000227],
       [ 0.28118556, 37.61910558,  0.25000018, ...,  0.25000262,
         0.25000262,  0.25121443],
       [ 2.2184251 , 70.78979069,  4.2495941 , ...,  1.24998688,
         1.24998688,  1.24548483]])

In [573]:
# Print the 15 highest-weighted words of each topic (in ascending order of weight).

# Look the vocabulary up once instead of once per printed word.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vect_3, "get_feature_names_out"):
    topic_vocabulary = vect_3.get_feature_names_out()
else:
    topic_vocabulary = vect_3.get_feature_names()

for component in LDA.components_:

    # argsort is ascending, so the last 15 positions are the largest weights.
    words_sorted = np.argsort(component)[-15:]

    print([topic_vocabulary[i] for i in words_sorted])

['layer', 'learn', 'function', 'like', 'time', 'using', 'use', 'training', 'model', 'one', 'neural', 'machine', 'network', 'data', 'learning']
['object', 'features', 'feature', 'et', 'see', 'que', 'bounding', 'boxes', 'like', 'region', 'la', 'one', 'image', 'de', 'cnn']
['use', 'way', 'world', 'even', 'technology', 'information', 'may', 'time', 'new', 'intelligence', 'one', 'like', 'data', 'human', 'ai']
['game', 'training', 'part', 'model', 'image', 'use', 'time', 'machine', 'deep', 'like', 'one', 'data', 'neural', 'network', 'learning']


In [574]:
# 0 
# Maybe articles about neural networks and machine learning, probably more theoretical because there are words like "layer".

# 1
# Maybe articles about CNN to analyze image. 

# 2
# Maybe articles about new technology about AI 

# 3 
# Maybe articles about neural networks and deep learning in games and images, probably more about applications.