# Some text processing functions in sklearn
## CountVectorizer

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Four-sentence toy corpus reused by the vectorizer examples below.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# Learn the vocabulary and build the document-term count matrix in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
X

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [3]:
# Vocabulary learned from the corpus, in column order of the count matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. `.tolist()` keeps the plain
# Python list display instead of a NumPy array repr.
vectorizer.get_feature_names_out().tolist()



['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [4]:
# Dense view of the sparse count matrix: one row per document,
# one column per vocabulary term.
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [5]:
# None of these words are in the fitted vocabulary, so the sentence
# maps to an all-zero vector.
vectorizer.transform(['Something completely new.']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [6]:
# Only "document" belongs to the fitted vocabulary; it appears twice,
# hence the single count of 2.
vectorizer.transform(['A document with repeated document word']).toarray()

array([[0, 2, 0, 0, 0, 0, 0, 0, 0]])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Pairwise cosine similarity between the rows (documents) of the count matrix.
cosine_similarity(X)

array([[1.        , 0.63245553, 0.2236068 , 1.        ],
       [0.63245553, 1.        , 0.1767767 , 0.63245553],
       [0.2236068 , 0.1767767 , 1.        , 0.2236068 ],
       [1.        , 0.63245553, 0.2236068 , 1.        ]])

## TfidfVectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: this rebinds the global name `vectorizer`; from here on it refers to
# the TF-IDF vectorizer, no longer to the CountVectorizer fitted earlier.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(corpus)
X_tfidf

<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [10]:
# Dense view of the TF-IDF weights (documents x vocabulary terms).
X_tfidf.toarray()

array([[0.        , 0.43877674, 0.54197657, 0.43877674, 0.        ,
        0.        , 0.35872874, 0.        , 0.43877674],
       [0.        , 0.27230147, 0.        , 0.27230147, 0.        ,
        0.85322574, 0.22262429, 0.        , 0.27230147],
       [0.55280532, 0.        , 0.        , 0.        , 0.55280532,
        0.        , 0.28847675, 0.55280532, 0.        ],
       [0.        , 0.43877674, 0.54197657, 0.43877674, 0.        ,
        0.        , 0.35872874, 0.        , 0.43877674]])

In [11]:
# Pairwise cosine similarity between documents, now on TF-IDF weights.
cosine_similarity(X_tfidf)

array([[1.        , 0.43830038, 0.1034849 , 1.        ],
       [0.43830038, 1.        , 0.06422193, 0.43830038],
       [0.1034849 , 0.06422193, 1.        , 0.1034849 ],
       [1.        , 0.43830038, 0.1034849 , 1.        ]])

## HashingVectorizer

In [12]:
from sklearn.feature_extraction.text import HashingVectorizer
# No fit step is used here: transform() is called directly on the corpus.
# n_features=5 limits the output to 5 hashed columns, so different words
# can end up sharing a column.
# NOTE(review): the negative entries and unit-length rows in the output
# presumably come from the vectorizer's default sign/normalisation settings
# -- confirm against the scikit-learn documentation.
hv = HashingVectorizer(n_features=5)
hv.transform(corpus).toarray()

array([[ 0.        , -0.57735027,  0.57735027, -0.57735027,  0.        ],
       [ 0.81649658,  0.        ,  0.40824829, -0.40824829,  0.        ],
       [-0.5       ,  0.5       ,  0.        , -0.5       , -0.5       ],
       [ 0.        , -0.57735027,  0.57735027, -0.57735027,  0.        ]])

In [13]:
# Pairwise cosine similarity on the hashed vectors; values can be negative
# because the hashed vectors contain negative components.
cosine_similarity(hv.transform(corpus))

array([[ 1.        ,  0.47140452,  0.        ,  1.        ],
       [ 0.47140452,  1.        , -0.20412415,  0.47140452],
       [ 0.        , -0.20412415,  1.        ,  0.        ],
       [ 1.        ,  0.47140452,  0.        ,  1.        ]])

# An Example of Text Clustering
Let's use the tweet data set from assignment 2 ... because we can.

In [14]:
import pandas as pd
# Load the tweets from the zipped JSON file (path is relative to the
# notebook's working directory) and preview the first rows.
tweets_pd = pd.read_json("10000 tweets_clean.zip")
tweets_pd.head()

Unnamed: 0,id,objectType,actor,verb,postedTime,generator,provider,link,text,object,...,twitter_entities,twitter_filter_level,twitter_lang,retweetCount,gnip,twitter_extended_entities,inReplyTo,twitter_quoted_status,location,geo
0,1.497157e+20,activity,"{'objectType': 'person', 'id': '10243188921458...",post,2016-04-01T00:00:01.000Z,"{'displayName': 'Facebook', 'link': 'http://ww...","{'objectType': 'service', 'displayName': 'Twit...",http://twitter.com/losebabyweight1/statuses/71...,CONGRATULATIONS Suzie Walker on both your beau...,"{'objectType': 'note', 'id': '1345715690143449...",...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",low,en,0,"{'matching_rules': [{'value': 'bio_location: ""...",,,,,
1,1.497157e+20,activity,"{'objectType': 'person', 'id': '1024397578801'...",post,2016-04-01T00:00:01.000Z,"{'displayName': 'Weather Display Tweet', 'link...","{'objectType': 'service', 'displayName': 'Twit...",http://twitter.com/wantirnaweather/statuses/71...,"Wantirna, VIC, AU 11:00 AM Temp 19.8°C, RH 67p...","{'objectType': 'note', 'id': '1345715690141306...",...,"{'hashtags': [{'text': 'vicweather', 'indices'...",low,en,0,"{'matching_rules': [{'value': 'bio_location: ""...",,,,,
2,1.497157e+20,activity,"{'objectType': 'person', 'id': '10243126525057...",post,2016-04-01T00:00:01.000Z,"{'displayName': 'Sprinklr', 'link': 'http://ww...","{'objectType': 'service', 'displayName': 'Twit...",http://twitter.com/sas_anz/statuses/7156901408...,Join us @ the Hilton Sydney 2 learn how 2 make...,"{'objectType': 'note', 'id': '1345715690140849...",...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",low,en,0,"{'matching_rules': [{'value': 'bio_location: ""...",{'media': [{'id': 'NumberLong(1569014044273459...,,,,
3,1.497157e+20,activity,"{'objectType': 'person', 'id': '10243295689843...",post,2016-04-01T00:00:03.000Z,"{'displayName': 'Facebook', 'link': 'http://ww...","{'objectType': 'service', 'displayName': 'Twit...",http://twitter.com/Letticlothing/statuses/7156...,Say Hello to this Gorgeous Gingham Dress! \n\n...,"{'objectType': 'note', 'id': '1345715690148764...",...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",low,en,0,"{'matching_rules': [{'value': 'bio_location: ""...",,,,,
4,1.497157e+20,activity,"{'objectType': 'person', 'id': '1024318064228'...",post,2016-04-01T00:00:00.000Z,"{'displayName': 'HubSpot', 'link': 'http://www...","{'objectType': 'service', 'displayName': 'Twit...",http://twitter.com/Intelledox/statuses/7156901...,Register for #Convergence2016 to hear@ChelleMe...,"{'objectType': 'note', 'id': '1345715690137900...",...,"{'hashtags': [{'text': 'Convergence2016', 'ind...",low,en,0,"{'matching_rules': [{'value': 'bio_location: ""...",,,,,


In [15]:
# TF-IDF representation of the text of every tweet.
tweets_vectorizer = TfidfVectorizer()
vectors = tweets_vectorizer.fit_transform(tweets_pd["text"])
vectors

<10000x31628 sparse matrix of type '<class 'numpy.float64'>'
	with 139080 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.cluster import KMeans
# Group the TF-IDF vectors into 10 clusters; the fixed seed keeps the
# clustering repeatable between runs.
kmeans = (
    KMeans(n_clusters=10, random_state=0)
    .fit(vectors)
)

In [17]:
# Cluster labels assigned to the first five tweets.
kmeans.predict(vectors[:5])

array([4, 1, 5, 5, 5], dtype=int32)

# Topic Modelling
Watch the videos at [https://pyldavis.readthedocs.io/en/latest/readme.html](https://pyldavis.readthedocs.io/en/latest/readme.html) for examples of the use of pyLDAvis

In [18]:
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn

In [19]:
# LDA configuration:
#   n_components=20          -> number of topics
#   learning_method='online' -> faster when using large volumes of data
#   n_jobs=-1                -> use all available CPUs
lda_model = LatentDirichletAllocation(
    n_components=20,
    learning_method='online',
    random_state=0,
    n_jobs=-1,
)

LatentDirichletAllocation needs to use count vectors. This is because of the nature of the algorithm. For further details, read this discussion: https://datascience.stackexchange.com/questions/21950/why-we-should-not-feed-lda-with-tfidf

In [20]:
# Raw term counts per tweet (stop words kept), then fit the topic model on
# them and keep the resulting document-topic matrix.
count_vectorizer = CountVectorizer()
count_vectors = count_vectorizer.fit_transform(tweets_pd["text"])
lda_output = lda_model.fit_transform(count_vectors)

In [21]:
# Enable inline rendering of pyLDAvis output, then build the interactive
# view for the model fitted on the raw counts.
# NOTE(review): the `pyLDAvis.sklearn` module name is tied to older pyLDAvis
# releases (newer ones appear to expose it as `pyLDAvis.lda_model`) --
# confirm against the installed version.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_model, count_vectors, count_vectorizer, mds='tsne')

  default_term_info = default_term_info.sort_values(


Topics 1 and 2 are much larger than the rest. If you hover over them you will see that most of the words are stop words (which we have not removed). Let's remove stop words and repeat the analysis.

In [22]:
# Same counting step as before, but dropping the built-in English stop words.
count_vectorizer_nonstop = CountVectorizer(stop_words='english')
count_vectors_nonstop = count_vectorizer_nonstop.fit_transform(tweets_pd["text"])
count_vectors_nonstop

<10000x31356 sparse matrix of type '<class 'numpy.int64'>'
	with 94731 stored elements in Compressed Sparse Row format>

In [23]:
# Second LDA model, configured identically to the first one:
#   n_components=20          -> number of topics
#   learning_method='online' -> faster when using large volumes of data
#   n_jobs=-1                -> use all available CPUs
lda_model_nonstop = LatentDirichletAllocation(
    n_components=20,
    learning_method='online',
    random_state=0,
    n_jobs=-1,
)

In [24]:
# Fit the second model on the stop-word-free counts and keep the
# document-topic matrix it returns.
lda_output_nonstop = lda_model_nonstop.fit_transform(count_vectors_nonstop)

In [25]:
# Interactive topic view for the model trained without stop words.
# Relies on pyLDAvis.enable_notebook() having been called in an earlier cell.
pyLDAvis.sklearn.prepare(lda_model_nonstop, count_vectors_nonstop, count_vectorizer_nonstop, mds='tsne')

  default_term_info = default_term_info.sort_values(


The following code displays the document vectors. The resulting pandas table shows the topics as columns and the documents as rows. Each row shows the distribution of topic probabilities for a document, so the numbers in each row should sum to 1.

For example, we can see that the most prominent topic of document 0 is topic 1: about 84.48% of the words from document 0 would belong to topic 1.

In [26]:
# Label the document-topic matrix: rows are documents, columns are topics.
lda_vectors = pd.DataFrame(
    lda_output_nonstop,
    index=[f'Document {row}' for row in range(lda_output_nonstop.shape[0])],
    columns=[f'Topic {col}' for col in range(lda_output_nonstop.shape[1])],
)
lda_vectors.head()

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19
Document 0,0.003846,0.844783,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.085986,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846
Document 1,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.05,0.002381,0.002381,0.002381,0.002381,0.05,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.859524
Document 2,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.217857,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.717857,0.003571,0.003571
Document 3,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.071851,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.868149,0.003333,0.003333,0.003333,0.003333,0.003333
Document 4,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.092793,0.004167,0.096183,0.633519,0.110839,0.004167,0.004167,0.004167,0.004167,0.004167


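The following code displays the most likely words for each topic. The resulting pandas table lists the topics as columns, and the contents of each column are words sorted by probability. Note that the call below passes `lda_model`, i.e. the first model, which was trained without removing stop words; this is why many of the top words are stop words or URL fragments. Thus, the top 3 words of topic 1 are "https", "co", and "rt".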

In [27]:
# Function from https://towardsdatascience.com/topic-modeling-quora-questions-with-lda-nmf-aff8dce5e1dd
import numpy as np
def show_topics(vectorizer=None, lda_model=None, n_words=20):
    """Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted text vectorizer
        Must be the vectorizer (or one with an identical vocabulary) whose
        output the topic model was trained on. It has to expose
        ``get_feature_names_out()`` or, on older scikit-learn releases,
        ``get_feature_names()``.
    lda_model : fitted topic model
        Any object with a ``components_`` array of shape
        ``(n_topics, n_vocabulary_terms)``.
    n_words : int, default 20
        Number of terms to return per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per topic, holding that topic's terms ordered from the
        largest weight to the smallest.

    Raises
    ------
    ValueError
        If ``vectorizer`` or ``lda_model`` is missing, or if the vocabulary
        size does not match the number of columns of ``components_``.

    Notes
    -----
    The previous defaults (``vectorizer=vectorizer, lda_model=lda_model``)
    were evaluated once, when the function was defined, and therefore froze
    whatever unrelated objects those globals pointed to at that moment.
    Both arguments must now be passed explicitly.
    """
    if vectorizer is None or lda_model is None:
        raise ValueError("show_topics() needs both a fitted vectorizer and a fitted lda_model")

    # `get_feature_names()` was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only when it does not exist.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    keywords = np.asarray(get_names())

    components = np.asarray(lda_model.components_)
    # A vocabulary of a different size means the vectorizer and the model do
    # not belong together; the looked-up words would be meaningless.
    if components.ndim != 2 or components.shape[1] != keywords.shape[0]:
        raise ValueError(
            "vectorizer vocabulary size (%d) does not match lda_model.components_ shape %r"
            % (keywords.shape[0], components.shape)
        )

    topic_keywords = []
    for topic_weights in components:
        # Negating the weights makes argsort yield descending order.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Pair the topic model with the vectorizer it was actually trained with:
# `lda_model` was fitted on `count_vectors`, which came from `count_vectorizer`
# (not from the TF-IDF `tweets_vectorizer`).
topic_keywords = show_topics(vectorizer=count_vectorizer, lda_model=lda_model, n_words=20)

# One row per topic, one column per rank ...
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
# ... then transpose so that topics are columns and rows are ranked words.
df_topic_keywords.T



Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19
Word 0,like,https,about,the,to,april,https,10,the,know,game,not,workers,time,rt,congrats,rt,que,gt,rebuild
Word 1,any,co,was,we,you,fools,co,lol,that,turnbull,put,by,exercise,make,de,welcome,launch,eu,after,every
Word 2,long,rt,australia,of,it,week,to,did,has,call,already,strike,shirt,or,thank,true,rally,aldubyayawho,riot,musician
Word 3,lt,the,constitutional,in,are,years,what,fun,have,malcolm,20,going,rt,say,no,talk,bigot,para,2016,la
Word 4,aprilfools,to,left,and,me,support,rt,11,for,free,ask,tropes,brown,stop,then,24,germany,más,threat,starts
Word 5,thanks,in,study,to,rt,until,out,since,be,will,13,bury,pro,sydney,en,bird,dr,sa,follow,three
Word 6,though,is,nice,rt,so,auspol,of,30,of,liberals,adelaide,doesn,link,friday,the100,digital,resist,former,predicts,yo
Word 7,ago,of,one,at,if,maybe,activist,bless,he,damn,once,fuck,by,us,من,lots,만들어버리는,mo,convention,daddy
Word 8,watching,for,round,their,the,feeling,you,data,in,2014,joins,beautiful,almost,around,son,assets,악의를,asx,looking,yours
Word 9,nsw,on,event,see,that,vs,the,km,first,coles,other,photo,click,play,secret,st,무효로,dar,many,bar
