# Some text processing functions in sklearn
## CountVectorizer

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Toy corpus of four short documents; the first and the last use the same words.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# Learn the vocabulary from the corpus and build the document-term count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
X

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [3]:
# List the learned vocabulary; position i in this list is column i of X.
# NOTE(review): newer scikit-learn releases replace this method with
# get_feature_names_out() — confirm against the installed version.
vectorizer.get_feature_names()

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [4]:
# Densify the sparse matrix to inspect the per-document word counts.
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [5]:
# None of these words are in the fitted vocabulary, so every count is zero.
vectorizer.transform(['Something completely new.']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [6]:
# Only 'document' is in the fitted vocabulary; it occurs twice, and the
# remaining words contribute nothing to the vector.
vectorizer.transform(['A document with repeated document word']).toarray()

array([[0, 2, 0, 0, 0, 0, 0, 0, 0]])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Pairwise cosine similarity between the rows (documents) of the count matrix.
cosine_similarity(X)

array([[1.        , 0.63245553, 0.2236068 , 1.        ],
       [0.63245553, 1.        , 0.1767767 , 0.63245553],
       [0.2236068 , 0.1767767 , 1.        , 0.2236068 ],
       [1.        , 0.63245553, 0.2236068 , 1.        ]])

## TfidfVectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: this rebinds `vectorizer`, replacing the CountVectorizer fitted earlier.
vectorizer = TfidfVectorizer()
# Fit on the same toy corpus to get TF-IDF weights instead of raw counts.
X_tfidf = vectorizer.fit_transform(corpus)
X_tfidf

<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [10]:
# Densify the sparse matrix to inspect the TF-IDF weight of each term per document.
X_tfidf.toarray()

array([[0.        , 0.43877674, 0.54197657, 0.43877674, 0.        ,
        0.        , 0.35872874, 0.        , 0.43877674],
       [0.        , 0.27230147, 0.        , 0.27230147, 0.        ,
        0.85322574, 0.22262429, 0.        , 0.27230147],
       [0.55280532, 0.        , 0.        , 0.        , 0.55280532,
        0.        , 0.28847675, 0.55280532, 0.        ],
       [0.        , 0.43877674, 0.54197657, 0.43877674, 0.        ,
        0.        , 0.35872874, 0.        , 0.43877674]])

In [11]:
# Pairwise cosine similarity between documents, this time on the TF-IDF vectors.
cosine_similarity(X_tfidf)

array([[1.        , 0.43830038, 0.1034849 , 1.        ],
       [0.43830038, 1.        , 0.06422193, 0.43830038],
       [0.1034849 , 0.06422193, 1.        , 0.1034849 ],
       [1.        , 0.43830038, 0.1034849 , 1.        ]])

## HashingVectorizer

In [12]:
from sklearn.feature_extraction.text import HashingVectorizer
# Represent each document with only 5 feature columns.
hv = HashingVectorizer(n_features=5)
# transform() is called directly, without a prior fit() on the corpus.
hv.transform(corpus).toarray()

array([[ 0.        , -0.57735027,  0.57735027, -0.57735027,  0.        ],
       [ 0.81649658,  0.        ,  0.40824829, -0.40824829,  0.        ],
       [-0.5       ,  0.5       ,  0.        , -0.5       , -0.5       ],
       [ 0.        , -0.57735027,  0.57735027, -0.57735027,  0.        ]])

In [13]:
# Pairwise cosine similarity between documents on the hashed vectors.
cosine_similarity(hv.transform(corpus))

array([[ 1.        ,  0.47140452,  0.        ,  1.        ],
       [ 0.47140452,  1.        , -0.20412415,  0.47140452],
       [ 0.        , -0.20412415,  1.        ,  0.        ],
       [ 1.        ,  0.47140452,  0.        ,  1.        ]])

# An Example of Text Clustering
Let's use the tweet data set from assignment 2 ... because we can.

In [14]:
import pandas as pd
# Load the tweets into a DataFrame from the zipped JSON file; the path is
# relative, so it is resolved against the notebook's working directory.
tweets_pd = pd.read_json("10000 tweets_clean.zip")
# Preview the first rows only.
tweets_pd.head()

Unnamed: 0,id,objectType,actor,verb,postedTime,generator,provider,link,text,object,...,twitter_entities,twitter_filter_level,twitter_lang,retweetCount,gnip,twitter_extended_entities,inReplyTo,twitter_quoted_status,location,geo
0,1.497157e+20,activity,"{'objectType': 'person', 'id': '10243188921458...",post,2016-04-01T00:00:01.000Z,"{'displayName': 'Facebook', 'link': 'http://ww...","{'objectType': 'service', 'displayName': 'Twit...",http://twitter.com/losebabyweight1/statuses/71...,CONGRATULATIONS Suzie Walker on both your beau...,"{'objectType': 'note', 'id': '1345715690143449...",...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",low,en,0,"{'matching_rules': [{'value': 'bio_location: ""...",,,,,
1,1.497157e+20,activity,"{'objectType': 'person', 'id': '1024397578801'...",post,2016-04-01T00:00:01.000Z,"{'displayName': 'Weather Display Tweet', 'link...","{'objectType': 'service', 'displayName': 'Twit...",http://twitter.com/wantirnaweather/statuses/71...,"Wantirna, VIC, AU 11:00 AM Temp 19.8°C, RH 67p...","{'objectType': 'note', 'id': '1345715690141306...",...,"{'hashtags': [{'text': 'vicweather', 'indices'...",low,en,0,"{'matching_rules': [{'value': 'bio_location: ""...",,,,,
2,1.497157e+20,activity,"{'objectType': 'person', 'id': '10243126525057...",post,2016-04-01T00:00:01.000Z,"{'displayName': 'Sprinklr', 'link': 'http://ww...","{'objectType': 'service', 'displayName': 'Twit...",http://twitter.com/sas_anz/statuses/7156901408...,Join us @ the Hilton Sydney 2 learn how 2 make...,"{'objectType': 'note', 'id': '1345715690140849...",...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",low,en,0,"{'matching_rules': [{'value': 'bio_location: ""...",{'media': [{'id': 'NumberLong(1569014044273459...,,,,
3,1.497157e+20,activity,"{'objectType': 'person', 'id': '10243295689843...",post,2016-04-01T00:00:03.000Z,"{'displayName': 'Facebook', 'link': 'http://ww...","{'objectType': 'service', 'displayName': 'Twit...",http://twitter.com/Letticlothing/statuses/7156...,Say Hello to this Gorgeous Gingham Dress! \n\n...,"{'objectType': 'note', 'id': '1345715690148764...",...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",low,en,0,"{'matching_rules': [{'value': 'bio_location: ""...",,,,,
4,1.497157e+20,activity,"{'objectType': 'person', 'id': '1024318064228'...",post,2016-04-01T00:00:00.000Z,"{'displayName': 'HubSpot', 'link': 'http://www...","{'objectType': 'service', 'displayName': 'Twit...",http://twitter.com/Intelledox/statuses/7156901...,Register for #Convergence2016 to hear@ChelleMe...,"{'objectType': 'note', 'id': '1345715690137900...",...,"{'hashtags': [{'text': 'Convergence2016', 'ind...",low,en,0,"{'matching_rules': [{'value': 'bio_location: ""...",,,,,


In [15]:
# TF-IDF representation of the tweet texts: one row per tweet, one column per term.
tweets_vectorizer = TfidfVectorizer()
vectors = tweets_vectorizer.fit_transform(tweets_pd["text"])
vectors

<10000x31628 sparse matrix of type '<class 'numpy.float64'>'
	with 139080 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.cluster import KMeans
# Cluster the TF-IDF tweet vectors into 10 groups; random_state=0 seeds the run.
kmeans = KMeans(n_clusters=10, random_state=0).fit(vectors)

In [17]:
# Cluster assignments for the first five tweets.
kmeans.predict(vectors[:5])

array([2, 5, 1, 1, 1], dtype=int32)

# Topic Modelling
Watch http://stat-graphics.org/movies/ldavis.html for an example of the use of pyLDAvis

In [18]:
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn

In [19]:
# LDA topic model that will be fitted on the tweet corpus.
lda_model = LatentDirichletAllocation(
    n_components=20,           # Number of topics
    learning_method='online',  # Faster when using large volumes of data
    random_state=0,
    n_jobs=-1,                 # Use all available CPUs
)

LatentDirichletAllocation needs to use count vectors. This is because of the nature of its algorithm. For further details, read this discussion: https://datascience.stackexchange.com/questions/21950/why-we-should-not-feed-lda-with-tfidf

In [20]:
# Raw term counts of the tweet texts (rather than TF-IDF weights) as LDA input.
count_vectorizer = CountVectorizer()
count_vectors = count_vectorizer.fit_transform(tweets_pd["text"])

# Fit the topic model and keep the per-document output of the fit.
lda_output = lda_model.fit_transform(count_vectors)

In [21]:
# Switch pyLDAvis to notebook mode so the visualisation is displayed inline.
pyLDAvis.enable_notebook()
# Build the visualisation from the fitted model, the count matrix it was fitted
# on, and the vectorizer that produced that matrix.
# NOTE(review): mds='tsne' presumably selects t-SNE for the topic layout — confirm.
pyLDAvis.sklearn.prepare(lda_model, count_vectors, count_vectorizer, mds='tsne')

Topics 1 and 2 are much larger than the rest. If you hover over them you will see that most of the words are stop words (which we have not removed). Let's remove stop words and repeat the analysis.

In [22]:
# Same count representation as before, but with English stop words removed.
count_vectorizer_nonstop = CountVectorizer(stop_words="english")
count_vectors_nonstop = count_vectorizer_nonstop.fit_transform(tweets_pd["text"])
count_vectors_nonstop

<10000x31356 sparse matrix of type '<class 'numpy.int64'>'
	with 94731 stored elements in Compressed Sparse Row format>

In [23]:
# Second LDA model, configured like the first, for the stop-word-free counts.
lda_model_nonstop = LatentDirichletAllocation(
    n_components=20,           # Number of topics
    learning_method='online',  # Faster when using large volumes of data
    random_state=0,
    n_jobs=-1,                 # Use all available CPUs
)

In [24]:
# Fit the second model on the stop-word-free counts and keep its per-document output.
lda_output_nonstop = lda_model_nonstop.fit_transform(count_vectors_nonstop)

In [28]:
# Same visualisation as before, now for the model fitted without stop words;
# the model, matrix and vectorizer passed here all belong to that second run.
pyLDAvis.sklearn.prepare(lda_model_nonstop, count_vectors_nonstop, count_vectorizer_nonstop, mds='tsne')

In [29]:
# Wrap the LDA output in a labelled frame: one row per document, one column per topic.
lda_vectors = pd.DataFrame(
    lda_output_nonstop,
    index=['Document {}'.format(doc) for doc in range(lda_output_nonstop.shape[0])],
    columns=['Topic {}'.format(topic) for topic in range(lda_output_nonstop.shape[1])],
)
lda_vectors.head()

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19
Document 0,0.089548,0.757923,0.087145,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846
Document 1,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.145238,0.811905
Document 2,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.003571,0.146429,0.003571,0.003571,0.789286,0.003571,0.003571
Document 3,0.003333,0.070847,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.869153,0.003333,0.003333,0.003333,0.003333,0.003333
Document 4,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.004167,0.09125,0.64033,0.004167,0.004167,0.004167,0.004167,0.004167,0.197586


In [30]:
# Function from https://towardsdatascience.com/topic-modeling-quora-questions-with-lda-nmf-aff8dce5e1dd
import numpy as np
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Should be the vectorizer whose output the model was fitted on, so that
        its feature names line up with the columns of ``lda_model.components_``.
    lda_model : fitted model exposing ``components_``
        ``components_`` is iterated row by row, one row of term weights per topic.
    n_words : int, default 20
        Number of terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, ordered from highest to lowest weight.

    Notes
    -----
    The defaults for ``vectorizer`` and ``lda_model`` are whatever objects are
    bound to those global names at the moment this ``def`` is executed; pass
    both arguments explicitly to avoid pairing a model with the wrong vocabulary.
    """
    # Newer scikit-learn releases expose get_feature_names_out() and drop
    # get_feature_names(); use whichever the installed version provides.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # Going through a list keeps the array dtype the same for both code paths.
    keywords = np.array(list(feature_names))

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights turns the ascending argsort into a descending one.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# lda_model was fitted on the matrix produced by count_vectorizer, so that is
# the vectorizer whose feature names match the columns of lda_model.components_.
topic_keywords = show_topics(vectorizer=count_vectorizer, lda_model=lda_model, n_words=20)

# One row per topic and one column per keyword rank; the frame is transposed
# for display so that each topic reads top-down.
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word {}'.format(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic {}'.format(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords.T

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19
Word 0,youtube,https,was,april,then,city,out,10,business,rt,rt,https,rt,say,down,life,to,auspol,live,thank
Word 1,any,co,australia,fools,very,fight,as,predicts,voting,de,always,co,daddy,middle,man,had,the,support,riot,rebuild
Word 2,video,rt,at,friends,htt,continues,of,some,own,constitutional,que,rt,fashion,looking,love,next,you,launch,black,sensanders
Word 3,long,the,off,may,guys,من,back,fun,congrats,important,set,the,youth,sydney,cute,shirt,rt,aussie,follow,crumbling
Word 4,liked,to,got,el,which,running,riot,11,data,link,eu,of,exposed,friday,yesterday,game,it,gold,ht,empower
Word 5,from,in,week,thinking,isn,por,the,fucking,party,cast,na,by,없을,around,true,fam,is,weather,hear,every
Word 6,baby,for,night,rest,michael,في,lol,since,tornado,si,para,activist,뒤엎어서,play,piece,show,that,ass,american,musician
Word 7,ago,of,last,facebook,hair,14,check,between,al,click,three,who,것이다,join,lose,open,and,nswpol,bless,la
Word 8,watching,and,15,kind,under,blue,by,km,listings,auspol,demonstration,clinton,탑골공원의,hot,feels,ready,we,qldpol,full,starts
Word 9,13,on,threat,sale,nice,shooter,is,wind,guess,thecw,the100,in,할배를,50,so,where,are,australian,via,hahaha
