#Topic Modeling Tweets

Using tweets with the hashtag #micropoetry to better understand the popularity of poetic topics in the post internet age

import stuff

In [57]:
import os
import glob
import numpy as np
import sklearn.feature_extraction.text as text
from sklearn import decomposition
import matplotlib.pyplot as plt
import operator
import pandas as pd
import re
import operator

put tweets in a list

In [58]:
filenames = sorted(glob.glob('twitter/Individual Tweets/*'))[:1000]

We're only using the first thousand tweets

In [59]:
print(len(filenames))

1000


check the first 5 tweets

In [60]:
print(filenames[:5])

['twitter/Individual Tweets/00001.txt', 'twitter/Individual Tweets/00002.txt', 'twitter/Individual Tweets/00003.txt', 'twitter/Individual Tweets/00004.txt', 'twitter/Individual Tweets/00005.txt']


Convert the collection of text documents to a matrix of token counts:

http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [61]:
vectorizer = text.CountVectorizer(input='filename', stop_words='english', min_df=20)

documentation on fit_transform and get_feature_names:

http://scikit-learn.org/stable/modules/feature_extraction.html

In [62]:
dtm = vectorizer.fit_transform(filenames).toarray()

In [63]:
vocab = np.array(vectorizer.get_feature_names())

In [64]:
print(dtm.shape)

(1000, 26)


In [65]:
print(len(vocab))

26


Set how many topics we want

In [66]:
num_topics = 10

Set how many words we want in each topic

In [67]:
num_top_words = 20

Non-Negative Matrix Factorization (NMF)

Find two non-negative matrices (W, H) whose product approximates the non- negative matrix X.

http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

In [68]:
clf = decomposition.NMF(n_components=num_topics, random_state=1)

In [69]:
doctopic = clf.fit_transform(dtm)



In [70]:
topic_words = []

In [71]:
for topic in clf.components_:
    word_idx = np.argsort(topic)[::-1][0:num_top_words]
    topic_words.append([vocab[i] for i in word_idx])

In [72]:
doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)

make an empty list

In [73]:
novel_names = []

fill the list with file names 0-1000

In [74]:
for fn in filenames:
    basename = os.path.basename(fn)
    name, ext = os.path.splitext(basename)
    novel_names.append(name)

make the list an array

In [75]:
novel_names = np.asarray(novel_names)

In [76]:
doctopic_orig = doctopic.copy()

In [77]:
num_groups = len(set(novel_names))

In [78]:
doctopic_grouped = np.zeros((num_groups, num_topics))

In [79]:
for i, name in enumerate(sorted(set(novel_names))):
    doctopic_grouped[i, :] = np.mean(doctopic[novel_names == name, :], axis=0)

In [80]:
doctopic = doctopic_grouped

In [81]:
novels = sorted(set(novel_names))

In [82]:
d = {}
for i in range(len(doctopic)):
    top_topics = np.argsort(doctopic[i,:])[::-1][0:1]
    top_topics_str = ' '.join(str(t) for t in top_topics)
    d[novels[i]] = top_topics_str
    #print("{}: {}".format(novels[i], top_topics_str))
s = sorted(d.items(), key=operator.itemgetter(0))


open up a comma-separated values sheet

In [83]:
micro_poems_csv = pd.read_csv('micro_poetry.csv', header=True, encoding='latin1')

In [84]:
len(micro_poems_csv)

21632

make a dictionary out of it

In [85]:
micro_poems_dict = micro_poems_csv.to_dict()

make a list of tweets

In [86]:
l = [value for key, value in micro_poems_dict["Tweet Text"].items()]

clean up the tweets

In [87]:
l2 = []
for tweet in l:
    tweet = tweet.replace("RT", "")
    tweet = ' '.join(re.sub("(\#[A-Za-z0-9_]+)|(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", tweet).split())
    tweet = ' '.join(re.sub("( tco[A-Za-z0-9]+)", "", tweet).split())
    tweet = tweet.replace("https", "")
    tweet = tweet.replace("http", "")
    tweet = tweet.lower()
    l2.append(tweet)

make a dictionary containing the tweets as keys and the number of occurrences as values

In [88]:
d = {}
for tweet in l2:
    if tweet not in d:
        d[tweet] = 1
    else:
        d[tweet] += 1

make a list of file names

In [89]:
l3 = [tup[0] for tup in s]

make a list of favorites tweet

In [90]:
l4 = [value for key, value in micro_poems_dict["Favorites"].items()]

make a list of top topics per tweet

In [91]:
l5 = []
for tup in s:
    l5.append(tup[1])

make a list of tuples, containing the top topic in a tweet and the number of favorites in that tweet

In [92]:
s2 = list(zip(l5, l4))

plot the favorites per topic

In [116]:
plt.scatter(*zip(*s2), color='g', s=30, marker='o', alpha=.4)
plt.xticks(np.arange(10))
#plt.yticks(np.arange(0, 1000))
plt.title('Favorites per Topic')
plt.xlabel('Topics')
plt.ylabel('Favorites')
plt.show()

Show the content of the topics

In [94]:
for t in range(len(topic_words)):
    print("Topic {}: {}".format(t, ' '.join(topic_words[t][:15])))

Topic 0: moon night stars leaves deep wind just light winter like away clouds day eyes heart
Topic 1: like morning wind stars autumn just away leaves rain words light clouds day deep eyes
Topic 2: sky clouds deep snow stars wind sun just autumn night rain old away leaves day
Topic 3: heart words just deep stars old snow winter day world light away clouds eyes leaves
Topic 4: sun winter snow clouds rain old world morning wind night away leaves eyes heart day
Topic 5: eyes away wind world leaves heart morning stars like clouds day deep just light moon
Topic 6: day night rain away autumn morning clouds leaves just world old deep sun sky snow
Topic 7: love old deep day heart just sun like moon away clouds eyes leaves light world
Topic 8: time leaves words clouds just old stars winter wind night world away day deep eyes
Topic 9: light world clouds morning words love away day deep eyes heart just leaves like moon


In [95]:
my_dpi = 227

Set the size of the plot

In [96]:
plt.figure(figsize=(32000/my_dpi, 4000/my_dpi), dpi=my_dpi)

<matplotlib.figure.Figure at 0x103f24cf8>

In [97]:
N, K = doctopic.shape
print(N, K)

1000 10


In [98]:
ind = np.arange(N)

In [99]:
width = 1

In [100]:
plots = []

In [101]:
height_cumulative = np.zeros(N)

plot the bars

In [102]:
for k in range(K):
    color = plt.cm.PiYG(k/K, 1)
    if k == 0:
        p = plt.bar(ind, doctopic[:, k], width, color=color)
    else:
        p = plt.bar(ind, doctopic[:, k], width, bottom=height_cumulative, color=color)
    height_cumulative += doctopic[:, k]
    plots.append(p)
    

In [103]:
plt.ylim((0, 1))

(0, 1)

label the y axis

In [104]:
plt.ylabel('Topics')

<matplotlib.text.Text at 0x11ad5da90>

label the x axis

In [105]:
plt.xlabel('Tweets')

<matplotlib.text.Text at 0x10ab20780>

give the plot a title

In [106]:
plt.title('Topics in tweets')

<matplotlib.text.Text at 0x11ac1afd0>

Use the filenames as tickmarks on the x axis

In [107]:
plt.xticks(ind+width/2, novel_names)

([<matplotlib.axis.XTick at 0x11b77d588>,
  <matplotlib.axis.XTick at 0x11ac357b8>,
  <matplotlib.axis.XTick at 0x1088776d8>,
  <matplotlib.axis.XTick at 0x11ac3f630>,
  <matplotlib.axis.XTick at 0x14e9d0048>,
  <matplotlib.axis.XTick at 0x14e9d0a58>,
  <matplotlib.axis.XTick at 0x14e9d44a8>,
  <matplotlib.axis.XTick at 0x14e9d4eb8>,
  <matplotlib.axis.XTick at 0x14e9d7908>,
  <matplotlib.axis.XTick at 0x14e9db358>,
  <matplotlib.axis.XTick at 0x14e9dbd68>,
  <matplotlib.axis.XTick at 0x14e9df7b8>,
  <matplotlib.axis.XTick at 0x14e9e3208>,
  <matplotlib.axis.XTick at 0x14e9e3c18>,
  <matplotlib.axis.XTick at 0x14e9e7668>,
  <matplotlib.axis.XTick at 0x14e9eb0b8>,
  <matplotlib.axis.XTick at 0x14e9ebac8>,
  <matplotlib.axis.XTick at 0x14e9ef518>,
  <matplotlib.axis.XTick at 0x14e9eff28>,
  <matplotlib.axis.XTick at 0x14e9f3978>,
  <matplotlib.axis.XTick at 0x14e9f63c8>,
  <matplotlib.axis.XTick at 0x14e9f6dd8>,
  <matplotlib.axis.XTick at 0x14e9fc828>,
  <matplotlib.axis.XTick at 0x14eb

turn the tickmarks to be vertical

In [108]:
plt.xticks(rotation=90)

(array([  5.00000000e-01,   1.50000000e+00,   2.50000000e+00,
          3.50000000e+00,   4.50000000e+00,   5.50000000e+00,
          6.50000000e+00,   7.50000000e+00,   8.50000000e+00,
          9.50000000e+00,   1.05000000e+01,   1.15000000e+01,
          1.25000000e+01,   1.35000000e+01,   1.45000000e+01,
          1.55000000e+01,   1.65000000e+01,   1.75000000e+01,
          1.85000000e+01,   1.95000000e+01,   2.05000000e+01,
          2.15000000e+01,   2.25000000e+01,   2.35000000e+01,
          2.45000000e+01,   2.55000000e+01,   2.65000000e+01,
          2.75000000e+01,   2.85000000e+01,   2.95000000e+01,
          3.05000000e+01,   3.15000000e+01,   3.25000000e+01,
          3.35000000e+01,   3.45000000e+01,   3.55000000e+01,
          3.65000000e+01,   3.75000000e+01,   3.85000000e+01,
          3.95000000e+01,   4.05000000e+01,   4.15000000e+01,
          4.25000000e+01,   4.35000000e+01,   4.45000000e+01,
          4.55000000e+01,   4.65000000e+01,   4.75000000e+01,
        

In [109]:
plt.yticks(np.arange(0, 1, 10))

([<matplotlib.axis.YTick at 0x11ac299e8>],
 <a list of 1 Text yticklabel objects>)

Make a legend containing each topic

In [110]:
topic_labels = ['Topic #{}'.format(k) for k in range(K)]

In [111]:
plt.legend([p[0] for p in plots], topic_labels)

<matplotlib.legend.Legend at 0x14f7cf780>

save the plot

In [112]:
plt.savefig('my_fig2.png', dpi=my_dpi)