In [1]:
from __future__ import division, print_function, absolute_import
from past.builtins import basestring

import os
import gzip

import pandas as pd

from twip.constant import DATA_PATH

from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [114]:
import matplotlib
from IPython.display import display, HTML 
%matplotlib inline
np = pd.np
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
pd.set_option('precision', 2)
%precision 4
%pprint

Pretty printing has been turned ON


In [41]:
from collections import OrderedDict
from gensim.models import LsiModel

Load cleaned tweet data  
Don't forget to fix up the tokens!  
Can you think of a better way to save a list of lists of strings?
What about the raw, unprocessed unicode tweet text itself?

In [9]:
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    corpus = pd.DataFrame.from_csv(f, encoding='utf8')
corpus['tokens'] = corpus.txt.str.split()
vocab = Dictionary.from_documents(corpus.tokens)
corpus.tokens

87        [python, never, stop, learning, what, you, enj...
88                              [Watching, Boa, vs, Python]
90          [Monty, Python, The, silly, walk, via, YouTube]
                                ...                        
193375    [RT, RealPython, List, of, Python, API, Wrappe...
193376                          [Watching, Boa, vs, Python]
193377              [IT, Digital, Go, Senior, Python, Djan]
Name: tokens, dtype: object

Now load previously compiled vocabulary and TFIDF matrix (transformation)

In [6]:
tfidf = TfidfModel.load(os.path.join(DATA_PATH, 'tfidf'))
tfidf.num_docs

183070

In [11]:
bows = pd.Series(vocab.doc2bow(toks) for toks in corpus.tokens)
bows

0         [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...
1                        [(8, 1), (9, 1), (10, 1), (11, 1)]
2         [(8, 1), (12, 1), (13, 1), (14, 1), (15, 1), (...
                                ...                        
183067    [(2, 1), (8, 1), (53, 1), (236, 1), (298, 2), ...
183068                   [(8, 1), (9, 1), (10, 1), (11, 1)]
183069    [(8, 1), (25, 1), (652, 1), (1669, 1), (13166,...
dtype: object

This would make a nice, compact sparse matrix representation of our entire corpus...  
Which would mean we could do more in RAM at once.  
Left as an exercise.  (check out `scipy.sparse.coo_matrix`)  

In [12]:
tfidf[bows[0]]

[(0, 0.5289848018821018),
 (1, 0.31507262736732733),
 (2, 0.08241061852176369),
 (3, 0.40331646781409985),
 (4, 0.4246263948093858),
 (5, 0.3819328565882172),
 (6, 0.29103061240046285),
 (7, 0.20141880781540905)]

In [14]:
dict([(vocab[i], freq) for i, freq in tfidf[bows[0]]])

{u'doing': 0.3819328565882172,
 u'enjoy': 0.5289848018821018,
 u'learning': 0.29103061240046285,
 u'never': 0.40331646781409985,
 u'python': 0.08241061852176369,
 u'stop': 0.4246263948093858,
 u'what': 0.31507262736732733,
 u'you': 0.20141880781540905}

Notice how "you" didn't get as much weight as "enjoy"  
Let's look at some other tweets  

This is starting to look a lot like a set of vectors that we could use as features  
But wait, if I used the IDs as the vector index (column) numbers, how many features or "columns" would I have?

In [15]:
len(vocab)

86901

100k dimensions isn't a good idea  
Even for a masively parallel deep learning project this would be big  
Like the cat/dog picture classification on 256x256 images  
What about PCA (Principal Component Analysis) like is used on images?  
In NLP PCA is called LSI (Latent Semantic Analysis)  
That sounds cool!  
I want me some latent semantics (hidden meaning)  

In [24]:
lsi = LsiModel(bows, num_topics=100, id2word=vocab, extra_samples=100, power_iters=2)
lsi

<gensim.models.lsimodel.LsiModel at 0x7f4fec519310>

## Wow!  
What happened to the **GIL**?  
What's that sound I hear?  
That's the sound of your fans blowing *hot air* out of those tweets!  
(check out your system monitor or `htop`)  
Can Python do that?  
With `numpy` and `gensim` you can.  

In [37]:
tweetids = pd.Series(range(6), name='tweet')
topicids = pd.Series(range(lsi.num_topics), name='topic')
pd.DataFrame([pd.Series([x[1] for x in lsi[bows[i]]], index=topicids,
                        name='tweet') for i in tweetids],
             index=tweetids)


topic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
0,0.280139,0.645104,-0.156941,-0.328108,0.015252,-0.01448,0.48422,-0.153653,0.240989,0.06208,-0.118877,-0.094675,-0.135543,0.12209,0.193506,0.003097,0.066766,-0.026772,0.2496,0.693237,0.420713,0.104192,0.349105,-0.056677,0.103683,-0.033816,-0.015701,0.012509,-0.085577,-0.128758,-0.137254,-0.002252,0.092052,0.018715,-0.113647,-0.124823,0.009918,0.004717,0.115847,0.025692,0.013284,-0.053,-0.047627,0.049995,0.058959,-0.02688,0.037795,0.009598,-0.018883,-0.058684,-0.096751,-0.018933,-0.047766,-0.007974,-0.050381,0.060431,0.090212,-0.064182,-0.049951,-0.014272,0.105264,0.008199,0.044244,0.054087,0.009008,0.03173,-0.005846,0.029153,-0.070785,-0.030846,-0.06091,-0.050586,-0.005268,-0.03016,-0.007984,0.034473,0.044934,-0.065762,-0.024994,-0.055869,-0.008236,-0.030237,-0.062698,0.080212,0.057435,-0.119406,0.030398,-0.083193,-0.06403,-0.052019,-0.123995,-0.126556,-0.043337,0.14079,0.288244,0.140655,-0.111475,-0.059658,0.170123,0.005351
1,0.663569,-0.580618,0.14851,0.057251,-0.036703,0.000227,0.173303,-0.126898,0.056051,-0.015301,-0.250163,-0.067947,-0.171251,0.010556,0.061445,-0.080825,-0.082699,0.006482,-0.091692,0.033222,0.072212,0.033618,0.046171,-0.024242,-0.082662,-0.012842,0.027237,-0.022736,0.036627,-0.000471,0.034782,0.054313,-0.012812,-0.015476,0.033136,-0.047485,-0.018584,-0.016457,0.010772,-0.002369,-0.009152,-0.034558,0.03839,0.085857,0.028539,-0.01529,0.04665,-0.014755,-0.030558,-0.024755,-0.020742,-0.00432,-0.052452,0.015888,0.017784,0.074077,0.023654,-0.028288,-0.004443,0.059616,0.003974,-0.040173,-0.069966,0.02271,0.051112,-0.057466,0.027915,-0.021728,-0.020873,-0.008914,-0.037338,0.047209,0.079282,-0.009219,-0.002034,0.010092,0.087747,-0.085667,0.014027,0.005552,0.008376,0.043964,0.101184,-0.052996,-0.052001,0.084054,-0.024419,0.043602,0.045963,0.03798,-0.002213,0.01054,-0.043774,0.05455,0.064821,0.07164,0.024205,0.031249,0.043401,0.001182
2,0.727224,-0.587829,0.145266,0.082313,-0.056999,-0.010716,0.201449,-0.158037,0.060859,-0.023182,-0.337693,-0.065579,-0.240846,0.005329,0.114057,-0.036656,-0.088168,-0.068094,-0.166463,-0.051538,0.066112,0.053491,0.043295,-0.243268,-0.088203,-0.026662,-0.029493,-0.009399,-0.179283,-0.379431,-0.014169,-0.229452,0.09574,0.345982,0.188485,0.453206,0.057274,-0.006364,0.157342,-0.035726,-0.09039,-0.015862,-0.02164,0.37567,-0.129401,0.136548,0.285673,-0.55784,0.549952,0.394829,-0.048429,0.049683,-0.111485,-0.083458,0.115329,-0.0891,0.032392,-0.102044,-0.223943,-0.02926,-0.182369,-0.069372,0.24468,-0.015348,-0.015912,-0.164996,-0.04113,0.108198,0.121972,-0.106334,-0.005794,0.036762,0.145916,-0.057413,-0.033213,0.003218,0.129597,0.057509,0.049163,0.000206,-0.002933,0.003491,0.04324,-0.030267,-0.00744,0.034838,-0.012017,-0.017225,0.059216,0.041939,0.067635,0.046866,0.033133,-0.011971,-0.065532,0.053997,0.086549,0.029556,-0.004894,-0.053187
3,0.875457,-1.042743,0.488332,-1.032599,-0.028926,-0.088792,0.36048,-0.505431,0.807128,-0.038106,1.071811,0.706922,0.223021,-0.100974,-0.10547,0.183228,0.086609,-0.028132,-0.282898,0.052487,0.002725,-0.09269,0.117259,0.188275,0.132318,0.056519,0.207372,-0.558728,0.030658,-0.131165,0.026875,-0.09465,0.089053,-0.173702,0.007949,0.083884,-0.013705,-0.002258,0.080123,0.107888,0.013475,0.103968,0.201596,0.087961,0.230374,-0.184477,0.00859,-0.005375,0.162359,0.074545,0.230756,-0.406059,0.081852,-0.163109,-0.034361,-0.110405,-0.194789,-0.016208,-0.012769,-0.2528,-0.197962,-0.112005,-0.144892,-0.229302,0.665528,0.002939,0.314609,0.263759,-0.645102,0.078154,0.459212,-0.066045,0.020002,-0.119575,-0.047362,-0.214908,-0.219793,-0.141442,-0.126087,-0.013056,-0.078938,-0.062039,0.058489,0.189062,-0.164434,0.083706,-0.147333,0.046674,0.064553,0.017572,0.008736,-0.016765,0.090212,-0.108353,-0.129246,-0.06609,0.01416,-0.048477,0.004659,-0.002276
4,1.085393,-0.961083,0.553632,-1.654039,0.092107,-0.092963,-0.031262,-0.076556,0.322449,-0.02031,0.780237,0.53243,0.241439,-0.180711,-0.106484,0.258467,0.175888,-0.165265,-0.301821,0.07129,0.03896,-0.028647,-0.012477,0.063274,-0.067628,0.026229,0.01338,0.177704,-0.021402,0.04483,-0.146017,0.127523,0.704273,-0.583477,0.139959,0.462977,-0.129922,0.114878,-0.048716,0.050534,0.029932,-0.029896,-0.314825,0.277107,-0.150986,0.024417,0.046659,0.019545,-0.129028,-0.030375,-0.099479,-0.409103,-0.073012,-0.061145,-0.095816,-0.003332,-0.011432,-0.012317,0.060218,0.004766,-0.214938,-0.173836,-0.013394,-0.175989,0.183724,0.041579,0.210948,0.098331,-0.464815,0.199904,0.536529,0.086921,0.071117,-0.37608,-0.037405,0.02359,-0.011132,0.271936,-0.034991,0.077127,0.095204,0.00891,-0.379068,-0.140698,0.458188,-0.044234,0.210989,0.059815,0.138436,-0.14741,0.127216,0.289047,-0.296151,-0.042381,0.151787,0.168267,0.018124,0.042086,0.129577,-0.072049
5,0.656938,-0.574603,0.146813,0.053926,-0.033427,-0.000832,0.16778,-0.122052,0.057246,-0.017876,-0.240419,-0.066266,-0.16245,0.009803,0.059451,-0.074132,-0.077193,0.009668,-0.083206,0.03222,0.069265,0.031161,0.043594,-0.010198,-0.079931,-0.008934,0.029677,-0.016819,0.029361,-0.011774,0.037521,0.053045,-0.011762,-0.018392,0.038105,-0.043733,-0.028886,-0.002504,-0.001939,0.003034,-0.001858,-0.021138,0.020251,0.052533,0.026948,-0.038235,0.020695,0.005921,-0.065322,-0.027712,-0.012881,-0.006024,-0.033431,0.024268,-0.023939,0.007773,0.012667,-0.007486,0.010007,-0.003921,0.016073,0.007297,0.004589,-0.007618,0.00196,0.005905,0.01612,-0.005583,-0.003529,-0.016204,-0.010102,0.000648,-0.022789,0.006774,0.004878,0.005208,0.014923,-0.013383,-0.005554,0.007377,-0.008319,0.003495,-0.007747,-0.019106,0.005719,-0.00806,0.004394,-0.020992,-0.004777,-0.008545,-0.006207,0.002229,-0.001683,3e-06,0.021443,-0.000575,-0.001628,-0.009661,-0.003669,-0.008736


In [38]:
lsi2 = LsiModel(bows, num_topics=2, id2word=vocab, extra_samples=100, power_iters=2)
lsi2

<gensim.models.lsimodel.LsiModel at 0x7f4fc77d8d50>

In [39]:
lsi.save(os.path.join(DATA_PATH, 'lsi100'))
lsi2.save(os.path.join(DATA_PATH, 'lsi2'))

## This will take lots of RAM!
(and lots of CPU time)

In [45]:
tweetids = pd.Series(range(len(bows)), name='tweet')
topicids = pd.Series(range(lsi.num_topics), name='topic')
# `dict()` keeps track of the columns for each topic, in case the lsi model shuffles or skips topics for odd tweets
df = pd.DataFrame([pd.Series(dict(lsi[bows[i]]), name='tweet') for i in tweetids],
                  columns=topicids,
                  index=tweetids)
df

topic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
0,0.280144,0.645105,-0.156975,-0.328116,0.015318,-0.014497,0.484102,-0.153773,0.241131,0.062283,-0.118761,-0.094960,-0.135531,0.122059,0.193513,0.003106,0.066545,-0.026250,0.248931,0.693017,0.421668,0.102264,0.350137,-0.056190,0.103494,-0.033260,-0.015738,0.011657,-0.084498,-0.128975,-0.136362,0.000170,0.093105,0.018023,-0.108551,0.129264,0.009799,0.004326,0.114468,0.023980,0.008729,-0.056863,-0.035483,0.055620,0.062181,-0.026902,0.037192,0.000747,-0.022275,-0.055226,-0.102552,-0.018423,-0.056337,-0.009531,-0.049445,0.065178,0.091166,-0.064567,-0.064973,-0.018463,0.094897,-0.008599,0.019477,0.062401,0.013377,0.041026,-0.008740,0.044299,-0.029556,-0.049713,-0.030349,-0.068517,-0.027192,-0.006531,0.019705,-0.065000,0.030005,0.007625,-0.080601,-0.002059,-0.032288,-0.049513,0.074121,0.066816,-0.014540,-0.132510,0.033808,-0.139341,-0.052048,0.065339,0.043482,0.106395,0.047561,-0.161982,0.197318,0.053055,-0.040181,-0.083893,0.146351,0.010187
1,0.663570,-0.580630,0.148501,0.057253,-0.036710,0.000174,0.173357,-0.126965,0.056006,-0.015262,-0.249953,-0.068057,-0.171501,0.010628,0.061573,-0.080714,-0.082887,0.006699,-0.091624,0.032873,0.072354,0.033410,0.045845,-0.024214,-0.082310,-0.013089,0.026983,-0.022289,0.036957,0.000029,0.036040,0.053721,-0.012822,-0.013610,0.033954,0.046664,-0.017004,-0.015192,0.008758,-0.001443,-0.015368,-0.035635,0.052592,0.078674,0.030055,-0.015463,0.043441,-0.017615,-0.032483,-0.019155,-0.020012,-0.008983,-0.050586,0.021321,0.018497,0.082568,0.015520,-0.026224,-0.001431,0.057947,0.001347,-0.050355,-0.078762,0.002389,0.050469,-0.049463,0.036650,-0.007728,-0.036675,0.000430,-0.060819,0.029158,0.069703,0.013437,-0.021007,-0.034202,0.085163,-0.024758,-0.081305,-0.008410,0.029201,0.018218,-0.111842,-0.019104,-0.019719,0.068597,0.000565,0.056609,0.009593,-0.007164,0.014913,-0.006359,0.007167,-0.033844,0.096978,-0.038695,0.028121,0.044775,0.038816,0.000530
2,0.727227,-0.587833,0.145264,0.082303,-0.057071,-0.010749,0.201366,-0.158067,0.060876,-0.023129,-0.337637,-0.065547,-0.240782,0.005638,0.113883,-0.036672,-0.088166,-0.067750,-0.166287,-0.051547,0.066189,0.053383,0.043731,-0.243602,-0.088445,-0.026679,-0.029388,-0.010079,-0.177108,-0.382501,-0.013150,-0.228991,0.098863,0.344843,0.171488,-0.457709,0.051729,-0.004786,0.151546,-0.021764,-0.109962,-0.030967,0.023615,0.374311,-0.139940,0.186587,0.204501,-0.581888,0.575046,0.343630,-0.033065,0.108308,-0.122216,-0.058618,0.120870,-0.079709,0.035515,-0.084781,-0.200432,0.005304,-0.196750,-0.048127,0.259984,0.043014,0.005473,-0.170177,-0.056623,-0.002382,0.112717,-0.120081,-0.071637,0.023795,0.164140,-0.028127,-0.015133,-0.010630,0.130387,-0.002740,0.057415,0.020022,-0.016896,0.014753,-0.082077,0.004906,0.019808,0.028929,-0.043757,0.056939,0.053688,-0.051604,-0.021736,-0.044406,-0.006870,0.022344,-0.012910,-0.047105,0.098418,0.055315,0.002358,-0.031146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183067,1.428605,0.121010,-0.840434,-0.152054,-0.252031,-0.156024,0.272005,-0.583447,0.480997,-0.183614,-1.242581,0.630944,1.153471,0.494766,-0.622896,0.138267,0.126339,0.191016,-0.049388,0.051271,-0.015057,0.224716,0.205355,-0.031782,-0.083087,-0.182527,0.089462,0.036077,0.058125,-0.065749,0.079825,0.147006,0.015451,0.064574,0.043633,0.103609,-0.042706,-0.025172,0.064304,0.095637,-0.086747,0.059488,0.003819,0.157804,0.052368,0.017636,0.088845,0.037923,-0.091868,-0.053689,0.001005,0.011375,0.033408,0.013618,-0.080671,0.041062,0.033183,-0.074557,0.009182,-0.054656,-0.022227,-0.014109,0.013096,-0.016909,0.049667,-0.030288,0.053564,-0.019326,0.064831,0.065503,-0.046783,0.034994,-0.045599,0.009562,0.052121,0.038168,0.046028,0.031966,-0.059828,-0.078121,0.002367,0.001220,0.081065,-0.041551,0.005396,0.032510,0.072496,0.024512,0.023465,0.021734,-0.003794,-0.041177,0.044056,-0.052110,0.026002,0.002445,-0.063529,-0.036062,-0.058603,0.077038
183068,0.663570,-0.580630,0.148501,0.057253,-0.036710,0.000174,0.173357,-0.126965,0.056006,-0.015262,-0.249953,-0.068057,-0.171501,0.010628,0.061573,-0.080714,-0.082887,0.006699,-0.091624,0.032873,0.072354,0.033410,0.045845,-0.024214,-0.082310,-0.013089,0.026983,-0.022289,0.036957,0.000029,0.036040,0.053721,-0.012822,-0.013610,0.033954,0.046664,-0.017004,-0.015192,0.008758,-0.001443,-0.015368,-0.035635,0.052592,0.078674,0.030055,-0.015463,0.043441,-0.017615,-0.032483,-0.019155,-0.020012,-0.008983,-0.050586,0.021321,0.018497,0.082568,0.015520,-0.026224,-0.001431,0.057947,0.001347,-0.050355,-0.078762,0.002389,0.050469,-0.049463,0.036650,-0.007728,-0.036675,0.000430,-0.060819,0.029158,0.069703,0.013437,-0.021007,-0.034202,0.085163,-0.024758,-0.081305,-0.008410,0.029201,0.018218,-0.111842,-0.019104,-0.019719,0.068597,0.000565,0.056609,0.009593,-0.007164,0.014913,-0.006359,0.007167,-0.033844,0.096978,-0.038695,0.028121,0.044775,0.038816,0.000530
183069,0.680662,-0.605683,0.167900,-0.019639,-0.034689,-0.004619,0.187667,-0.151763,0.104149,-0.019943,-0.158143,-0.022511,-0.132447,0.004930,0.056535,-0.064720,-0.086859,0.010487,-0.026774,0.022466,0.063052,0.026204,0.045837,0.005248,-0.074017,-0.012713,0.042793,-0.068897,0.034608,-0.036801,0.064095,0.004729,-0.017313,-0.045031,0.019206,0.046469,0.023254,-0.021212,0.051016,0.013573,0.006448,0.048163,0.206148,-0.026739,0.167889,-0.115463,-0.011744,-0.027240,0.027768,-0.044605,0.119967,0.032652,0.045772,-0.043024,0.009572,-0.083468,-0.086269,0.039791,0.017357,-0.010081,0.081401,0.029835,0.106293,-0.011296,-0.083627,-0.016645,0.008425,-0.073941,0.034476,0.010844,-0.029009,0.010603,0.046959,0.059230,0.001938,-0.024817,0.084405,0.090076,0.012871,0.006282,-0.000896,0.002320,0.045428,-0.016945,0.057710,0.034616,0.023691,0.026185,0.040644,-0.005422,-0.008773,-0.027458,-0.012143,0.026107,0.071456,0.026830,-0.054879,0.031072,0.110656,-0.022735


We built LSI topic vectors for 200k tweets in a few minutes!  
Lets look at the TFIDF vectors for the top 6 tweets

In [67]:
tfidf6 = pd.DataFrame((dict([(vocab[i], freq) for i, freq in tfidf[bows[j]]]) for j in range(6)))
tfidf6 = tfidf6.fillna('')
tfidf6

Unnamed: 0,And,Architect,Boa,Django,Engineer,Full,Jobs,Manhattan,Monty,NY,Php,Platform,Python,Senior,Software,Solr,Stack,The,Watching,With,YouTube,doing,enjoy,in,inevitability,jobs,jobsearch,k,learning,never,peaceful,python,rain,silly,stop,via,vs,walk,what,you
0,,,,,,,,,,,,,,,,,,,,,,0.38,0.53,,,,,,0.29,0.4,,0.082,,,0.42,,,,0.32,0.2
1,,,0.62,,,,,,,,,,0.035,,,,,,0.67,,,,,,,,,,,,,,,,,,0.4,,,
2,,,,,,,,,0.6,,,,0.028,,,,,0.21,,,0.28,,,,,,,,,,,,,0.51,,0.22,,0.46,,
3,0.35,,,0.31,0.24,0.34,0.23,,,,0.42,,0.033,0.3,0.28,,0.34,,,,,,,,,0.21,0.23,,,,,,,,,,,,,
4,,0.25,,0.17,0.13,,0.12,0.66,,0.39,,0.24,0.018,,,0.35,,,,0.17,,,,0.064,,0.12,0.13,0.2,,,,,,,,,,,,
5,,,,,,,,,,,,,0.029,,,,,,,,,,,,0.59,,,,,,0.54,,0.6,,,,,,,


Notice the small weights on the word "Python"?
Why do you think that is?
(Think back to the definition of TF and DF and TFIDF

Now lets see how far apart they are based only on word frequency (TFIDF)
We'll *"project"* the first tweet onto the second with a dot product  
to see how much of a "shadow" they make on each other  

In [80]:
tfidf6 = pd.DataFrame((dict([(vocab[i], freq) for i, freq in tfidf[bows[j]]]) for j in range(6))).fillna(0).T

In [112]:
np.dot(tfidf6[0], tfidf6[1])

0.0

In [113]:
np.dot(tfidf6[1], tfidf6[2])

0.00097238386155234309

That looks about right.  
The first 2 share no words.  
The second 2 share only "Python".  
But lets do the cosine similarity correctly by normalizing for length.  

In [115]:
np.dot(tfidf6[1], tfidf6[2]) / np.linalg.norm(tfidf6[1]) / np.linalg.norm(tfidf6[2])

0.0010

Hmmm, nothing changed  
Can you guess why?  

In [116]:
[round(np.dot(tfidf6[i], tfidf6[i+1]), 4) for i in range(5)]

[0.0000, 0.0010, 0.0009, 0.1673, 0.0005]

In [None]:
Now lets look at the topic vectors.  


In [125]:
df.iloc[:6]

topic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
0,0.28,0.65,-0.16,-0.33,0.02,-0.0145,0.48,-0.15,0.24,0.06,-0.12,-0.09,-0.14,0.122,0.19,0.00311,0.07,-0.0262,0.25,0.69,0.422,0.1,0.35,-0.06,0.1,-0.0333,-0.02,0.01,-0.08,-0.129,-0.14,0.00017,0.09,0.02,-0.109,0.13,0.0098,0.00433,0.114,0.024,0.00873,-0.06,-0.04,0.06,0.06,-0.03,0.0372,0.000747,-0.02,-0.06,-0.1,-0.0184,-0.06,-0.00953,-0.05,0.0652,0.0912,-0.0646,-0.065,-0.0185,0.0949,-0.0086,0.0195,0.0624,0.0134,0.041,-0.00874,0.0443,-0.0296,-0.0497,-0.0303,-0.0685,-0.03,-0.00653,0.0197,-0.065,0.03,0.00762,-0.08,-0.00206,-0.0323,-0.05,0.0741,0.0668,-0.0145,-0.13,0.0338,-0.14,-0.052,0.0653,0.0435,0.106,0.0476,-0.162,0.2,0.05,-0.0402,-0.0839,0.146,0.0102
1,0.66,-0.58,0.15,0.06,-0.04,0.000174,0.17,-0.13,0.06,-0.02,-0.25,-0.07,-0.17,0.0106,0.06,-0.0807,-0.08,0.0067,-0.09,0.03,0.0724,0.03,0.05,-0.02,-0.08,-0.0131,0.03,-0.02,0.04,2.87e-05,0.04,0.0537,-0.01,-0.01,0.034,0.05,-0.017,-0.0152,0.00876,-0.00144,-0.0154,-0.04,0.05,0.08,0.03,-0.02,0.0434,-0.0176,-0.03,-0.02,-0.02,-0.00898,-0.05,0.0213,0.02,0.0826,0.0155,-0.0262,-0.00143,0.0579,0.00135,-0.0504,-0.0788,0.00239,0.0505,-0.0495,0.0367,-0.00773,-0.0367,0.00043,-0.0608,0.0292,0.07,0.0134,-0.021,-0.0342,0.0852,-0.0248,-0.08,-0.00841,0.0292,0.02,-0.112,-0.0191,-0.0197,0.07,0.000565,0.06,0.00959,-0.00716,0.0149,-0.00636,0.00717,-0.0338,0.1,-0.04,0.0281,0.0448,0.0388,0.00053
2,0.73,-0.59,0.15,0.08,-0.06,-0.0107,0.2,-0.16,0.06,-0.02,-0.34,-0.07,-0.24,0.00564,0.11,-0.0367,-0.09,-0.0677,-0.17,-0.05,0.0662,0.05,0.04,-0.24,-0.09,-0.0267,-0.03,-0.01,-0.18,-0.383,-0.01,-0.229,0.1,0.34,0.171,-0.46,0.0517,-0.00479,0.152,-0.0218,-0.11,-0.03,0.02,0.37,-0.14,0.19,0.205,-0.582,0.58,0.34,-0.03,0.108,-0.12,-0.0586,0.12,-0.0797,0.0355,-0.0848,-0.2,0.0053,-0.197,-0.0481,0.26,0.043,0.00547,-0.17,-0.0566,-0.00238,0.113,-0.12,-0.0716,0.0238,0.16,-0.0281,-0.0151,-0.0106,0.13,-0.00274,0.06,0.02,-0.0169,0.01,-0.0821,0.00491,0.0198,0.03,-0.0438,0.06,0.0537,-0.0516,-0.0217,-0.0444,-0.00687,0.0223,-0.01,-0.05,0.0984,0.0553,0.00236,-0.0311
3,0.88,-1.04,0.49,-1.03,-0.03,-0.0888,0.36,-0.51,0.81,-0.04,1.07,0.71,0.22,-0.101,-0.11,0.182,0.09,-0.028,-0.28,0.05,0.00365,-0.09,0.12,0.19,0.13,0.0554,0.21,-0.56,0.03,-0.131,0.03,-0.0944,0.09,-0.17,0.00643,-0.09,-0.0147,-0.00197,0.0697,0.0963,0.0331,0.1,0.21,0.06,0.24,-0.18,0.00301,-0.0187,0.21,0.1,0.19,-0.398,0.08,-0.161,-0.03,-0.141,-0.185,-0.0522,-0.00744,-0.239,-0.192,-0.0978,-0.0408,-0.338,0.642,0.101,0.232,0.654,-0.312,0.0234,0.416,0.0278,0.1,-0.134,-0.00544,0.238,-0.19,0.0362,-0.23,-0.0757,-0.115,0.02,-0.0838,0.126,-0.172,0.12,-0.153,0.06,0.0482,-0.0188,0.0483,0.0319,-0.0305,0.115,-0.16,0.02,0.0343,-0.0469,-0.0353,-0.00431
4,1.09,-0.96,0.55,-1.65,0.09,-0.0928,-0.03,-0.08,0.32,-0.02,0.78,0.53,0.24,-0.181,-0.11,0.258,0.18,-0.165,-0.3,0.07,0.039,-0.03,-0.01,0.06,-0.07,0.0289,0.02,0.18,-0.02,0.0417,-0.14,0.141,0.7,-0.59,0.129,-0.47,-0.133,0.11,-0.0421,0.0507,0.0391,-0.05,-0.26,0.32,-0.14,0.01,0.0716,0.0508,-0.11,0.02,-0.13,-0.392,-0.09,-0.0611,-0.08,-0.00397,-0.00701,-0.0164,0.0866,0.0142,-0.228,-0.117,0.0589,-0.194,0.192,0.106,0.189,0.436,-0.277,0.168,0.437,0.152,0.26,-0.413,-0.0136,-0.0299,0.00508,0.112,0.2,0.139,0.152,-0.13,0.5,-0.201,0.321,-0.02,0.314,0.07,0.159,0.131,-0.098,-0.218,-0.0798,0.0831,0.24,-0.1,0.0434,0.0395,0.193,-0.0391
5,0.66,-0.57,0.15,0.05,-0.03,-0.000851,0.17,-0.12,0.06,-0.02,-0.24,-0.07,-0.16,0.00986,0.06,-0.074,-0.08,0.00962,-0.08,0.03,0.0694,0.03,0.04,-0.01,-0.08,-0.00879,0.03,-0.02,0.03,-0.0113,0.04,0.0525,-0.01,-0.02,0.0391,0.04,-0.0284,-0.00263,-0.00317,0.00173,-0.00328,-0.02,0.03,0.05,0.03,-0.04,0.023,0.00261,-0.07,-0.02,-0.01,-0.0105,-0.03,0.0239,-0.02,0.00869,0.0135,-0.00708,0.00732,-0.00521,0.0181,0.00469,0.00564,-0.00516,-0.000606,0.00493,0.0167,-0.00316,-0.00326,-0.017,-0.00187,0.000605,-0.03,0.00569,0.00501,-0.0068,0.0111,0.00574,-0.01,0.0145,0.00109,-0.01,-0.00212,-0.0136,0.00886,-0.01,0.00595,-0.02,0.00265,0.00724,-0.00299,-0.00286,0.00329,-0.00338,0.02,0.01,-0.00195,-0.00926,0.00224,-0.00817


In [122]:
print([round(np.dot(df.T[i], df.T[i+1]), 4) for i in range(5)])

[0.0105, 1.1037, 0.9981, 6.452, 1.0153]


Better normalize these...

In [123]:
print([round(np.dot(df.T[i], df.T[i+1]) / np.linalg.norm(df.T[i]) / np.linalg.norm(df.T[i+1]), 4) for i in range(5)])
# for comparison the TFIDF scores right below
print([round(np.dot(tfidf6[i], tfidf6[i+1]), 4) for i in range(5)])

[0.0066, 0.5742, 0.1849, 0.6925, 0.325]
[0.0, 0.001, 0.0009, 0.1673, 0.0005]


So the really chummy neighbors are 1 & 2 and 3 & 4  
Surprisingly 2 & 3 didn't hit it off, and no pairing got a zero!   
And the last 2 seem to share a "latent" similarity that TFIDF missed entirely!!!
And LSI picked up on the python<->Python similarity (tweets 0 and 1)

In [133]:
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    text = pd.DataFrame.from_csv(f, encoding='utf8')

In [136]:
for s in text.txt.iloc[:6]:
    print(s)

 python never stop learning what you enjoy doing 
Watching Boa vs Python 
Monty Python The silly walk via YouTube
Senior Software Engineer Full Stack Python Django And Php Jobs jobs jobsearch 
Architect Django Solr Platform Engineer With Python k Jobs in Manhattan NY Manhattan NY jobs jobsearch 
peaceful rain Python inevitability


What about a new tweet you are considering?  
Notice how I changed the token spelling (BOW),  
but not the *"semantics"* of the tweet.  

In [169]:
tweet = 'I want to help build django with a job in Chicago'
tweet_bow = vocab.doc2bow(tweet.split())
tweet_tfidf = tfidf[tweet_bow]
tweet_topics = pd.Series(dict(lsi[tweet_tfidf]))
# Now that the math is done let's convert to a friendlier format with words as the keys/index
tweet_tfidf = pd.Series(dict([(vocab[i], x) for (i, x) in tweet_tfidf])) 
print('\nLSI Topic Vector')
tweet_topics


LSI Topic Vector


0     1.41e-01
1     1.18e-01
2     1.45e-01
        ...   
97   -8.82e-03
98    2.28e-02
99   -5.27e-03
dtype: float64

Compare the topic vector above to the TFIDF vector below.  
What's better about TFIDF compared to topic vectors?  
What can we do about it?  

In [170]:
print('TFIDF Frequency Vector')
print(tweet_tfidf)

TFIDF Frequency Vector
Chicago    0.45
I          0.18
a          0.13
           ... 
to         0.12
want       0.37
with       0.16
dtype: float64


Which one is it closest too?  
Can you guess?  
Does LSI understand the words as well as you do?  

In [167]:
print('LSI Topic Similarity')
print([round(np.dot(df.T[i], tweet_topics) / np.linalg.norm(df.T[i]) / np.linalg.norm(tweet_topics), 4) for i in range(6)])

LSI Topic Similarity
[0.0716, -0.014, 0.0025, 0.0716, 0.1484, -0.003]


In [184]:
tfidf7 = tfidf6.copy()
tfidf7[6] = tweet_tfidf
tfidf7 = tfidf7.fillna(0)
tfidf7

Unnamed: 0,0,1,2,3,4,5,6
And,0.00,0.00,0.00,0.35,0.00,0.0,0.0
Architect,0.00,0.00,0.00,0.00,0.25,0.0,0.0
Boa,0.00,0.62,0.00,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...
walk,0.00,0.00,0.46,0.00,0.00,0.0,0.0
what,0.32,0.00,0.00,0.00,0.00,0.0,0.0
you,0.20,0.00,0.00,0.00,0.00,0.0,0.0


In [186]:
print([round(np.dot(tfidf7[i], tfidf7[6]), 4) for i in range(6)])

[0.0, 0.0, 0.0, 0.0, 0.0076, 0.0]


In [187]:
tweet

'I want to help build django with a job in Chicago'

Can you find the one word I accidentally share with the other tweets?  
*Hint: use the TFIDF matrix (Dataframe)*  
Play around with the tweet text to make its topic vector more *"orthogonal"*  
Or make it closer in cosine distance.  