### In this notebook, the data used is the tokenized version tweets from 2015. 

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
from time import time

In [2]:
tweets1 = pd.read_csv("output_2015_06.csv",index_col=False)
tweets1.drop('Unnamed: 0',axis = 1, inplace = True)

In [3]:
token_15_06 = pd.read_csv("small_2015-06.csv", header=None, names=['Tweet ID', 'Tokenized_tweets'])

In [8]:
token_15_06.head()

Unnamed: 0,Tweet ID,Tokenized_tweets
0,611053325644005376,she say unenthused
1,611162865551192064,for ailment sun remedy none if try find
2,611058983927836673,yeah rest conditioning summer
3,611165766952591360,remind blessed i walk office today
4,611159320655347712,unarmed white teen shot black officer


In [4]:
len(token_15_06)

6615177

In [5]:
token_15_11 = pd.read_csv("small_2015-11.csv", header=None, names=['Tweet ID', 'Tokenized_tweets'])

In [12]:
token_15_11.head()

Unnamed: 0,Tweet ID,Tokenized_tweets
0,667224578763067392,christmas tree s like s mansion
1,667221463523581952,kenneth whalum special guests zoocru
2,667224289947590656,us 129 tail of the dragon
3,667233408611975168,i wish i
4,667228363631775744,i be amp sushi charlotte nc


In [6]:
len(token_15_11) + len(token_15_06)

9255633

In [7]:
# THis time we put all 2015 data to our NMF model. There are 9,255,633 tweets in total.
token_15 =  pd.concat([token_15_06, token_15_11])

In [8]:
len(token_15)

9255633

In [9]:
token_15.head()

Unnamed: 0,Tweet ID,Tokenized_tweets
0,611053325644005376,she say unenthused
1,611162865551192064,for ailment sun remedy none if try find
2,611058983927836673,yeah rest conditioning summer
3,611165766952591360,remind blessed i walk office today
4,611159320655347712,unarmed white teen shot black officer


In [10]:
# drop all null value 
token_15_clean = token_15[token_15['Tokenized_tweets'].notnull()]

In [11]:
len(token_15_clean)

9236748

In [12]:
length = [len(token_15_clean.Tokenized_tweets.iloc[i]) for i in range(len(token_15_clean))]

In [13]:
length.sort()
#length

In [14]:
import warnings
warnings.filterwarnings("ignore")
from plotly import tools
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot,iplot

In [15]:
init_notebook_mode(connected=True)

In [20]:
# Get the distribution of tokenized tweets length 
data = [go.Histogram(x=length)]
layout=go.Layout(title="Distribution of Tokenized Tweets Length", xaxis={'title':'Length of Tokenized Sentence'}, yaxis={'title':'Count'})
figure=go.Figure(data=data,layout=layout)
iplot(figure, filename='basic histogram')

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


### Feed data to NMF model

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

def print_topics(model, feature_names, top_k):
    """
    Print the most important words of each topics
    """
    for ind, topic in enumerate(model.components_):
        print("Topic #" + str(ind+1))
        # print out top k possible features(words)
        print([feature_names[i] for i in topic.argsort()[:-top_k-1:-1]])
    return

In [7]:
top_k = 20 # number of features(words) we want to print
n_topics = 20 # number of topics
random_seed = 1
l1_ratio = 0.5 # regularization
num_features = 10000 # number of features we want to include in model

In [15]:
vectorizer = TfidfVectorizer(max_features=num_features)
nmf_model = NMF(n_components=n_topics, random_state=random_seed, l1_ratio=l1_ratio)

In [17]:
text_tokenized = token_15_clean['Tokenized_tweets']

In [18]:
# There are around 9 million tweets and this time put the first  5 million to feed model
len(text_tokenized)

9236748

In [8]:
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=num_features,
                                    stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(text_tokenized[:5000000])
print("done in %0.3fs." % (time() - t0))




Extracting tf-idf features for NMF...


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


done in 68.836s.


In [23]:
feature_names = tfidf_vectorizer.get_feature_names()
feature_names

['00',
 '000',
 '0000',
 '000in',
 '001',
 '002',
 '003',
 '004',
 '005',
 '006',
 '007',
 '008',
 '009',
 '00kts',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '0700',
 '070415',
 '08',
 '09',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '100th',
 '101',
 '1010',
 '1015',
 '102',
 '102nd',
 '103',
 '1030',
 '104',
 '1045',
 '105',
 '106',
 '107',
 '108',
 '109',
 '1098',
 '10k',
 '10mi',
 '10pm',
 '10th',
 '10x',
 '11',
 '110',
 '1100',
 '111',
 '1111',
 '1115',
 '112',
 '113',
 '1130',
 '114',
 '1145',
 '115',
 '116',
 '117',
 '118',
 '119',
 '11pm',
 '11th',
 '12',
 '120',
 '1200',
 '12000',
 '121',
 '122',
 '122nd',
 '123',
 '1230',
 '125',
 '128',
 '129',
 '12th',
 '13',
 '130',
 '1300',
 '131',
 '134',
 '135',
 '136',
 '138',
 '13th',
 '14',
 '140',
 '1400',
 '145',
 '14th',
 '14u',
 '15',
 '150',
 '1500',
 '151',
 '152',
 '155',
 '159',
 '15th',
 '16',
 '160',
 '1600',
 '162nd',
 '165',
 '16th',
 '17',
 '170',
 '1700',
 '175',
 '17802',
 '17th',
 '18',
 '180',
 '1800',
 '1

In [None]:
# Fit the NMF model
print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..." % (len(text_tokenized[:5000000]), num_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
exit()
print("done in %0.3fs." % (time() - t0))
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_topics(nmf, tfidf_feature_names, top_k) 

Fitting the NMF model with tf-idf features,n_samples=5000000 and n_features=10000...
done in 1125.197s.

Topics in NMF model:
Topic #1
['good', 'friend', 'morning', 'life', 'feel', 'look', 'thing', 'luck', 'world', 'way', 'people', 'today', 'hope', 'bad', 'year', 'pretty', 'ask', 'think', 'god', 'weekend']
Topic #2
['love', 'girl', 'life', 'check', 'easy', 'darkness', 'leave', 'baby', 'guy', 'people', 'friend', 'miss', 'ya', 'man', 'little', 'place', 'fall', 'hate', 'vid', 'beautiful']
Topic #3
['just', 'post', 'photo', 'video', 'park', 'add', 'menu', 'beer', 'lake', 'center', 'tap', 'museum', 'world', 'city', 'island', 'state', 'house', 'bridge', 'point', 'national']
Topic #4
['day', 'father', 'great', 'beautiful', 'today', 'dad', 'school', 'national', 'start', 'long', 'spend', 'summer', 'fathers', 'happy', 'lake', 'hope', 'perfect', 'fun', 'enjoy', 'tomorrow']
Topic #5
['new', 'york', 'ny', 'city', 'brooklyn', 'event', 'check', 'square', 'orleans', 'park', 'jersey', 'nyc', 'times', '