## Load data

In [100]:
import pandas as pd
header = ['Tweet ID', 'timestamp', 'week', 'user_id', 'state', 'original text', 'with_emoji_text', 'without_emoji_text','in_reply_to_status_id_str',  'emoji',
          'hashtag', 'media(type, url)', 'user_mentions', 'language']

data_2015_06 = pd.read_csv('output_2015-06.csv', header=None, names=header)

In [106]:
data_2015_06.head()

Unnamed: 0,Tweet ID,timestamp,week,user_id,state,original text,with_emoji_text,without_emoji_text,in_reply_to_status_id_str,emoji,hashtag,"media(type, url)",user_mentions,language
0,611053325644005376,1434536000.0,Wednesday,35298429,16,She says so unenthused,She says so unenthused,She says so unenthused,,,,,,en
1,611162865551192064,1434562000.0,Wednesday,2537204392,16,"""For every ailment under the sun there is a re...","""For every ailment under the sun there is a re...","""For every ailment under the sun there is a re...",,,,,,en
2,611058983927836673,1434537000.0,Wednesday,952640504,16,@jameshull88 yeah but still rest of conditioni...,yeah but still rest of conditioning this summer 😕,yeah but still rest of conditioning this summer,6.110588e+17,😕,,,444172192.0,en
3,611165766952591360,1434563000.0,Wednesday,2216437134,16,Reminded of how blessed I am when walking into...,Reminded of how blessed I am when walking into...,Reminded of how blessed I am when walking into...,,,"['studio7x47', 'HeritageBuilding']",,,en
4,611159320655347712,1434561000.0,Wednesday,14752306,16,"@thereval; \n""Unarmed White Teen Shot by Black...","; ""Unarmed White Teen Shot by Black Officer""","; ""Unarmed White Teen Shot by Black Officer""",,,"['WheresTheOutrage', 'HandsUpDontShoot']","('photo', 'https://pbs.twimg.com/media/CHtGIEU...",42389136.0,en


## Build NMF model 

Just a test case of small dataset, will run on the whole dataset several days after

### import packages & define a function

In [107]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

def print_topics(model, feature_names, top_k):
    """
    Print the most important words of each topics
    """
    for ind, topic in enumerate(model.components_):
        print("Topic #" + str(ind+1))
        # print out top k possible features(words)
        print([feature_names[i] for i in topic.argsort()[:-top_k-1:-1]])
    return 

### initialize parameters

In [108]:
top_k = 20 # number of features(words) we want to print
n_topics = 20 # number of topics
random_seed = 1
l1_ratio = 0.5 # regularization
num_features = 100 # number of features we want to include in model

### initialize model

In [109]:
vectorizer = TfidfVectorizer(max_features=num_features)
nmf_model = NMF(n_components=n_topics, random_state=random_seed, l1_ratio=l1_ratio)

### training model

In [110]:
train = list(data_2015_06["without_emoji_text"][:50])
test = list(data_2015_06["without_emoji_text"][50:70])
X = vectorizer.fit_transform(train)
feature_names = vectorizer.get_feature_names()
nmf = nmf_model.fit(X)

print_topics(nmf, feature_names, top_k)

Topic #1
['the', 'weight', 'in', 'savage', 'and', 'two', 'on', 'for', 'saw', 'work', 'love', 'they', 'road', 'this', 'to', 'owyhee', 'rocks', 'outdated', 'shape', 'probably']
Topic #2
['id', 'rooster', 'registered', 'work', 'want', 'in', 'off', 'boise', 'to', 'weight', 'play', 'realizing', 'passing', 'that', 'or', 'just', 'on', 'outdated', 'owyhee', 'past']
Topic #3
['is', 'there', 'rapist', 'remedy', 'or', 'for', 'be', 'owyhee', 'outdated', 'rocks', 'the', 'to', 'this', 'in', 'some', 'passing', 'maddie', 'me', 'porn', 'much']
Topic #4
['black', 'white', 'for', 'shape', 'probably', 'into', 'just', 'get', 'two', 'back', 'to', 'so', 'you', 'be', 'remedy', 'in', 'or', 'my', 'night', 'نقول']
Topic #5
['today', 'of', 'there', 'was', 'some', 'so', 'into', 'my', 'how', 'three', 'pressure', 'remedy', 'night', 'all', 'for', 'poodles', 'relieve', 'in', 'love', 'the']
Topic #6
['maddie', 'in', 'you', 'want', 'rapist', 'me', 'to', 'there', 'night', 'savage', 'just', 'get', 'and', 'even', 'work', '

### test model  
output will be probabilities of each topic for every tweet

In [111]:
tfidf_test = vectorizer.fit_transform(test)
result = nmf.transform(tfidf_test)
print(result.shape)
result[0] # probabilites for first test tweet

(20, 20)


array([0.00000000e+00, 1.52871611e-02, 0.00000000e+00, 3.45553134e-39,
       1.03042963e-01, 0.00000000e+00, 4.31155215e-02, 0.00000000e+00,
       7.14722258e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 2.60419488e-01, 1.17725162e-01, 4.12111400e-02,
       0.00000000e+00, 2.37999256e-02, 0.00000000e+00, 0.00000000e+00])