# Parsing and cleaning tweets
This notebook is a slight modification of @wwymak's word2vec notebook, with different tokenization, and a way to iterate over tweets linked to their named user 

### @wwymak's iterator and helper functions

In [1]:
import csv
import importlib
import itertools
import json
import os
import re
import string

import gensim
import numpy as np
import pymoji
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from six import iteritems

# Module-level tokenizer (default TweetTokenizer settings) shared by tokenize_tweets().
tokenizer = TweetTokenizer()

def keep_retweets(tweets_objs_arr):
    """Select the tweets whose ``retweet`` field is not ``'N'``.

    Args:
        tweets_objs_arr: iterable of tweet dicts providing at least the keys
            ``"text"``, ``"name"`` and ``"retweet"``.

    Returns:
        A ``(texts, names)`` tuple of two equally long lists, where
        ``texts[i]`` and ``names[i]`` come from the same tweet.

    Raises:
        KeyError: if a tweet has no ``"retweet"`` key, or a selected tweet
            has no ``"text"`` or ``"name"`` key.
    """
    texts, names = [], []
    # Single pass: both lists are filled from the same tweet, so they stay
    # aligned even when the input is a one-shot iterator.
    for tweet in tweets_objs_arr:
        if tweet['retweet'] != 'N':
            texts.append(tweet["text"])
            names.append(tweet["name"])
    return texts, names

def convert_emojis(tweets_arr):
    """Pass every tweet text through ``pymoji.replaceEmojiAlt``.

    Args:
        tweets_arr: iterable of tweet texts.

    Returns:
        A new list holding one converted text per input text, in input order.
        (Presumably emoji characters are replaced by a textual alias followed
        by a space; see pymoji for the exact output format.)
    """
    converted = []
    for text in tweets_arr:
        converted.append(pymoji.replaceEmojiAlt(text, trailingSpaces=1))
    return converted

def tokenize_tweets(tweets_arr):
    """Tokenize tweet texts into lists of lower-cased tokens.

    Each text is split with the module-level ``tokenizer``. Tokens that are
    found inside ``string.punctuation`` (a substring test, so single
    punctuation characters are dropped) are removed and the remaining tokens
    are lower-cased.

    Args:
        tweets_arr: iterable of tweet texts.

    Returns:
        A list of token lists. Texts that cannot be tokenized are skipped, so
        the result may be SHORTER than the input; callers that need to keep
        tweets aligned with other per-tweet data must account for this.
    """
    result = []
    for text in tweets_arr:
        try:
            tokens = tokenizer.tokenize(text)
            cleaned = [tok.lower() for tok in tokens if tok not in string.punctuation]
        except Exception:
            # Best effort: skip entries that cannot be tokenized (e.g. non-string
            # values). Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate so a long run can still be interrupted.
            continue
        result.append(cleaned)
    return result

class Tweets(object):
    """Re-iterable stream of ``(user name, tokens)`` pairs read from JSON tweet files.

    Every iteration walks ``dirname`` again and re-reads the files one at a
    time, so only a single file's tweets are held in memory at once.
    """

    def __init__(self, dirname):
        # Root directory, searched recursively for files whose name ends in 'json'.
        self.dirname = dirname

    def __iter__(self):
        """Yield ``(name, tokens)`` for every kept tweet in every JSON file.

        Each file is loaded, filtered with ``keep_retweets``, passed through
        ``convert_emojis`` and tokenized with ``tokenize_tweets``. The path of
        each file is printed as a progress indicator.
        """
        for root, _dirs, filenames in os.walk(self.dirname):
            for filename in filenames:
                if not filename.endswith('json'):
                    continue
                path = os.path.join(root, filename)
                # Log the real path (joined with a separator) of the file being read.
                print(path)
                with open(path, 'r') as f:
                    tweets = json.load(f)
                texts, user_names = keep_retweets(tweets)
                texts = convert_emojis(texts)
                # Tokenize one tweet at a time: tokenize_tweets silently drops
                # texts it cannot handle, so tokenizing the whole batch and
                # zipping it with user_names could pair tokens with the wrong user.
                for name, text in zip(user_names, texts):
                    tokenized = tokenize_tweets([text])
                    if tokenized:
                        yield name, tokenized[0]


# Stream (user name, tokens) pairs from every JSON file found under this directory.
sentences = Tweets('/media/henripal/hd1/data/2017/1/') # a memory-friendly iterator
# Word2vec training from the original notebook, left disabled.
# NOTE(review): before re-enabling, note that `time` and `start_time` are not
# defined in the cells shown here, and that `sentences` yields (name, tokens)
# tuples, whereas Word2Vec presumably expects bare token lists -- confirm.
#model = gensim.models.Word2Vec(sentences, workers=2, window=5, sg = 1, size = 100, max_vocab_size = 2 * 10000000)
#model.save('tweets_word2vec_2017_1_size100_window5')
#print('done')
#print(time.time() - start_time)


### My gensim tinkering
Tasks:
- build the gensim dictionary
- build the bow matrix using this dictionary (sparse matrix so memory friendly)
- save the names and the dictionary for later use

In [2]:
# First pass over the tweets: build the token -> id dictionary.
# Only the token lists are fed in; the user names are ignored here.
# Open question from the author: build the dictionary and the corpus in the same pass?
dictionary = corpora.Dictionary(documents=(tokens for _user, tokens in sentences))

/media/henripal/hd1/data/2017/1/27/12tweets-47.json
/media/henripal/hd1/data/2017/1/27/12tweets-35.json
/media/henripal/hd1/data/2017/1/27/12tweets-48.json
/media/henripal/hd1/data/2017/1/27/7tweets-6.json
/media/henripal/hd1/data/2017/1/27/7tweets-46.json
/media/henripal/hd1/data/2017/1/27/7tweets-48.json
/media/henripal/hd1/data/2017/1/27/14tweets-31.json
/media/henripal/hd1/data/2017/1/27/14tweets-34.json
/media/henripal/hd1/data/2017/1/27/14tweets-50.json
/media/henripal/hd1/data/2017/1/27/14tweets-7.json
/media/henripal/hd1/data/2017/1/27/14tweets-45.json
/media/henripal/hd1/data/2017/1/27/14tweets-19.json
/media/henripal/hd1/data/2017/1/27/1tweets-29.json
/media/henripal/hd1/data/2017/1/27/1tweets-12.json
/media/henripal/hd1/data/2017/1/27/1tweets-30.json
/media/henripal/hd1/data/2017/1/27/1tweets-15.json
/media/henripal/hd1/data/2017/1/27/5tweets-6.json
/media/henripal/hd1/data/2017/1/27/5tweets-23.json
/media/henripal/hd1/data/2017/1/27/5tweets-41.json
/media/henripal/hd1/data/

In [4]:
# Tokens to drop, part 1: English stopwords (from the downloaded nltk corpus),
# translated to dictionary ids via token2id. Stopwords that never occur in the
# tweets have no id and are skipped.
stop = set(stopwords.words('english'))
stop_ids = [dictionary.token2id[stopword] for stopword in stop if stopword in dictionary.token2id]

# Tokens to drop, part 2: rare tokens. dictionary.dfs maps token id -> document
# frequency (number of tweets containing the token), so the cut-off counts
# tweets, not total occurrences.
MIN_DOC_FREQ = 20  # tokens found in fewer tweets than this are dropped
low_freq_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq < MIN_DOC_FREQ]

In [12]:
# Remove the stopword and low-frequency ids from the dictionary, then renumber.
# NOTE: run this cell only once per freshly built dictionary. After the ids are
# reassigned, the stop_ids / low_freq_ids computed earlier no longer refer to
# the same tokens, so a second run would remove unrelated words.
dictionary.filter_tokens(bad_ids=stop_ids + low_freq_ids)
dictionary.compactify()  # close the gaps left in the id sequence by the removed words
print(dictionary)

In [16]:
# Fresh Tweets iterable for the next pass over the data (memory friendly!).
# Tweets.__iter__ restarts the directory walk every time it is iterated, so an
# existing instance could be reused as well.
sentences = Tweets('/media/henripal/hd1/data/2017/1/')

In [17]:
# Slow: a full pass over the tweet files. Each tweet's token list is converted
# to a sparse bag-of-words vector with the filtered dictionary; the user names
# are ignored here. The resulting list holds one entry per tweet, in iteration order.
corpus = list(map(dictionary.doc2bow, (tokens for _user, tokens in sentences)))

/media/henripal/hd1/data/2017/1/27/12tweets-47.json
/media/henripal/hd1/data/2017/1/27/12tweets-35.json
/media/henripal/hd1/data/2017/1/27/12tweets-48.json
/media/henripal/hd1/data/2017/1/27/7tweets-6.json
/media/henripal/hd1/data/2017/1/27/7tweets-46.json
/media/henripal/hd1/data/2017/1/27/7tweets-48.json
/media/henripal/hd1/data/2017/1/27/14tweets-31.json
/media/henripal/hd1/data/2017/1/27/14tweets-34.json
/media/henripal/hd1/data/2017/1/27/14tweets-50.json
/media/henripal/hd1/data/2017/1/27/14tweets-7.json
/media/henripal/hd1/data/2017/1/27/14tweets-45.json
/media/henripal/hd1/data/2017/1/27/14tweets-19.json
/media/henripal/hd1/data/2017/1/27/1tweets-29.json
/media/henripal/hd1/data/2017/1/27/1tweets-12.json
/media/henripal/hd1/data/2017/1/27/1tweets-30.json
/media/henripal/hd1/data/2017/1/27/1tweets-15.json
/media/henripal/hd1/data/2017/1/27/5tweets-6.json
/media/henripal/hd1/data/2017/1/27/5tweets-23.json
/media/henripal/hd1/data/2017/1/27/5tweets-41.json
/media/henripal/hd1/data/

In [19]:
# Slow as well: another full pass over the tweet files, this time keeping only
# the user name of each tweet, in iteration order.
# NOTE: names[i] lines up with corpus[i] only as long as both passes visit the
# files in the same order and skip the same tweets; building both lists in one
# loop (as the author already suggested) would guarantee it.
names = [user for user, _tokens in sentences]

/media/henripal/hd1/data/2017/1/27/12tweets-47.json
/media/henripal/hd1/data/2017/1/27/12tweets-35.json
/media/henripal/hd1/data/2017/1/27/12tweets-48.json
/media/henripal/hd1/data/2017/1/27/7tweets-6.json
/media/henripal/hd1/data/2017/1/27/7tweets-46.json
/media/henripal/hd1/data/2017/1/27/7tweets-48.json
/media/henripal/hd1/data/2017/1/27/14tweets-31.json
/media/henripal/hd1/data/2017/1/27/14tweets-34.json
/media/henripal/hd1/data/2017/1/27/14tweets-50.json
/media/henripal/hd1/data/2017/1/27/14tweets-7.json
/media/henripal/hd1/data/2017/1/27/14tweets-45.json
/media/henripal/hd1/data/2017/1/27/14tweets-19.json
/media/henripal/hd1/data/2017/1/27/1tweets-29.json
/media/henripal/hd1/data/2017/1/27/1tweets-12.json
/media/henripal/hd1/data/2017/1/27/1tweets-30.json
/media/henripal/hd1/data/2017/1/27/1tweets-15.json
/media/henripal/hd1/data/2017/1/27/5tweets-6.json
/media/henripal/hd1/data/2017/1/27/5tweets-23.json
/media/henripal/hd1/data/2017/1/27/5tweets-41.json
/media/henripal/hd1/data/

In [27]:
# and as always we don't want to redo this ever so we save the files

# saving the names as csv: a single row with one field per tweet, in the order
# the names were collected. The `with` block closes the file, which guarantees
# the row is flushed to disk.
with open("/media/henripal/hd1/data/corpus_names.txt", 'w') as names_file:
    wrt = csv.writer(names_file, delimiter=',', lineterminator='\n')
    wrt.writerow(names)

# now we save the sparse bow corpus matrix using matrix market format
# (note: this path and the next are relative to the current working directory)
corpora.MmCorpus.serialize('corp.mm', corpus)

# and we save the dictionary as a text file
dictionary.save_as_text('dict_text')

66952645