In [1]:
import json
import os
import gensim
from multiprocessing import Process, Queue
from gensim.models import CoherenceModel
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
# nltk.download('wordnet')



def compute_coherence_values(corpus, dictionary, k, a='symmetric', b=None, coherence='u_mass', texts=None):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b,
                                           workers=10)
    
    if coherence == 'u_mass':
        coherence_model_lda = CoherenceModel(model=lda_model, corpus=corpus, coherence=coherence, processes=4)
    else:
        coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, coherence=coherence, processes=4)

    # print('coherence: ', coherence_model_lda.get_coherence())
    
    return coherence_model_lda.get_coherence()




def lemmatize_stemming(text):
    return SnowballStemmer('english').stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def get_preprocessed(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result


def wrapped_load_tweets(pqueue, filenames, preprocess=False):
    all_tweets = []
    for filename in filenames:
        with open(filename, 'r') as f:
            # add commas between tweets to correct json syntax
            data = json.loads('['+f.read().replace('}{','},{')+']')
        # remove retweets
        tweets = [tweet for tweet in data if 'retweeted_status' not in tweet]
        # keep english language tweets only
        tweets = [tweet for tweet in tweets if tweet['lang'] == 'en']

        # take tweet text  or full_text if the tweet has that attribute
        if preprocess:
            ttexts = [ get_preprocessed(tweet['extended_tweet']['full_text']) if 'full_text' in tweet else get_preprocessed(tweet['text']) for tweet in tweets]
        else:
            ttexts = [ tweet['extended_tweet']['full_text'] if 'full_text' in tweet else tweet['text'] for tweet in tweets]


        all_tweets = all_tweets + ttexts


    if pqueue:
        pqueue.put(all_tweets[:50])
    return all_tweets

def partition(seq, num):
    # if num == 1:
    #     return [seq]

    avg = len(seq) / float(num)
    out = []
    last = 0.0

    while last < len(seq):
        out.append(seq[int(last):int(last + avg)])
        last += avg

    return out


def load_tweets(filenames, preprocess=False, workers=1):
    assert type(filenames) == list
    partitioned_filenames = partition(filenames, workers)
    print(partitioned_filenames)
    pqueue = Queue()
    # make wprocesses
    procs = [ Process(target=wrapped_load_tweets, args=(pqueue, filenames, preprocess)) for filenames in partitioned_filenames ]
    # start processes
    for proc in procs:
        proc.start()
    # get results from all procs
    dumped = []
    print('returning')
    while not pqueue.empty():
        dumped.append(pqueue.get())

    # wait until all processes have finished
    print('joining')
    for proc in procs:
        proc.join()
    pqueue.close()
    pqueue.join_thread()

    # return flattened list of results
    return [tweet for tweets in dumped for tweet in tweets]



In [2]:
import os
filenames = ['data/unzipped/' + name for name in os.listdir('data/unzipped')]

data = load_tweets([filenames[0]])
# filenames[0]

[['data/unzipped/twitter-coronavirus-A-2020-05-01-04-48-29-6c51e071-3229-4b36-9f2d-5c9dc05b78f2']]
joining
returning


In [3]:
data

['Due to the coronavirus outbreak, April 2020 was the first April without a school shooting in the U.S. since 2001.',
 'Nursing Home On Queens-LI Border Reports 53 Coronavirus Deaths',
 '"Coronavirus Live Updates: In China, Loosened Restrictions Lead to Travel Rush" by Unknown Author via NYT New York… https://t.co/f0VIBgnRV3',
 "China's Banks are leveraged, low capitalized and are facing mounting credit losses.\n@chigrl @SCMPNews… https://t.co/cpFofBSDnM",
 "They won't have to, idiot. Take a guess how it ends. Guess. Fucking idiot. https://t.co/xsoo0GksSp",
 'Yes this is the correct news \n Pm of Russia suffering from corona',
 'Hey @realdonaldtrump  @potus the novel coronavirus did NOT originate in a laboratory in Wuhan!  You do not understa… https://t.co/wnXDwlArh4',
 'Cleaner Air Because of Coronavirus Lockdowns Is Saving Thousands of Lives https://t.co/Pz8mSHr7OR via @vice',
 '@HuXijin_GT The US, and our citizens, stand with Australia against Communist China. Hu, you arrogant prick