In [14]:
import numpy as np
import pandas as pd
from searchtweets import load_credentials, gen_request_parameters, collect_results

In [15]:
def get_tweets(term):
    credentials = load_credentials(env_overwrite=True)
    query = gen_request_parameters(f"{term} -is:retweet -is:reply lang:en", results_per_call=100, granularity=None)
    tweets = collect_results(query, max_tweets = 1000, result_stream_args=credentials)
    return tweets

def convert_to_df(messy_tweets):
    '''Turns messy json repsonse from get_tweets into a clean df.
    
    Args:
        messy_tweets (list of dicts extracted from json response): 'data' part of .json() parse
        
    Returns:
        df: df that has the extracted tweets
    '''
    clean = []
    
    for i in range(len(messy_tweets)):
        for j in range(len(messy_tweets[i]['data'])):
            clean.append(messy_tweets[i]['data'][j]['text'])
    
    df = pd.DataFrame(clean, columns = ['tweet'])
    df = df.drop_duplicates(ignore_index = True)
    return df

In [16]:
messy_tweets = get_tweets("Economy")
tweets = convert_to_df(messy_tweets)
tweets

cannot read file ~/.twitter_keys.yaml
Error parsing YAML file; searching for valid environment variables


Unnamed: 0,tweet
0,"Excited to get our first episode of ""Trips aro..."
1,⚡️Shmyhal: Ukraine's GDP may drop by 30-50% du...
2,#latestnews Russian economy crumbling as offic...
3,$VRT - Vertiv Holdings' (VRT) CEO Rob Johnson ...
4,"https://t.co/z7hHiOu6jQ In 2019, Latinos adde..."
...,...
983,Countries in #SouthAsia should steer away from...
984,#Airfare Deal: [American] New York - Dallas (a...
985,Naturally we are aware of the strength of our ...
986,But it's good that someone of international si...


In [17]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim

In [18]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

docs_clean = [clean(doc).split() for doc in tweets["tweet"]] # tokenized, put everything lowercase, removed stop words, removed punctuation, lemmatized everything.

In [19]:
dictionary = gensim.corpora.Dictionary(docs_clean)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs_clean]

In [20]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                         num_topics=10, 
                                         id2word=dictionary, 
                                         passes=4, 
                                         workers=4,
                                         random_state=21)

In [21]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.037*"economy" + 0.004*"could" + 0.004*"never" + 0.004*"growth" + 0.004*"good" + 0.003*"get" + 0.003*"billion" + 0.003*"go" + 0.003*"money" + 0.003*"russia"
Topic: 1 
Words: 0.027*"economy" + 0.006*"business" + 0.006*"amp" + 0.005*"gas" + 0.004*"tax" + 0.004*"california" + 0.004*"make" + 0.004*"people" + 0.004*"nation" + 0.004*"already"
Topic: 2 
Words: 0.040*"economy" + 0.008*"gas" + 0.005*"u" + 0.005*"business" + 0.004*"amp" + 0.004*"tax" + 0.004*"would" + 0.004*"russia" + 0.004*"russian" + 0.004*"finance"
Topic: 3 
Words: 0.017*"economy" + 0.010*"amp" + 0.005*"inflation" + 0.004*"economic" + 0.003*"price" + 0.003*"need" + 0.003*"fact" + 0.003*"energy" + 0.003*"new" + 0.003*"pm"
Topic: 4 
Words: 0.032*"economy" + 0.006*"among" + 0.006*"picture" + 0.006*"loss" + 0.006*"pressure" + 0.006*"money" + 0.006*"land" + 0.006*"test" + 0.006*"benefit" + 0.006*"debate"
Topic: 5 
Words: 0.049*"economy" + 0.006*"business" + 0.006*"—" + 0.006*"want" + 0.005*"via" + 0.005*"u" + 0.0

In [22]:
from tqdm import tqdm
def format_topics_sentences(ldamodel=lda_model, corpus=bow_corpus, texts=tweets["tweet"]):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in tqdm(enumerate(ldamodel[corpus])):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [23]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=bow_corpus, texts=tweets["tweet"])
df_topic_sents_keywords

988it [00:01, 858.75it/s]


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,tweet
0,4.0,0.9735,"economy, among, picture, loss, pressure, money...","Excited to get our first episode of ""Trips aro..."
1,5.0,0.9719,"economy, business, —, want, via, u, inflation,...",⚡️Shmyhal: Ukraine's GDP may drop by 30-50% du...
2,3.0,0.9307,"economy, amp, inflation, economic, price, need...",#latestnews Russian economy crumbling as offic...
3,2.0,0.9500,"economy, gas, u, business, amp, tax, would, ru...",$VRT - Vertiv Holdings' (VRT) CEO Rob Johnson ...
4,5.0,0.9690,"economy, business, —, want, via, u, inflation,...","https://t.co/z7hHiOu6jQ In 2019, Latinos adde..."
...,...,...,...,...
983,3.0,0.9667,"economy, amp, inflation, economic, price, need...",Countries in #SouthAsia should steer away from...
984,8.0,0.7136,"economy, u, tax, people, make, nation, country...",#Airfare Deal: [American] New York - Dallas (a...
985,7.0,0.9308,"economy, people, war, ukraine, economic, world...",Naturally we are aware of the strength of our ...
986,0.0,0.9571,"economy, could, never, growth, good, get, bill...",But it's good that someone of international si...


In [24]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=docs_clean, dictionary=lda_model.id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.32312738143081715
