In [1]:
import numpy as np
import pandas as pd
import os
from searchtweets import load_credentials, gen_request_parameters, collect_results

In [3]:
def get_tweets(term):
    credentials = load_credentials(env_overwrite=True)
    query = gen_request_parameters(f"{term} -is:retweet -is:reply lang:en", results_per_call=100, granularity=None)
    tweets = collect_results(query, max_tweets = 1000, result_stream_args=credentials)
    return tweets

def convert_to_df(messy_tweets):
    '''Turns messy json repsonse from get_tweets into a clean df.
    
    Args:
        messy_tweets (list of dicts extracted from json response): 'data' part of .json() parse
        
    Returns:
        df: df that has the extracted tweets
    '''
    clean = []
    
    for i in range(len(messy_tweets)):
        for j in range(len(messy_tweets[i]['data'])):
            clean.append(messy_tweets[i]['data'][j]['text'])
    
    df = pd.DataFrame(clean, columns = ['tweet'])
    df = df.drop_duplicates(ignore_index = True)
    return df

In [4]:
messy_tweets = get_tweets("Economy")
tweets = convert_to_df(messy_tweets)
tweets

cannot read file ~/.twitter_keys.yaml
Error parsing YAML file; searching for valid environment variables


Unnamed: 0,tweet
0,One thing that will never die in this economy ...
1,I just joined the @KoiiNetwork Web3 economy. P...
2,The latest The sharing economy Daily! https://...
3,"Your advice is rubbish Cde 4 Fingers, it shoul..."
4,The Great Resignation is rapidly turning into ...
...,...
950,"Culture First, Economy Next\n\n（I Suman）"
951,Economy Is Rebounding – Ken Ofori-Atta https:/...
952,"Russian Oil, Markets, And The US Economy (Podc..."
953,Economy Is Rebounding – Ken Ofori-Atta https:/...


In [14]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim

In [11]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

docs_clean = [clean(doc).split() for doc in tweets["tweet"]] # tokenized, put everything lowercase, removed stop words, removed punctuation, lemmatized everything.

In [15]:
dictionary = gensim.corpora.Dictionary(docs_clean)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs_clean]

In [16]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                         num_topics=10, 
                                         id2word=dictionary, 
                                         passes=4, 
                                         workers=4,
                                         random_state=21)

In [17]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.036*"economy" + 0.008*"business" + 0.007*"u" + 0.006*"make" + 0.005*"help" + 0.005*"—" + 0.004*"professional" + 0.004*"administrative" + 0.004*"work" + 0.004*"want"
Topic: 1 
Words: 0.017*"economy" + 0.004*"one" + 0.004*"today" + 0.004*"vote" + 0.003*"get" + 0.003*"information" + 0.003*"peter" + 0.003*"much" + 0.003*"want" + 0.003*"make"
Topic: 2 
Words: 0.033*"economy" + 0.005*"inflation" + 0.004*"u" + 0.004*"digital" + 0.004*"like" + 0.004*"great" + 0.004*"know" + 0.004*"year" + 0.004*"news" + 0.003*"tax"
Topic: 3 
Words: 0.033*"economy" + 0.010*"amp" + 0.008*"global" + 0.005*"gas" + 0.005*"world" + 0.005*"hit" + 0.004*"one" + 0.004*"good" + 0.004*"new" + 0.004*"russia"
Topic: 4 
Words: 0.028*"economy" + 0.011*"u" + 0.011*"trade" + 0.008*"power" + 0.008*"year" + 0.007*"amp" + 0.006*"deficit" + 0.006*"good" + 0.006*"billion" + 0.006*"jump"
Topic: 5 
Words: 0.038*"economy" + 0.009*"u" + 0.005*"help" + 0.003*"new" + 0.003*"business" + 0.003*"market" + 0.003*"year" + 0

In [18]:
from tqdm import tqdm
def format_topics_sentences(ldamodel=lda_model, corpus=bow_corpus, texts=tweets["tweet"]):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in tqdm(enumerate(ldamodel[corpus])):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [19]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=bow_corpus, texts=tweets["tweet"])
df_topic_sents_keywords

955it [00:01, 852.81it/s]


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,tweet
0,9.0,0.9250,"economy, year, inflation, people, word, via, r...",One thing that will never die in this economy ...
1,6.0,0.9250,"economy, future, pay, now, web3, attention, u,...",I just joined the @KoiiNetwork Web3 economy. P...
2,4.0,0.9100,"economy, u, trade, power, year, amp, deficit, ...",The latest The sharing economy Daily! https://...
3,1.0,0.9719,"economy, one, today, vote, get, information, p...","Your advice is rubbish Cde 4 Fingers, it shoul..."
4,2.0,0.9100,"economy, inflation, u, digital, like, great, k...",The Great Resignation is rapidly turning into ...
...,...,...,...,...
950,9.0,0.8714,"economy, year, inflation, people, word, via, r...","Culture First, Economy Next\n\n（I Suman）"
951,3.0,0.5691,"economy, amp, global, gas, world, hit, one, go...",Economy Is Rebounding – Ken Ofori-Atta https:/...
952,9.0,0.7822,"economy, year, inflation, people, word, via, r...","Russian Oil, Markets, And The US Economy (Podc..."
953,3.0,0.8714,"economy, amp, global, gas, world, hit, one, go...",Economy Is Rebounding – Ken Ofori-Atta https:/...
