In [123]:
import numpy as np
import pandas as pd
import os
from searchtweets import load_credentials, gen_request_parameters, collect_results

In [125]:
def get_tweets(term):
    credentials = load_credentials(env_overwrite=True)
    query = gen_request_parameters(f"{term} -is:retweet -is:reply lang:en", results_per_call=100, granularity=None)
    tweets = collect_results(query, max_tweets = 1000, result_stream_args=credentials)
    return tweets

def convert_to_df(messy_tweets):
    '''Turns messy json repsonse from get_tweets into a clean df.
    
    Args:
        messy_tweets (list of dicts extracted from json response): 'data' part of .json() parse
        
    Returns:
        df: df that has the extracted tweets
    '''
    clean = []
    
    for i in range(len(messy_tweets)):
        for j in range(len(messy_tweets[i]['data'])):
            clean.append(messy_tweets[i]['data'][j]['text'])
    
    df = pd.DataFrame(clean, columns = ['tweet'])
    df = df.drop_duplicates(ignore_index = True)
    return df

In [126]:
messy_tweets = get_tweets("Biden")
tweets = convert_to_df(messy_tweets)
tweets

cannot read file ~/.twitter_keys.yaml
Error parsing YAML file; searching for valid environment variables


Unnamed: 0,tweet
0,Biden flips the switch on Trump's incandescent...
1,Jen Psaki has no information that Joe Biden ev...
2,Joe Biden still hasn’t tear-gassed a church pa...
3,Forced Masking Ruled Unconstitutional: Biden P...
4,How Biden Can Halt the U.S. Postal Service’s G...
...,...
992,‘Joe Biden’ is an actor.
993,"Once again, I just really want to thank the 80..."
994,Biden and Obama are responsible for the deaths...
995,GOP Planning To Launch Impeachment Of DHS Secr...


# Clean tweets

In [127]:
import nltk
from nltk.corpus import stopwords
import string 
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
punctuations = string.punctuation
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leocorelli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/leocorelli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leocorelli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [128]:
test_tweets = tweets.copy()

In [129]:
test_tweets["tweet"] = test_tweets["tweet"].apply(lambda x: ' '.join(wordnet_lemmatizer.lemmatize(word).lower().strip() for word in x.split() if word.lower() not in stop_words and word[0]!="@" and word[:5]!="https"))
test_tweets

Unnamed: 0,tweet
0,biden flip switch trump's incandescent light b...
1,jen psaki information joe biden ever spoke hun...
2,joe biden still hasn’t tear-gassed church park...
3,forced masking ruled unconstitutional: biden p...
4,biden halt u.s. postal service’s gas-guzzling ...
...,...
992,‘joe biden’ actor.
993,"again, really want thank 80,000 voter maine vo..."
994,biden obama responsible death seal team 6 exto...
995,gop planning launch impeachment dhs secretary ...


# Bertopic

In [130]:
from bertopic import BERTopic

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(test_tweets["tweet"])

In [131]:
info = topic_model.get_topic_info()

In [133]:
test_df = pd.concat([tweets["tweet"], test_tweets["tweet"]],axis=1)
test_df.columns = ["original_tweet","cleaned_tweet"]
test_df

Unnamed: 0,original_tweet,cleaned_tweet
0,Biden flips the switch on Trump's incandescent...,biden flip switch trump's incandescent light b...
1,Jen Psaki has no information that Joe Biden ev...,jen psaki information joe biden ever spoke hun...
2,Joe Biden still hasn’t tear-gassed a church pa...,joe biden still hasn’t tear-gassed church park...
3,Forced Masking Ruled Unconstitutional: Biden P...,forced masking ruled unconstitutional: biden p...
4,How Biden Can Halt the U.S. Postal Service’s G...,biden halt u.s. postal service’s gas-guzzling ...
...,...,...
992,‘Joe Biden’ is an actor.,‘joe biden’ actor.
993,"Once again, I just really want to thank the 80...","again, really want thank 80,000 voter maine vo..."
994,Biden and Obama are responsible for the deaths...,biden obama responsible death seal team 6 exto...
995,GOP Planning To Launch Impeachment Of DHS Secr...,gop planning launch impeachment dhs secretary ...


In [136]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,214,-1_biden_president_us_amp
1,0,164,0_biden_election_trump_2020
2,1,75,1_joe_biden_fuck_supporter
3,2,71,2_laptop_story_twitter_hunter
4,3,50,3_income_unexplained_reveal_financial
5,4,38,4_student_loan_debt_canceling
6,5,34,5_ha_garland_old_beings
7,6,31,6_ukraine_russia_putin_nato
8,7,30,7_border_illegal_immigration_bidens
9,8,29,8_partner_business_white_house


In [135]:
topic_model.get_representative_docs()

{14: ['biden administration phase inefficient lightbulb championed trump via',
  'biden administration phase inefficient lightbulb championed trump via',
  "funny article claim manufacturer want make sell incandescent light bulb &amp; consumer really want buy bulbs. true, federal gov. (@potus biden's need make rule?"],
 8: ['joe biden met hunter biden business partner white house via',
  'white house visitor logs reveal meetings hunter biden business partner joe biden',
  'hunter biden business partner made 8 additional white house visit 2016, visitor log show'],
 1: ['wanna see joe biden smoke weed', 'fuck joe biden', 'fuck joe biden'],
 9: ['gasoline price high enough.',
  'report, u.s. gas exports triple u.s. gas production, low gas reserves sends prices soaring - biden effect!',
  'thanx 2 biden! $58 year ago...not tank gas.'],
 5: ["anyone ever think garland would ever investigate hunter biden joe biden that's attorney general evil president joe biden son hunter. remember twitter 

In [107]:
def get_clusters(info):
    
    clusters = info.copy()
    clusters.sort_values(by="Topic")
    
    
    clusters["clean_clusters"] = 0
    
    for i in range(clusters.shape[0]):
        clusters["clean_clusters"].loc[i] = clusters["Name"].loc[i].split("_")[1:]
        
    clusters = clusters[clusters["Topic"]!=-1] # remove unclustered items
    clusters = clusters.reset_index(drop=True)
    
    return clusters

In [108]:
this = get_clusters(info)
this

Unnamed: 0,Topic,Count,Name,clean_clusters
0,0,71,0_biden_is_great_good,"[biden, is, great, good]"
1,1,70,1_pardon_man_clemency_first,"[pardon, man, clemency, first]"
2,2,69,2_border_illegal_immigration_administration,"[border, illegal, immigration, administration]"
3,3,69,3_ukraine_war_putin_climate,"[ukraine, war, putin, climate]"
4,4,64,4_student_debt_loan_cancel,"[student, debt, loan, cancel]"
5,5,62,5_twitter_musk_elon_censored,"[twitter, musk, elon, censored]"
6,6,55,6_joe_biden_fuck_like,"[joe, biden, fuck, like]"
7,7,37,7_unexplained_income_financial_reveal,"[unexplained, income, financial, reveal]"
8,8,34,8_harris_kamala_close_positive,"[harris, kamala, close, positive]"
9,9,33,9_covid_vaccine_covid19_funding,"[covid, vaccine, covid19, funding]"
