In [1]:
# importing libraries for the API connection

import tweepy
from tweepy.auth import OAuthHandler
import pandas as pd

In [2]:
# generating a dict with the keys for the API connection

secrets_dict={}
secrets_file = open('tweepy-keys.txt')
for line in secrets_file:
  (key,value) = line.split(':')
  secrets_dict[key] = value[:-1]

In [3]:
# creating the API cursor

auth = tweepy.OAuthHandler(secrets_dict['API Key'], secrets_dict['API secret'])
auth.set_access_token(secrets_dict['Access token'], secrets_dict['Access secret'])
api = tweepy.API(auth)

In [4]:
def scrape(words, date_since, numtweet):

# Creating DataFrame using pandas
  db = pd.DataFrame(columns=['username', 'description', 'location', 'following',
              'followers', 'totaltweets', 'retweetcount', 'text', 'hashtags'])

# We are using .Cursor() to search through twitter for the required tweets.
# The number of tweets can be restricted using .items(number of tweets)
  tweets = tweepy.Cursor(api.search, q=words, lang="en",
            since=date_since, tweet_mode='extended').items(numtweet)

# .Cursor() returns an iterable object. Each item in
# the iterator has various attributes that you can access to
# get information about each tweet
  list_tweets = [tweet for tweet in tweets]

# Counter to maintain Tweet Count
  i = 1

# we will iterate over each tweet in the list for extracting information about each tweet
  for tweet in list_tweets:
    username = tweet.user.screen_name
    description = tweet.user.description
    location = tweet.user.location
    following = tweet.user.friends_count
    followers = tweet.user.followers_count
    totaltweets = tweet.user.statuses_count
    retweetcount = tweet.retweet_count
    hashtags = tweet.entities['hashtags']
  
# Retweets can be distinguished by a retweeted_status attribute,
# in case it is an invalid reference, except block will be executed
    try:
      text = tweet.retweeted_status.full_text
    except AttributeError:
      text = tweet.full_text
    hashtext = list()
    for j in range(0, len(hashtags)):
      hashtext.append(hashtags[j]['text'])

    # Here we are appending all the extracted information in the DataFrame
    ith_tweet = [username, description, location, following,
          followers, totaltweets, retweetcount, text, hashtext]
    db.loc[len(db)] = ith_tweet

  

# we will save our database as a CSV file.
  return db

In [5]:
# define forloop that scrapes the necessary tweets and concatenates them

import time # twitter limitates the amount of scrapping you can do, so we need to do timesleep
from datetime import date

# politic is the list of politicians to scrape for
# Enter Date since The Tweets are required in yyyy-mm-dd
# The minutes its the time it will spend between each iteration of the scrape
# numtweet is the number of tweets for each hashtag
# file is the older file that i might have and i want to concatenate with

# WARNING: Remove the default politicians before deploying

def scrapping_engine(date_,politic=['#JoeBiden','#BernieSanders','#MikePence','#TedCruz'], minutes=15, numtweet=500, file=0):

    # we need to create a dataframe to deposit the tweets that we scrape

    columns = ['target','Unnamed: 0', 'username', 'description', 'location', 'following',
           'followers', 'totaltweets', 'retweetcount', 'text', 'hashtags'] # this are the columns that for the dataframe

    politic_df = pd.DataFrame(dict(), columns=columns) # creation of the dataframe

    # this is the loop for the scrapping of every politician

    for element in politic:
        subdata = scrape(element, date_, numtweet)
        subdata['target'] = element
        politic_df = politic_df.append(subdata)
        time.sleep(60 * minutes) # need to take it easy!
        
    csv = file
    
    politic_df = appending(politic_df, csv)
    
    spams_count = dict(politic_df['text'].value_counts())

    spams = []

    for x,y in spams_count.items():
        if y > 1:
            spams.append(x)

    counter = 0

    politic_df = politic_df.reset_index()

    for element in politic_df['text']:
        if element in spams:
            politic_df.drop(index=counter, inplace=True)
        counter += 1
    
    new_date = 'scraped_from_' + date_ + '_to_' + str(date.today())
    
    if csv != 0:
        politic_df.to_csv(new_date + '.csv', index_label=False)
        return politic_df
    else:
        politic_df.to_csv(new_date + '.csv', index_label=False)
        return politic_df


In [6]:
# define a function that adds the newly scraped file to the rest

def appending(db, file):
    older = pd.read_csv(file)
    return db.append(older) # returns the new database appended

In [7]:
def politicians_input():
    print('Enter the politicians names (separated by comma, E.g: ´Joe Biden, Bernie Sanders´): ')
    string_ = input()
    string = string_.replace(' ','')
    list_strings = string.split(',')

    return [('#' + x) for x in list_strings]

In [None]:
to_scrape = politicians_input()

# do a function for date input

politic_df = scrapping_engine('2021-07-22', to_scrape, 0.1, 10, 'data.csv')

In [10]:
politic_df = pd.read_csv('scraped_from_2021-07-22_to_2021-07-23.csv')

In [11]:
politic_df.shape

(1049, 13)

In [12]:
politic_df.head()

Unnamed: 0.2,index,target,Unnamed: 0,username,description,location,following,followers,totaltweets,retweetcount,text,hashtags,Unnamed: 0.1
0,0,#JoeBiden,,Michell81478626,"I live in Minnesota, I love my cat, dog and bi...","Minnesota, USA",69,12,60,2,#Congress #chuckSchumer #nancyPelosi #democrat...,"['Congress', 'chuckSchumer', 'nancyPelosi', 'd...",
2,2,#JoeBiden,,DerekOsheaShow,Derek O'Shea Show Politically Homeless Daily C...,Everywhere you want to be,3435,781,5095,0,Who is Defunding Police | Conspiracy Theories ...,"['Covid19News', 'JoeBiden', 'BreakingNews', 'p...",
3,3,#JoeBiden,,Tim20026046,"White-haired, upper fifties, libertarian dad, ...",,485,17,4239,98,"#JoeBiden Tells The World Another Whopper ""You...",['JoeBiden'],
4,4,#JoeBiden,,lilgoddamn_III,#bitcoin\n§LilGoddamn,The Moon 🌕,900,297,28972,57,Democrats Future Agenda...\n\n#NacyPelosi :\nL...,"['NacyPelosi', 'JoeBiden']",
5,5,#JoeBiden,,LisaMaret,"Founder, Tea Party WDC ""Return 2 commerce, cha...",Alexandria Va,1446,1735,69535,0,"@taxreformer Big surprise, the IRS doesn't aud...","['LarcenousClass', 'Democrats', 'JoeBiden', 'P...",


In [13]:
# now we need to clean the tweets
# importing the necessaire packages


import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('wordnet')
from nltk.corpus import wordnet

nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# the main cleaning step

def clean_up(s):
    element1 = re.sub('(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', '', s) # remove links
    element2 = re.sub('[^a-zA-Z0-9]', ' ', element1) # remove non character symbols
    element3 = re.sub('amp', '', element2) # twitter has &amp as a special character
    return (re.sub('\d+',' ',element3)).lower() # remove any digits and lowercase everything

In [15]:
# tokenize the text

def tokenize(s):
    return word_tokenize(s)

In [16]:
# categorize function to help the next function that is lemmatize

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper() # gets first letter of POS categorization
    tag_dict = {"J": wordnet.ADJ, 
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN) # get returns second argument if first key does not exist

In [17]:
# lemmatize to reduce the word to it most radical form

def lemmatize(l):
  
    lem = WordNetLemmatizer()
    lemmatized = [lem.lemmatize(w,get_wordnet_pos(w)) for w in l]
    
    return lemmatized

In [21]:
# remove the stop words

def remove_stopwords(l):
    
    filtered_sentence = []
    
    for w in l:
        if len(w) > 1:
            if w not in stopwords.words('english'):
                filtered_sentence.append(w)
    
    return filtered_sentence

In [19]:
# function that combines all of the cleaning functions and does everything

def cleaning_engine(df,column='text'):
    df['text_processed'] = df[column].apply(clean_up)
    df['text_processed'] = df['text_processed'].apply(tokenize)
    df['text_processed'] = df['text_processed'].apply(lemmatize)
    return df['text_processed'].apply(remove_stopwords)
    

In [22]:
politic_df['text_processed'] = cleaning_engine(politic_df)

In [23]:
politic_df['text_processed'].head()

0    [congress, chuckschumer, nancypelosi, democrat...
2    [defunding, police, conspiracy, theory, town, ...
3    [joebiden, tell, world, another, whopper, get,...
4    [democrat, future, nacypelosi, latin, future, ...
5    [taxreformer, big, surprise, irs, audit, larce...
Name: text_processed, dtype: object

In [24]:
politic_df['text_preprocessed'] = politic_df['text'].apply(clean_up)

In [28]:
test = politic_df['text_preprocessed'][0].strip()

In [91]:
features = politic_df[['target','text_processed','text_preprocessed']].reset_index()
features.head()

Unnamed: 0,index,target,text_processed,text_preprocessed
0,0,#JoeBiden,"[congress, chuckschumer, nancypelosi, democrat...",congress chuckschumer nancypelosi democrat...
1,2,#JoeBiden,"[defunding, police, conspiracy, theory, town, ...",who is defunding police conspiracy theories ...
2,3,#JoeBiden,"[joebiden, tell, world, another, whopper, get,...",joebiden tells the world another whopper you...
3,4,#JoeBiden,"[democrat, future, nacypelosi, latin, future, ...",democrats future a nacypelosi latin s are ...
4,5,#JoeBiden,"[taxreformer, big, surprise, irs, audit, larce...",taxreformer big surprise the irs doesn t aud...


In [139]:
features['text_preprocessed'][0]

' congress  chuckschumer  nancypelosi  democrats  joebiden  kamalaharris  purplepower the time for action in now  no more vacations  no luncheons it s time to fix this country and get them back on their feet  back to work and out of the bread lines multiple stimulus checks now '

In [106]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from pprint import pprint

blob = politic_df['text_processed'].tolist()

whole = []

for element in blob:
    whole += element

text = ' '.join(whole)

doc = nlp(text)


labels = [x.label_ for x in doc.ents]
display(Counter(labels))

items = [x.text for x in doc.ents]
counter_items = dict(Counter(items))

Counter({'ORG': 488,
         'NORP': 246,
         'PERSON': 450,
         'GPE': 404,
         'LOC': 16,
         'DATE': 68,
         'PRODUCT': 36,
         'CARDINAL': 29,
         'ORDINAL': 14,
         'TIME': 11,
         'WORK_OF_ART': 3,
         'MONEY': 3})

In [107]:
entities = []

for x,y in counter_items.items():
    if y >= 10:
        entities.append(x)
        
entities
    

['democrat',
 'america',
 'american',
 'cuba',
 'nyc',
 'texan',
 'mike penny',
 'joe biden',
 'cnn',
 'gop',
 'florida',
 'senate',
 'texas',
 'republican',
 'one',
 'china',
 'first',
 'usa',
 'today',
 'msnbc',
 'cuban',
 'ted cruz',
 'morningjoe mikeparson',
 'dougducey marcorubio delta variant kristinoem',
 'cbs',
 'republicoftexas',
 'mitchmcconnell',
 'kristinoem',
 'kevinmccarthy majorietaylorgreene mikeparson',
 'marcorubio kristinoem',
 'dougducey',
 'abc cnn',
 'msnbc gop']

In [133]:
entities

['democrat',
 'america',
 'american',
 'cuba',
 'nyc',
 'texan',
 'mike penny',
 'joe biden',
 'cnn',
 'gop',
 'florida',
 'senate',
 'texas',
 'republican',
 'one',
 'china',
 'first',
 'usa',
 'today',
 'msnbc',
 'cuban',
 'ted cruz',
 'cbs',
 'republicoftexas',
 'mitchmcconnell',
 'kristinoem',
 'dougducey']

In [134]:
from sklearn.feature_extraction.text import CountVectorizer

bow_vect = CountVectorizer(vocabulary=entities)

X = bow_vect.fit_transform(features['text_preprocessed']).toarray()

In [135]:
X = pd.DataFrame(X, columns=entities)

In [136]:
X.head(1)

Unnamed: 0,democrat,america,american,cuba,nyc,texan,mike penny,joe biden,cnn,gop,...,usa,today,msnbc,cuban,ted cruz,cbs,republicoftexas,mitchmcconnell,kristinoem,dougducey
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [112]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def is_positive(tweet):
    if sia.polarity_scores(tweet)["compound"] > 0:
        return 1
    return 0

features['sentiment'] = features['text_preprocessed'].apply(is_positive)

In [113]:
features.head()

Unnamed: 0,index,target,text_processed,text_preprocessed,sentiment
0,0,#JoeBiden,"[congress, chuckschumer, nancypelosi, democrat...",congress chuckschumer nancypelosi democrat...,0
1,2,#JoeBiden,"[defunding, police, conspiracy, theory, town, ...",who is defunding police conspiracy theories ...,0
2,3,#JoeBiden,"[joebiden, tell, world, another, whopper, get,...",joebiden tells the world another whopper you...,0
3,4,#JoeBiden,"[democrat, future, nacypelosi, latin, future, ...",democrats future a nacypelosi latin s are ...,1
4,5,#JoeBiden,"[taxreformer, big, surprise, irs, audit, larce...",taxreformer big surprise the irs doesn t aud...,1


In [120]:
def intensity(tweet):
    return (abs(sia.polarity_scores(tweet)["compound"])+1)**2

features['intensity'] = features['text_preprocessed'].apply(intensity)


In [123]:
features.head()

Unnamed: 0,index,target,text_processed,text_preprocessed,sentiment,intensity
0,0,#JoeBiden,"[congress, chuckschumer, nancypelosi, democrat...",congress chuckschumer nancypelosi democrat...,0,2.330813
1,2,#JoeBiden,"[defunding, police, conspiracy, theory, town, ...",who is defunding police conspiracy theories ...,0,1.0
2,3,#JoeBiden,"[joebiden, tell, world, another, whopper, get,...",joebiden tells the world another whopper you...,0,2.378689
3,4,#JoeBiden,"[democrat, future, nacypelosi, latin, future, ...",democrats future a nacypelosi latin s are ...,1,1.503812
4,5,#JoeBiden,"[taxreformer, big, surprise, irs, audit, larce...",taxreformer big surprise the irs doesn t aud...,1,1.621038


In [86]:
finder = nltk.collocations.BigramCollocationFinder.from_words(tokenize(text))
finder.ngram_fd.most_common(10)

[(('joe', 'biden'), 41),
 (('town', 'hall'), 22),
 (('president', 'joebiden'), 17),
 (('joebiden', 'trump'), 15),
 (('trump', 'presidenttrump'), 14),
 (('presidenttrump', 'news'), 14),
 (('news', 'voterfraud'), 14),
 (('voterfraud', 'chinavirus'), 14),
 (('chinavirus', 'covid'), 14),
 (('potus', 'joebiden'), 13)]

In [None]:
# think about blobbing the results

from nltk.probability import FreqDist
import itertools

def get_most_common_words(amount, column=politic_df['text_processed']):

    blob = column.tolist()

    blob = itertools.chain.from_iterable(blob)

    fdist = FreqDist(blob)

    common = dict(fdist.most_common(amount)) # review this number

    return list(common.keys())

In [None]:
top1000 = get_most_common_words(1000)

In [None]:
top1000

In [None]:
# add anything that was not included in that list

#spams.append('good morning happy friday mikepence even trust secret service protect jan th fear coup maga gallows ready still call donald trump personal friend mary trump need write book abt psycho mike penny morningjoe penny co rz sbyy')

In [None]:
# now can start developing the features for our model

from sklearn.feature_extraction.text import CountVectorizer

# first i will vectorize all of the words in each tweet by the most common words

bow_vect = CountVectorizer(vocabulary=most_common, max_features=1000)

# fit creates one entry for each different word seen

X = bow_vect.fit_transform(politic_df['blobbed']).toarray()

In [None]:
# we want to standardize the features for improving the model

from sklearn.preprocessing import StandardScaler

# create object

scaler = StandardScaler()

# fit

scaler.fit(X)

# transform 

X_scaled = scaler.transform(as_df)