In [1]:
# importing libraries for the API connection

import tweepy
from tweepy.auth import OAuthHandler
import pandas as pd

In [2]:
# generating a dict with the keys for the API connection

secrets_dict={}
secrets_file = open('tweepy-keys.txt')
for line in secrets_file:
  (key,value) = line.split(':')
  secrets_dict[key] = value[:-1]

In [3]:
# creating the API cursor

auth = tweepy.OAuthHandler(secrets_dict['API Key'], secrets_dict['API secret'])
auth.set_access_token(secrets_dict['Access token'], secrets_dict['Access secret'])
api = tweepy.API(auth)

In [4]:
def scrape(words, date_since, numtweet):

# Creating DataFrame using pandas
  db = pd.DataFrame(columns=['username', 'description', 'location', 'following',
              'followers', 'totaltweets', 'retweetcount', 'text', 'hashtags'])

# We are using .Cursor() to search through twitter for the required tweets.
# The number of tweets can be restricted using .items(number of tweets)
  tweets = tweepy.Cursor(api.search, q=words, lang="en",
            since=date_since, tweet_mode='extended').items(numtweet)

# .Cursor() returns an iterable object. Each item in
# the iterator has various attributes that you can access to
# get information about each tweet
  list_tweets = [tweet for tweet in tweets]

# Counter to maintain Tweet Count
  i = 1

# we will iterate over each tweet in the list for extracting information about each tweet
  for tweet in list_tweets:
    username = tweet.user.screen_name
    description = tweet.user.description
    location = tweet.user.location
    following = tweet.user.friends_count
    followers = tweet.user.followers_count
    totaltweets = tweet.user.statuses_count
    retweetcount = tweet.retweet_count
    hashtags = tweet.entities['hashtags']
  
# Retweets can be distinguished by a retweeted_status attribute,
# in case it is an invalid reference, except block will be executed
    try:
      text = tweet.retweeted_status.full_text
    except AttributeError:
      text = tweet.full_text
    hashtext = list()
    for j in range(0, len(hashtags)):
      hashtext.append(hashtags[j]['text'])

    # Here we are appending all the extracted information in the DataFrame
    ith_tweet = [username, description, location, following,
          followers, totaltweets, retweetcount, text, hashtext]
    db.loc[len(db)] = ith_tweet

  

# we will save our database as a CSV file.
  return db

In [33]:
# defining the politicians to scrape for

politic = ['#JoeBiden','#BernieSanders','#MikePence','#TedCruz']

In [5]:
data = pd.read_csv('data_07_22.csv')

In [205]:
# define forloop that scrapes the necessary tweets and concatenates them

import time # twitter limitates the amount of scrapping you can do, so we need to do timesleep

date = '2021-07-01' # enter initial date

numtweet = 500 # number of tweets per hashtag

# we need to create a dataframe to deposit the tweets that we scrape

columns = ['target','Unnamed: 0', 'username', 'description', 'location', 'following',
       'followers', 'totaltweets', 'retweetcount', 'text', 'hashtags'] # this are the columns that for the dataframe

politic_df = pd.DataFrame(dict(), columns=columns) # creation of the dataframe

# this is the loop for the scrapping of every politician

for element in politic:
    subdata = scrape(element, date, numtweet)
    subdata['target'] = element
    politic_df = politic_df.append(subdata)
    time.sleep(60 * 15) # need to take it easy!
    
print('Scraping has completed!')
    

Scraping has completed!


In [208]:
# politic_df.to_csv('data_07_22.csv')
# put counter on the string

In [6]:
politic_df = data

In [7]:
politic_df.dtypes

Unnamed: 0        int64
target           object
Unnamed: 0.1    float64
username         object
description      object
location         object
following         int64
followers         int64
totaltweets       int64
retweetcount      int64
text             object
hashtags         object
dtype: object

In [8]:
politic_df.head()

Unnamed: 0.2,Unnamed: 0,target,Unnamed: 0.1,username,description,location,following,followers,totaltweets,retweetcount,text,hashtags
0,0,#JoeBiden,,ReenyNY,Ma-DogMa-Comic-At-Law [insert inaccurate self-...,NY,2058,652,34395,0,"On #racist #JoeBiden, the illegally installed...","['racist', 'JoeBiden', 'ElectionHeistDemic']"
1,1,#JoeBiden,,SajidAnjumOakv1,News History Science Finance Technology Busine...,"Oakville, Ontario",489,143,4769,0,#JoeBiden @POTUS @KamalaHarris\nPublic officia...,"['JoeBiden', 'EidAdhaMubarak', 'EidMubarak']"
2,2,#JoeBiden,,criticsberate,⚠️ FOLLOW AT YOUR OWN RISK ⚠️,,1,1,466,0,Why work in #California or anywhere with #joeb...,"['California', 'joebiden', 'shoplifting', 'ste..."
3,3,#JoeBiden,,JimMaso02236687,Let's face it: #WikiLeaks exists because the m...,,197,1849,14759,0,Senile Idiot #JoeBiden Says the Quiet Part Out...,"['JoeBiden', '2a', 'gunrights', 'guncontrol', ..."
4,4,#JoeBiden,,Newslink7com,https://t.co/DH5ZdXbmak Read Know Talk Worldwi...,United States,51,810,123734,0,Joe Biden - Individual sanctions on Cuba regim...,"['Cuba', 'individual', 'JoeBiden', 'newslink7'..."


In [9]:
politic_df = politic_df[['target','text']]

In [10]:
# now we need to clean the tweets
# importing the necessaire packages


import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('wordnet')
from nltk.corpus import wordnet

nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# the main cleaning step

def clean_up(s):
    element1 = re.sub('(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', '', s) # remove links
    element2 = re.sub('[^a-zA-Z0-9]', ' ', element1) # remove non character symbols
    element3 = re.sub('amp', '', element2) # twitter has &amp as a special character
    return (re.sub('\d+',' ',element3)).lower() # remove any digits and lowercase everything

In [12]:
# tokenize the text

def tokenize(s):
    return word_tokenize(s)

In [13]:
# categorize function to help the next function that is lemmatize

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper() # gets first letter of POS categorization
    tag_dict = {"J": wordnet.ADJ, 
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN) # get returns second argument if first key does not exist

In [14]:
# lemmatize to reduce the word to it most radical form

def lemmatize(l):
  
    lem = WordNetLemmatizer()
    lemmatized = [lem.lemmatize(w,get_wordnet_pos(w)) for w in l]
    
    return lemmatized

In [15]:
# remove the stop words

def remove_stopwords(l):
    
    filtered_sentence = []
    
    for w in l:
        if w not in stopwords.words('english'):
            filtered_sentence.append(w)
    
    return filtered_sentence

In [16]:
politic_df['text_processed'] = politic_df['text'].apply(clean_up)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  politic_df['text_processed'] = politic_df['text'].apply(clean_up)


In [17]:
politic_df['text_processed'] = politic_df['text_processed'].apply(tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  politic_df['text_processed'] = politic_df['text_processed'].apply(tokenize)


In [18]:
politic_df['text_processed'] = politic_df['text_processed'].apply(lemmatize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  politic_df['text_processed'] = politic_df['text_processed'].apply(lemmatize)


In [19]:
politic_df['text_processed'] = politic_df['text_processed'].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  politic_df['text_processed'] = politic_df['text_processed'].apply(remove_stopwords)


In [22]:
# think about blobbing the results

from nltk.probability import FreqDist
import itertools

blob = politic_df['text_processed'].tolist()

blob = itertools.chain.from_iterable(blob)

fdist = FreqDist(blob)

common = dict(fdist.most_common(1000)) # review this number

most_common = list(common.keys())

most_common

['trump',
 'joebiden',
 'tedcruz',
 'penny',
 'berniesanders',
 'mikepence',
 'biden',
 'mike',
 'maga',
 'get',
 'democrat',
 'need',
 'jan',
 'trust',
 'cnn',
 'secret',
 'still',
 'service',
 'even',
 'good',
 'morningjoe',
 'ted',
 'th',
 'donald',
 'foxnews',
 'call',
 'covid',
 'u',
 'gop',
 'coup',
 'write',
 'aoc',
 'say',
 'fear',
 'protect',
 'book',
 'texas',
 'friend',
 'gallows',
 'morning',
 'ready',
 'friday',
 'cruz',
 'abt',
 'happy',
 'personal',
 'mary',
 'psycho',
 'make',
 'state',
 'republican',
 'bernie',
 'kevinmccarthy',
 'msnbc',
 'usa',
 'car',
 'gregabbott',
 'like',
 'want',
 'political',
 'rondesantis',
 'america',
 'block',
 'confirm',
 'variant',
 'kristinoem',
 'delta',
 'billlee',
 'dougducey',
 'mikeparson',
 'president',
 'point',
 'potus',
 'go',
 'blm',
 'joe',
 'lie',
 'move',
 'harm',
 'texan',
 'right',
 'sen',
 'warn',
 'laurenboebert',
 'would',
 'people',
 'tool',
 'blacklivesmatter',
 'know',
 'nominee',
 'lindseygraham',
 'marcorubio',
 'cu

In [24]:
# clean for the spams

# first we need to filter what is spam and what its not

def blobing(row):
    return " ".join(row)

politic_df['blobbed'] = politic_df['text_processed'].apply(blobing)

unique_tweets = dict(politic_df['blobbed'].value_counts()) 

spams = []

for x,y in unique_tweets.items():
    if y > 1:
        spams.append(x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  politic_df['blobbed'] = politic_df['text_processed'].apply(blobing)


In [25]:
politic_df['blobbed'].value_counts()

good morning happy friday mikepence even trust secret service protect jan th fear coup maga gallows ready still call donald trump personal friend mary trump need write book abt psycho mike penny morningjoe penny    147
move democrat republican warn harm u diplomacy sen ted cruz block nominee confirm vital state department role make political point ted absolute tool fvck tedcruz fatwolverine                                          60
mike penny trust secret service part trump conspiracy capitolattack secretservice mikepence                                                                                                                             24
coward mike penny use faith crutch witness trump evil never say much hypocrite silent die remain silent still trump lie penny mikepence good safe                                                                       23
january th mikepence refuse get secret service car trust get car trust tim drive car get vehicle guy take get car           

In [32]:
len(spams)

266

In [None]:
# add anything that was not included in that list

#spams.append('good morning happy friday mikepence even trust secret service protect jan th fear coup maga gallows ready still call donald trump personal friend mary trump need write book abt psycho mike penny morningjoe penny co rz sbyy')

In [28]:
# this line of code will delete the spams

counter = 0

politic_df = politic_df.reset_index()

for element in politic_df['blobbed']:
    if element in spams:
        politic_df.drop(index=counter, inplace=True)
    counter += 1

In [29]:
politic_df['blobbed'].value_counts()

joebiden need stop say serious really mean damn president stop qualifier tire old stop answer real question fact frickin year bro                                                                                                1
secondstomars jaredleto shannonleto thought guy berniesanders second bernie secondstomars thirtysecondstomars jaredleto shannonleto                                                                                              1
bernie still win thread electionnight election election bernie berniesanders                                                                                                                                                     1
dom family everything share dom clutch dom troptrev domtoretto domtorettomeme joebiden ppp funny meme tiktok fastandfurious fast trend cali                                                                                      1
beyond putin wild dream kremlin backing trump goal alter mass consciousness certain group go

In [30]:
len(politic_df)

907

In [31]:
data.shape

(2000, 12)

In [33]:
# now can start developing the features for our model

from sklearn.feature_extraction.text import CountVectorizer

# first i will vectorize all of the words in each tweet by the most common words

bow_vect = CountVectorizer(vocabulary=most_common, max_features=1000)

# fit creates one entry for each different word seen

X = bow_vect.fit_transform(politic_df['blobbed']).toarray()

In [None]:
# we want to standardize the features for improving the model

from sklearn.preprocessing import StandardScaler

# create object

scaler = StandardScaler()

# fit

scaler.fit(X)

# transform 

X_scaled = scaler.transform(as_df)