# Twitter Access 

## Twitter API access resources

- https://developer.twitter.com/en/docs/twitter-api/v1/rules-and-filtering/search-operators
- https://docs.tweepy.org/en/latest/
- https://docs.tweepy.org/en/latest/api.html#search-methods

In [None]:
# Installing package 'Tweepy' 

pip install -- tweepy

In [None]:
import os
import tweepy as tw
import pandas as pd

In [None]:
# Access token for twitter API
# appname <- "LSHTM_DC_2021"
# Keys and secrets are not stored in the notebook: each value is read from an
# environment variable and falls back to the blank placeholder when it is unset.

consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", " ")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", " ")

access_token = os.environ.get("TWITTER_ACCESS_TOKEN", " ")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", " ")

In [None]:
# Authenticating connection
# The handler is created from the app's consumer key/secret, then the access token/secret is attached to it.

auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# `api` is the client object the search and timeline cells below pass to tw.Cursor.
api = tw.API(auth, wait_on_rate_limit=True)

### Twitter general search

In [None]:
# Query: the keyword plus negated filters. A negated operator must be written
# with the minus sign directly attached ("-filter:media"); "- filter:media"
# is not a negation.
search_words = "lockdown" + " -filter:retweets" + " -filter:media"
result_type = "mixed"
# Date window for the search, as YYYY-MM-DD strings.
date_since = "2021-01-21"
until = "2021-01-26"

In [None]:
# Creating a cursor to access the API
# searching tweets
# NOTE: `since_id` expects a numeric tweet ID, not a date, so the lower date
# bound is expressed with the `since:` search operator inside the query.
tweets = tw.Cursor(
    api.search,
    q=search_words + " since:" + date_since,
    lang="en",
    until=until,
    result_type=result_type,
    tweet_mode="extended",
).items(500)
tweets

In [None]:
# Collect one row per tweet (id, full text, author handle, author location,
# creation time) and show the result as a dataframe.
users_locs = []
for tweet in tweets:
    author = tweet.user
    users_locs.append(
        [tweet.id, tweet.full_text, author.screen_name, author.location, tweet.created_at]
    )

tweet_df = pd.DataFrame(
    data=users_locs,
    columns=['tweet_id', 'text', 'user', "location", "date"],
)
tweet_df

In [None]:
# Save the search results. The dataframe built above is `tweet_df`;
# `tweet_text` is only defined later, when this CSV is read back in.
tweet_df.to_csv('/Users/eleanordavies/Desktop/tweet_lockdown_demo.csv', index=False)

### Twitter timeline search

In [None]:
# Settings for the timeline request below.
date_since = "2018-01-01"   # start of the period of interest (YYYY-MM-DD)
result_type = "mixed"
user_name = "BorisJohnson"  # screen name of the account whose timeline is fetched

In [None]:
# Page through the user's timeline, excluding replies and retweets.
# `user_timeline` has no date, language or result-type filter, and `since_id`
# must be a numeric tweet ID rather than a date, so the `date_since` lower
# bound is applied client-side on each tweet's creation date instead.
earliest_date = pd.Timestamp(date_since).date()
bojo = (
    tweet
    for tweet in tw.Cursor(
        api.user_timeline,
        screen_name=user_name,
        exclude_replies=True,
        include_rts=False,
        tweet_mode="extended",
    ).items(5000)
    if tweet.created_at.date() >= earliest_date
)
bojo

In [None]:
# Collect one row per timeline tweet (id, full text, author handle, author
# location, creation time) and show the result as a dataframe.
users_timeline_locs = []
for tweet in bojo:
    author = tweet.user
    users_timeline_locs.append(
        [tweet.id, tweet.full_text, author.screen_name, author.location, tweet.created_at]
    )

bojo_df = pd.DataFrame(
    data=users_timeline_locs,
    columns=['tweet_id', 'text', 'user', "location", "date"],
)
bojo_df

# Data Cleaning 

## Resources used for pre-processing 

- https://towardsdatascience.com/an-easy-tutorial-about-sentiment-analysis-with-deep-learning-and-keras-2bf52b9cba91

* Remove URLs from the tweets
* Tokenize text
* Remove emails
* Remove newline characters
* Remove distracting single quotes
* Remove all punctuation marks
* Lowercase all text
* Detokenize text
* Convert list of texts to Numpy array

In [None]:
# Importing relevant libraries 
import numpy as np
import re
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
import gensim
from gensim.utils import simple_preprocess
print('Done')

In [None]:
# Load data
# Reads the CSV at the same path the general-search cell saved its results to.

tweet_text = pd.read_csv('/Users/eleanordavies/Desktop/tweet_lockdown_demo.csv')

In [None]:
# Creating a function to remove characters

# Patterns are compiled once and written as raw strings, so that "\S" and "\s"
# reach the regex engine untouched instead of being parsed as (invalid)
# string escape sequences.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMAIL_PATTERN = re.compile(r'\S*@\S*\s?')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def depure_data(data):
    """Strip URLs, e-mail-like tokens, extra whitespace and single quotes.

    Steps, applied in this order:
      1. Remove URLs (``http://``, ``https://`` or ``www.`` prefixes).
      2. Remove every whitespace-delimited token containing ``@`` together
         with one trailing whitespace character. This covers e-mail
         addresses and also removes @mentions.
      3. Collapse each run of whitespace (including newlines) to one space.
      4. Remove single quote characters.

    Parameters:
        data: the text to clean, as a ``str``.

    Returns:
        The cleaned string. Leading/trailing spaces left behind by removed
        tokens are not trimmed.
    """
    data = _URL_PATTERN.sub('', data)
    data = _EMAIL_PATTERN.sub('', data)
    data = _WHITESPACE_PATTERN.sub(' ', data)
    return data.replace("'", "")

In [None]:
# Pull the tweet text column out of the dataframe as a plain Python list
data_to_list = tweet_text['text'].values.tolist()

# Clean every tweet with depure_data; iterating the list directly replaces
# the index-based append loop.
temp = [depure_data(text) for text in data_to_list]

tempdf = pd.DataFrame(temp)
tempdf.head()

In [None]:
# Tokenize: split each cleaned tweet into a list of lowercase word tokens.
# Punctuation is dropped by the tokenizer itself; deacc=True additionally
# strips accent marks from the tokens (per the gensim documentation).

def sent_to_words(sentences):
    """Yield one list of tokens for each item of *sentences*.

    Every item is converted with ``str()`` before being handed to gensim's
    ``simple_preprocess``.
    """
    for sentence in sentences:
        tokens = simple_preprocess(str(sentence), deacc=True)
        yield tokens


data_words = list(sent_to_words(temp))

print(data_words[:10])

In [None]:
def detokenize(text):
    """Join tokens back into one string with NLTK's TreebankWordDetokenizer.

    *text* is forwarded unchanged to ``TreebankWordDetokenizer.detokenize``;
    the calling cell passes it one list of word tokens at a time.
    """
    detokenizer = TreebankWordDetokenizer()
    joined = detokenizer.detokenize(text)
    return joined

In [None]:
# Rebuild one string per tweet from its token list, then wrap the strings in
# a NumPy array and a single-column dataframe. Iterating the token lists
# directly replaces the index-based append loop.
data_detoken = np.array([detokenize(tokens) for tokens in data_words])
data_detoken_df = pd.DataFrame(data_detoken)
data_detoken_df

In [None]:
# Save the detokenized tweets to CSV; index = False leaves the row index out of the file.
data_detoken_df.to_csv('/Users/eleanordavies/Desktop/df.csv', index = False)

In [None]:
# Put the cleaned text side by side with the original tweet columns, then
# give the cleaned-text column (labelled 0) a descriptive name.
frames = [data_detoken_df, tweet_text]
demo_tweets_clean = pd.concat(frames, axis="columns").rename(
    columns={0: "selected_tweets"}
)

In [None]:
# Save the combined dataframe (cleaned text plus original columns) to CSV, without the row index.
demo_tweets_clean.to_csv('/Users/eleanordavies/Desktop/demo_tweets_clean.csv', index = False)