# Politician Sentiment Analysis - Preprocessing

Name:       Devin Patel  
Class:      CS 588 - 01  
Term:       FA 22  
Project:    Determining party alignment based on 2016 Election tweets.  
File Purpose: To pull tweets and generate subjectivity and polarity values,
              then export all data to a pickle file.

## Initialization

In [5]:
# Imports
import tweepy                   # Twitter API
import json                     # Twitter API config
from textblob import TextBlob   # Sentiment Analysis
import re                       # Cleaning Tweets
import pandas as pd             # Organizing data
import numpy as np


def authenticateTweepy(config_file_path):
    """
    Authenticates Tweepy API

    Args:
        config_file_path (string): Path to a JSON file that provides the
            'KEY', 'SECRET', 'TOKEN' and 'TOKENSECRET' entries.

    Returns:
        tweepy.api.API: API object (created with wait_on_rate_limit=True)

    Raises:
        OSError: The config file cannot be opened.
        json.JSONDecodeError: The config file is not valid JSON.
        KeyError: One of the four credential entries is missing.
    """
    # Context manager guarantees the config file handle is closed, even when
    # the JSON turns out to be malformed.
    with open(config_file_path, 'r') as config_file:
        config = json.load(config_file)

    authenticate = tweepy.OAuthHandler(config['KEY'], config['SECRET'])
    authenticate.set_access_token(config['TOKEN'], config['TOKENSECRET'])
    return tweepy.API(authenticate, wait_on_rate_limit=True)


# Module-level Tweepy client built from the credentials in auth/config.json;
# importStatuses() reads this global when looking up tweets.
api = authenticateTweepy(r"auth/config.json")
RANDOM_STATE = 12  # Initialization for methods needing a random_state value
TWEETS_AMOUNT = 110 # Number of tweets to fetch from each timeline file (passed as num_tweets)


In [12]:
# Load status IDs from text files and import tweets
def importStatuses(dataPath, num_tweets, batch_size=100):
    """
    Loads status IDs from a text file and fetches the matching tweets.

    IDs are looked up in batches through the module-level ``api`` client.
    Retweets (text starting with "RT ") are skipped individually and recorded
    in the failure log; collection stops once ``num_tweets`` statuses have
    been gathered.

    Args:
        dataPath (string): Path to a text file with one status ID per line.
            Blank lines are ignored.
        num_tweets (int): Maximum number of statuses to collect.
        batch_size (int): Number of IDs sent per lookup request. Defaults to
            100, the chunk size this function has always used.

    Returns:
        tuple: (statuses, failed_statuses) where statuses is the list of
        status objects returned by the API and failed_statuses is a list of
        "[FAIL] ..." message strings.

    Raises:
        OSError: The ID file cannot be opened.
        ValueError: batch_size is 0.
    """
    statuses = []
    failed_statuses = []

    # Read all IDs up front so the file is not held open during the (slow,
    # rate-limited) API calls. Empty lines would otherwise be sent as IDs.
    with open(dataPath, 'r') as id_file:
        all_candidate_ids = [line.strip() for line in id_file if line.strip()]

    for start in range(0, len(all_candidate_ids), batch_size):
        if len(statuses) >= num_tweets:
            break
        candidate_ids = all_candidate_ids[start:start + batch_size]

        # Best-effort by design: a failing batch is logged and the import
        # moves on to the next one instead of aborting the whole job.
        try:
            searched_statuses = api.lookup_statuses(id=candidate_ids, tweet_mode='extended')
            for status in searched_statuses:
                if len(statuses) >= num_tweets:
                    break
                if status.full_text.startswith("RT "):
                    # Skip only this tweet; the rest of the batch is still usable.
                    failed_statuses.append(f"[FAIL] id='{status.id}':  Status is a retweet.")
                    continue
                statuses.append(status)
        except Exception as e:
            # Identify the batch by its first/last ID so the log entry can be
            # traced back to the input file.
            failed_statuses.append(
                f"[FAIL] ids='{candidate_ids[0]}'..'{candidate_ids[-1]}' "
                f"({len(candidate_ids)} ids):  {e}"
            )

        print(f"[JOB STATUS] '{dataPath}': {len(statuses)}/{num_tweets}", end='\r')
    print()
    return statuses, failed_statuses

# Loops through to collect data
def collectStatusData(statuses, party):
    """
    Splits a sequence of statuses into four parallel lists.

    Args:
        statuses (iterable): Status objects exposing full_text,
            favorite_count and retweet_count.
        party (int): Label attached to every status
            (0 = Republican, 1 = Democrat).

    Returns:
        tuple: (text_data, favorite_data, retweet_data, party_data), four
        lists with one entry per status, in input order.
    """
    # One record per tweet, gathered in a single pass over the input.
    rows = [
        (tweet.full_text, tweet.favorite_count, tweet.retweet_count, party)
        for tweet in statuses
    ]
    if not rows:
        return [], [], [], []

    # Transpose the records into one list per field.
    texts, favorites, retweets, parties = (list(column) for column in zip(*rows))
    return texts, favorites, retweets, parties


# Start import code here

# Output locations
export_data_path = r'Data/TweetData.pkl'
import_errors_path = r"Logs/import-errors.txt"

# Failure messages from all four imports are accumulated here
failed_statuses = []

# Democrat 2016 candidate tweets
democrat_candidate_statuses, r_fail_statuses = importStatuses(
    r"Data/democratic-candidate-timelines.txt", TWEETS_AMOUNT)
failed_statuses.extend(r_fail_statuses)
print("Completed Democrat Candidate Statuses\n")

# Democrat 2016 party tweets
democrat_party_statuses, r_fail_statuses = importStatuses(
    r"Data/democratic-party-timelines.txt", TWEETS_AMOUNT)
failed_statuses.extend(r_fail_statuses)
print("Completed Democrat Party Statuses\n")

# Republican 2016 candidate tweets
republican_candidate_statuses, r_fail_statuses = importStatuses(
    r"Data/republican-candidate-timelines.txt", TWEETS_AMOUNT)
failed_statuses.extend(r_fail_statuses)
print("Completed Republican Candidate Statuses\n")

# Republican 2016 party tweets
republican_party_statuses, r_fail_statuses = importStatuses(
    r"Data/republican-party-timelines.txt", TWEETS_AMOUNT)
failed_statuses.extend(r_fail_statuses)
print("Completed Republican Party Statuses\n")

# Per-group column tuples; party label: 1 = Democrat, 0 = Republican
r_data = [
    collectStatusData(democrat_candidate_statuses, party=1),
    collectStatusData(democrat_party_statuses, party=1),
    collectStatusData(republican_candidate_statuses, party=0),
    collectStatusData(republican_party_statuses, party=0),
]

# Concatenate the groups column by column, keeping group order
text_data, favorite_data, retweet_data, party_data = [], [], [], []
for texts, favorites, retweets, parties in r_data:
    text_data.extend(texts)
    favorite_data.extend(favorites)
    retweet_data.extend(retweets)
    party_data.extend(parties)

# Numeric features and labels go in df; the raw text is kept separately in text_df
data = {
    'Likes': favorite_data,
    'Retweets': retweet_data,
    'Party': party_data,
}

df = pd.DataFrame(data)
text_df = pd.DataFrame(text_data, columns=['Text'])

# Write one failure message per line to the error log
with open(import_errors_path, 'w') as w:
    w.writelines(f"{message}\n" for message in failed_statuses)

[JOB STATUS] 'Data/democratic-candidate-timelines.txt': 74/110

KeyboardInterrupt: 

In [None]:
# Clean text
def cleanText(text):
    """
    Cleans text by removing @mentions, hashtag symbols, and hyperlinks.

    Only the matched pieces are removed; surrounding whitespace is left
    untouched, so removed items can leave extra spaces behind.

    Args:
        text (string): Text to clean

    Returns:
        string: Cleaned text
    """
    # Underscore is included so handles such as @john_doe are removed whole
    # instead of leaving "_doe" behind.
    text = re.sub(r"@[A-Za-z0-9_]+", '', text)  # Removes @mentions
    text = re.sub(r"#", '', text)               # Removes hashtag symbol (keeps the tag word)
    text = re.sub(r"https?:\/\/\S+", '', text)  # Removes hyperlinks

    return text

# Overwrite the raw tweet text with its cleaned version, row by row
text_df['Text'] = text_df['Text'].apply(cleanText)


In [None]:
# Get subjectivity and polarity
def getSubjectivity(text):
    """
    Scores how subjective a piece of text is.
    Subjectivity - how 'opinionated' some text is; the TextBlob
    documentation describes it as a float from 0.0 (objective)
    to 1.0 (subjective).

    Args:
        text (string): Text to assess

    Returns:
        float: subjectivity
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def getPolarity(text):
    """
    Scores how positive or negative a piece of text is.
    Polarity - how 'positive' or 'negative' some text is; the TextBlob
    documentation describes it as a float from -1.0 (negative)
    to 1.0 (positive).

    Args:
        text (string): Text to assess

    Returns:
        float: polarity
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

# Score the cleaned text and attach the results to the feature frame.
# df and text_df are built from lists filled in the same loop, so the
# assignment lines up row-for-row on their shared default index.
df['Subjectivity'] = text_df['Text'].apply(getSubjectivity)
df['Polarity'] = text_df['Text'].apply(getPolarity)

In [None]:
# Data is finished cleaning, export to pkl file
# Pass the path rather than a file handle: pickle output is bytes, so a
# text-mode ('w') handle raises TypeError on write. Given a path, pandas
# opens the file in binary mode and closes it when done.
df.to_pickle(export_data_path)

print(df)