# Politician Sentiment Analysis - Data Gathering

Name:       Devin Patel  
Class:      CS 588 - 01  
Term:       FA 22  
Project:    Determining party alignment based on 2016 Election tweets.  
File Purpose: To pull tweets and generate subjectivity and polarity values,
              then export all data to pickle and csv files.  


Tweets are collected based on status IDs compiled by Justin Littman, Laura Wrubel, and Daniel Kerchner
on the [Harvard Dataverse](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/PDI7IN).

## Initialization

In [1]:
# Imports
import tweepy                   # Twitter API
import json                     # Twitter API config
from textblob import TextBlob   # Sentiment Analysis
import re                       # Cleaning Tweets
import pandas as pd             # Organizing data
import numpy as np
import os.path


def authenticateTweepy(config_file_path):
    """
    Authenticates Tweepy API

    Args:
        config_file_path (string): Path to a JSON file containing the
            'KEY', 'SECRET', 'TOKEN' and 'TOKENSECRET' entries

    Returns:
        tweepy.api.API: API object
    """
    # Use a context manager so the config file handle is closed right after
    # parsing instead of being left open until garbage collection.
    with open(config_file_path, 'r') as config_file:
        config = json.load(config_file)

    authenticate = tweepy.OAuthHandler(config['KEY'], config['SECRET'])
    authenticate.set_access_token(config['TOKEN'], config['TOKENSECRET'])

    # wait_on_rate_limit=True is passed so the client waits out rate limits
    # rather than raising.
    return tweepy.API(authenticate, wait_on_rate_limit=True)


# Shared Tweepy client; pullStatuses() reads this module-level name directly.
api = authenticateTweepy(r"auth/config.json")
RANDOM_STATE = 12  # Initialization for methods needing a random_state value
TWEETS_AMOUNT = 20000 # Maximum number of non-retweet tweets to fetch per status ID file


## Pulling Statuses

In [2]:
# Load status IDs from text files and import tweets
def pullStatuses(dataPath, num_tweets):
    """
    Pulls non-retweet statuses for the status IDs listed in a text file.

    Args:
        dataPath (string): Path to a text file holding one status ID per line
        num_tweets (int): Number of statuses to collect before stopping

    Returns:
        tuple: (statuses, failed_statuses) where statuses is the list of
               collected status objects and failed_statuses is a list with
               one '[FAIL] ...' message per batch that raised an error
    """
    # Tweepy.API.lookup_statuses() only allows up to 100 IDs to be queried,
    # so all the IDs must be broken up into 100-element lists.
    batch_size = 100
    file_name = os.path.basename(dataPath)

    # Read every ID up front so the file is closed before the (slow,
    # rate-limited) network calls start. Blank lines are dropped so that no
    # empty ID is ever sent to the API.
    with open(dataPath, 'r') as id_file:
        stripped_lines = [line.strip() for line in id_file]
    all_candidate_ids = [status_id for status_id in stripped_lines if status_id]

    statuses = []
    failed_statuses = []
    for batch_start in range(0, len(all_candidate_ids), batch_size):
        if len(statuses) == num_tweets:
            break

        candidate_ids = all_candidate_ids[batch_start:batch_start + batch_size]
        try:
            searched_statuses = api.lookup_statuses(id=candidate_ids, tweet_mode='extended')
            for status in searched_statuses:
                if len(statuses) == num_tweets:
                    break
                if status.full_text.startswith("RT "):
                    continue  # Skip tweet if it is a retweet
                statuses.append(status)

        except Exception as e:
            # Best effort: record the failure and move on to the next batch.
            # The row is the 1-based position (among non-blank IDs) of the
            # first ID in the failing batch. It is derived from the batch
            # offset rather than from the number of statuses returned so far,
            # because the lookup may return fewer statuses than IDs requested.
            failed_statuses.append(f"[FAIL] File '{file_name}'  Row {batch_start + 1}:  {e}")

        # '\r' keeps the progress report on a single, continuously updated line
        print(f"[JOB STATUS] '{file_name}': {len(statuses)}/{num_tweets}", end='\r')
    print()
    return statuses, failed_statuses

# Loops through to collect data
def collectStatusData(statuses, party):
    """
    Splits a sequence of statuses into parallel per-feature lists.

    Args:
        statuses (iterable): Status objects exposing full_text,
            favorite_count and retweet_count
        party (int): Label repeated once per status (0 = Republican, 1 = Democrat)

    Returns:
        tuple: (text_data, favorite_data, retweet_data, party_data) lists,
               each holding one entry per status
    """
    # Build one record per status in a single pass ...
    records = [
        (tweet.full_text, tweet.favorite_count, tweet.retweet_count, party)
        for tweet in statuses
    ]

    # ... then transpose the records into one list per feature.
    if not records:
        return [], [], [], []
    texts, favorites, retweets, parties = (list(column) for column in zip(*records))
    return texts, favorites, retweets, parties


# Start import code here

# Paths
export_data_path = r'Data/TweetData.pkl'       # Pickle written once sentiment scores are added
import_errors_path = r"Logs/import-errors.txt" # Log of batches that failed to download

# Failure messages accumulated across all four status ID files
failed_statuses = []

# Each pull below fetches up to TWEETS_AMOUNT non-retweet statuses from one ID file.
# r_fail_statuses holds the failures of the most recent pull only and is
# overwritten by the next pull after being merged into failed_statuses.

# Grabs democrat 2016 candidate tweets
democrat_candidate_statuses, r_fail_statuses = pullStatuses(r"Data/democratic-candidate-timelines.txt", TWEETS_AMOUNT)
failed_statuses += r_fail_statuses
print(f"Completed Democrat Candidate Statuses\n\tStatuses Pulled: {len(democrat_candidate_statuses)}\n\tFailed Statuses: {len(r_fail_statuses)}\n")

# Grabs democrat 2016 party tweets
democrat_party_statuses, r_fail_statuses = pullStatuses(r"Data/democratic-party-timelines.txt", TWEETS_AMOUNT)
failed_statuses += r_fail_statuses
print(f"Completed Democrat Party Statuses\n\tStatuses Pulled: {len(democrat_party_statuses)}\n\tFailed Statuses: {len(r_fail_statuses)}\n")

# Grabs republican 2016 candidate tweets
republican_candidate_statuses, r_fail_statuses = pullStatuses(r"Data/republican-candidate-timelines.txt", TWEETS_AMOUNT)
failed_statuses += r_fail_statuses
print(f"Completed Republican Candidate Statuses\n\tStatuses Pulled: {len(republican_candidate_statuses)}\n\tFailed Statuses: {len(r_fail_statuses)}\n")

# Grabs republican 2016 party tweets
republican_party_statuses, r_fail_statuses = pullStatuses(r"Data/republican-party-timelines.txt", TWEETS_AMOUNT)
failed_statuses += r_fail_statuses
print(f"Completed Republican Party Statuses\n\tStatuses Pulled: {len(republican_party_statuses)}\n\tFailed Statuses: {len(r_fail_statuses)}\n")

# Loop through and collect data
text_data = []
favorite_data = []
retweet_data = []
party_data = []
r_data = []  # One (text, favorites, retweets, party) tuple of lists per source

# Party label: 1 = Democrat, 0 = Republican
r_data.append(collectStatusData(democrat_candidate_statuses, party=1))
r_data.append(collectStatusData(democrat_party_statuses, party=1))
r_data.append(collectStatusData(republican_candidate_statuses, party=0))
r_data.append(collectStatusData(republican_party_statuses, party=0))

# Concatenate the per-source lists in the order they were appended above,
# so all four feature lists stay aligned element by element.
for tup in r_data:
    text_data += tup[0]
    favorite_data += tup[1]
    retweet_data += tup[2]
    party_data += tup[3]

# Numeric features only; the tweet text is kept in its own dataframe below
data = {'Likes': favorite_data,
        'Retweets': retweet_data,
        'Party': party_data }

df = pd.DataFrame(data)
df['Likes'] = df['Likes'].astype('int')
df['Retweets'] = df['Retweets'].astype('int')
df['Party'] = df['Party'].astype('int')

# Row i of text_df holds the text of the tweet described by row i of df
text_df = pd.DataFrame(text_data, columns=['Text'])

# Exports errors
# The log file is only written when at least one batch failed.
# NOTE(review): open(..., 'w') does not create missing directories, so the
# 'Logs' folder presumably has to exist beforehand - confirm.
if failed_statuses:
    with open(import_errors_path, 'w') as w:
        for status in failed_statuses:
            w.write(status+"\n\n")

[JOB STATUS] 'democratic-candidate-timelines.txt': 20000/20000
Completed Democrat Candidate Statuses
	Statuses Pulled: 20000
	Failed Statuses: 0

[JOB STATUS] 'democratic-party-timelines.txt': 10550/20000
Completed Democrat Party Statuses
	Statuses Pulled: 10550
	Failed Statuses: 0

[JOB STATUS] 'republican-candidate-timelines.txt': 20000/20000
Completed Republican Candidate Statuses
	Statuses Pulled: 20000
	Failed Statuses: 0

[JOB STATUS] 'republican-party-timelines.txt': 10993/20000

Rate limit reached. Sleeping for: 251


[JOB STATUS] 'republican-party-timelines.txt': 15402/20000
Completed Republican Party Statuses
	Statuses Pulled: 15402
	Failed Statuses: 0



## Clean text of irrelevant data

In [3]:
# Clean text
def cleanText(text):
    """
    Cleans text by removing @mentions, hashtag symbols, and hyperlinks.

    Only the '#' character of a hashtag is removed; the tag word itself is
    kept. Whitespace surrounding removed items is left untouched.

    Args:
        text (string): Text to clean

    Returns:
        string: Cleaned text
    """
    # Twitter handles may contain underscores, so '_' must be part of the
    # character class; otherwise '@real_user' would leave '_user' behind.
    text = re.sub(r"@[A-Za-z0-9_]+", '', text)  # Removes @mentions
    text = re.sub(r"#", '', text)               # Removes hashtag symbol
    text = re.sub(r"https?:\/\/\S+", '', text)  # Removes hyperlinks

    return text

# Clean every tweet and overwrite the 'Text' column with the cleaned version
text_df['Text'] = text_df['Text'].apply(cleanText)


## Generate Sentiment Scores

In [4]:
# Get subjectivity and polarity
def getSubjectivity(text):
    """
    Scores how 'opinionated' a piece of text is.

    Args:
        text (string): Text to score

    Returns:
        float: subjectivity component of TextBlob's sentiment for the text
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def getPolarity(text):
    """
    Scores how 'positive' or 'negative' a piece of text is.

    Args:
        text (string): Text to score

    Returns:
        float: polarity component of TextBlob's sentiment for the text
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

# Score every tweet's text and store the results as float columns on df.
# The assignment aligns on the index; df and text_df were both built with
# the default index from equally long lists, so row i matches row i.
df['Subjectivity'] = text_df['Text'].apply(getSubjectivity).astype('float')
df['Polarity'] = text_df['Text'].apply(getPolarity).astype('float')

## Export data

In [5]:
# Data is finished cleaning, export to pkl file for quick access.
# Only df (numeric features and sentiment scores) is pickled; the tweet text
# lives in text_df and is not part of this export.
df.to_pickle(export_data_path)
print(df)


       Likes  Retweets  Party  Subjectivity  Polarity
0       6031      2544      1      0.000000  0.000000
1       1260       594      1      0.766667  0.366667
2       7641      3505      1      0.600000  0.875000
3       4651      1476      1      0.500000  0.062500
4       1827       748      1      0.000000  0.000000
...      ...       ...    ...           ...       ...
65947    600       325      0      0.000000  0.000000
65948     23         9      0      0.300000  0.000000
65949    248       233      0      0.252083  0.125000
65950     13         8      0      0.000000  0.000000
65951     36        10      0      0.375000 -0.125000

[65952 rows x 5 columns]


In [6]:
# Export to CSV for viewable export
from csv import DictWriter

def export_dataframe_as_csv(df, outfile, text=None):
    """
    Writes a dataframe to a CSV file with the tweet text as the last column.

    Args:
        df (pandas.DataFrame): Feature rows to export
        outfile (string): Path of the CSV file to write
        text (sequence of string, optional): Text for each row of df, matched
            by position. Defaults to the 'Text' column of the module-level
            text_df.

    Raises:
        ValueError: If fewer text entries than dataframe rows are supplied
    """
    if text is None:
        text = text_df['Text']
    if len(text) < len(df):
        raise ValueError(
            f"Expected at least {len(df)} text entries, got {len(text)}"
        )

    # Column headers
    fieldnames = df.columns.values.tolist() + ['Text']
    feature_names = fieldnames[:-1]

    # 'utf-8-sig' writes a BOM at the start of the file
    with open(outfile, 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # itertuples() yields each row with its per-column values, pairing
        # rows and text strictly by position. Selecting a row with
        # df.iloc[i] instead builds a single-dtype Series, which upcasts the
        # integer columns to float when float columns are present (so 6031
        # would be written as 6031.0), and it misbehaves when the index
        # labels are not 0..n-1.
        rows = df.itertuples(index=False, name=None)
        for values, tweet_text in zip(rows, text):
            row_out = dict(zip(feature_names, values))

            # Adds text to the end of row columns
            row_out['Text'] = tweet_text

            # Writes row to file
            writer.writerow(row_out)



# File path of csv file
outfile = r"Data/TweetData.csv"
# Writes the columns of df followed by a 'Text' column to the csv file
export_dataframe_as_csv(df, outfile)
