In [17]:
import os
import json
import re
import csv
from datetime import datetime

# Wall-clock start of the run; the last cell subtracts it from datetime.now()
# to print the total time taken.
start_time = datetime.now()

In [18]:
def remove_emojis_and_links(text):
    """Normalize a raw tweet to ASCII letters, digits and single spaces.

    Steps, in order:
      1. drop every non-ASCII character (emojis, accented letters, ...);
      2. drop http/https URLs;
      3. drop @mentions;
      4. drop the standalone retweet marker ``RT`` and every remaining
         character that is not a letter, digit or whitespace;
      5. collapse whitespace runs (including newlines) into one space and
         trim both ends.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text; an empty string if nothing survives the cleaning.
    """
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'@\w+', '', text)
    # Remove "RT" only when it is a whole token. Matching it anywhere would
    # mangle ordinary upper-case words ("PARTY" -> "PAY", "SUPPORT" -> "SUPPO").
    text = re.sub(r'[^a-zA-Z0-9\s]|\bRT\b', '', text)
    # \s+ also matches newlines, so no separate newline removal is needed.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [19]:
path = os.path.join("..", "..", "DATA")

# Keep only sub-directories: every entry is later opened as an account
# directory, so stray files (e.g. ".DS_Store") would inflate the account
# counts and make the tweet loader fail. Sorting gives a stable order that
# does not depend on the file system.
dem_accs_filepath = sorted(
    name for name in os.listdir(os.path.join(path, "dem"))
    if os.path.isdir(os.path.join(path, "dem", name))
)
rep_accs_filepath = sorted(
    name for name in os.listdir(os.path.join(path, "rep"))
    if os.path.isdir(os.path.join(path, "rep", name))
)

print(f"{len(dem_accs_filepath)} democrat accounts")
print(f"{len(rep_accs_filepath)} republican accounts")

266 democrat accounts
9135 republican accounts


In [28]:
def get_tweets(accs_filepath, party, base_path=None):
    """Collect the ``rawContent`` value of every tweet JSON file of a party.

    Files are looked up as ``<base>/<party>/<account>/<tweet file>``.

    Args:
        accs_filepath: Names of the account directories inside the party
            directory; they are visited in the given order.
        party: Name of the party sub-directory (e.g. ``"dem"``).
        base_path: Root data directory. When omitted, the notebook-level
            ``path`` variable is used.

    Returns:
        A list with one ``rawContent`` value per readable tweet file. Within
        an account, files are read in sorted name order. Entries that cannot
        be listed, read, parsed, or that lack ``rawContent`` are reported
        with ``print`` and skipped.
    """
    root = path if base_path is None else base_path
    tweets = []
    for account in accs_filepath:
        account_dir = os.path.join(root, party, account)
        try:
            tweet_files = sorted(os.listdir(account_dir))
        except OSError:
            # E.g. a stray file sitting next to the account directories.
            print(f"Error with {account_dir}")
            continue
        for tweet_file in tweet_files:
            tweet_path = os.path.join(account_dir, tweet_file)
            try:
                # The context manager closes the handle; explicit UTF-8 keeps
                # decoding independent of the platform's default encoding.
                with open(tweet_path, encoding="utf-8") as f:
                    data = json.load(f)
                tweets.append(data["rawContent"])
            # ValueError covers invalid JSON and undecodable bytes; TypeError
            # covers JSON documents that are not objects. Unlike a bare
            # ``except``, this lets KeyboardInterrupt stop a long load.
            except (OSError, ValueError, KeyError, TypeError):
                print(f"Error with {tweet_path}")
    return tweets

In [29]:
# Placeholder values; both names are rebound by the get_tweets calls below.
dem_tweets = []
rep_tweets = []

# Load the raw tweet texts of every listed account, one list per party.
dem_tweets = get_tweets(dem_accs_filepath, "dem")
rep_tweets = get_tweets(rep_accs_filepath, "rep")

# Report totals and the average per listed account. The division raises
# ZeroDivisionError if a party has no account entries.
# NOTE(review): the output stored with this cell shows 0 democrat tweets and an
# "Error with tweet_....json" line that does not match the message format
# get_tweets prints (full path), so it looks stale - re-run and verify the
# democrat data before trusting the downstream counts.
print(f"{len(dem_tweets)} democrat tweets, {len(dem_tweets) / len(dem_accs_filepath)} tweets per account")
print(f"{len(rep_tweets)} republican tweets, {len(rep_tweets) / len(rep_accs_filepath)} tweets per account")

Error with tweet_1777428755675320787.json
0 democrat tweets, 0.0 tweets per account
279576 republican tweets, 30.604926108374386 tweets per account


In [30]:
def clean_tweets(tweets):
    """Clean every tweet and keep only those that still contain text.

    Each tweet is passed through ``remove_emojis_and_links``; results that
    are empty or whitespace-only are dropped. Input order is preserved.
    """
    normalized = map(remove_emojis_and_links, tweets)
    return [text for text in normalized if text.strip() != ""]

In [31]:
# Placeholder values; both names are rebound by the clean_tweets calls below.
dem_tweets_cleaned = []
rep_tweets_cleaned = []

# Normalize each party's raw tweets; tweets left with no text are dropped.
dem_tweets_cleaned = clean_tweets(dem_tweets)
rep_tweets_cleaned = clean_tweets(rep_tweets)

# Report how many tweets survived cleaning and how many were dropped.
print(f"{len(dem_tweets_cleaned)} democrat tweets, {len(dem_tweets) - len(dem_tweets_cleaned)} removed")
print(f"{len(rep_tweets_cleaned)} republican tweets, {len(rep_tweets) - len(rep_tweets_cleaned)} removed")

0 democrat tweets, 0 removed
269424 republican tweets, 10152 removed


In [32]:
def pick_long_tweets(tweets, length):
    """Return the tweets that have strictly more than ``length`` words.

    Words are counted by splitting on whitespace; input order is preserved.
    """
    def word_count(tweet):
        return len(tweet.split())

    return [tweet for tweet in tweets if word_count(tweet) > length]

In [33]:
# Placeholder values; both names are rebound by the pick_long_tweets calls below.
dem_tweets_long = []
rep_tweets_long = []

# Keep only cleaned tweets with more than 10 words (the threshold is passed
# as the second argument and repeated in the messages below).
dem_tweets_long = pick_long_tweets(dem_tweets_cleaned, 10)
rep_tweets_long = pick_long_tweets(rep_tweets_cleaned, 10)

# Report how many tweets passed the length filter and how many were dropped.
print(f"{len(dem_tweets_long)} democrat tweets longer than 10 words, {len(dem_tweets_cleaned) - len(dem_tweets_long)} removed")
print(f"{len(rep_tweets_long)} republican tweets longer than 10 words, {len(rep_tweets_cleaned) - len(rep_tweets_long)} removed")

0 democrat tweets longer than 10 words, 0 removed
207921 republican tweets longer than 10 words, 61503 removed


In [34]:
# Summary after cleaning and the length filter: remaining tweets, average per
# listed account, and the total dropped relative to the raw tweets. The
# division raises ZeroDivisionError if a party has no account entries.
print(f"Final amount of democrat tweets: {len(dem_tweets_long)}, {len(dem_tweets_long) / len(dem_accs_filepath)} tweets per account, total removed {len(dem_tweets) - len(dem_tweets_long)}")
print(f"Final amount of republican tweets: {len(rep_tweets_long)}, {len(rep_tweets_long) / len(rep_accs_filepath)} tweets per account, total removed {len(rep_tweets) - len(rep_tweets_long)}")

Final amount of democrat tweets: 0, 0.0 tweets per account, total removed 0
Final amount of republican tweets: 207921, 22.760919540229885 tweets per account, total removed 71655


In [35]:
# Placeholder values; both names are rebound just below.
dem_tweets_withoutdublicates = []
rep_tweets_withoutdublicates = []

# Drop exact duplicates while keeping the first occurrence of each tweet in
# its original position. list(set(...)) removes the same duplicates but
# returns them in an order that changes between kernel restarts (string
# hashing is randomized), which made the saved CSV files differ run to run.
dem_tweets_withoutdublicates = list(dict.fromkeys(dem_tweets_long))
rep_tweets_withoutdublicates = list(dict.fromkeys(rep_tweets_long))

# Report the remaining tweets, the average per listed account, and how many
# duplicates were dropped.
print(f"Final amount of democrat tweets without dublicates: {len(dem_tweets_withoutdublicates)}, {len(dem_tweets_withoutdublicates) / len(dem_accs_filepath)} tweets per account, total removed {len(dem_tweets_long) - len(dem_tweets_withoutdublicates)}")
print(f"Final amount of republican tweets without dublicates: {len(rep_tweets_withoutdublicates)}, {len(rep_tweets_withoutdublicates) / len(rep_accs_filepath)} tweets per account, total removed {len(rep_tweets_long) - len(rep_tweets_withoutdublicates)}")

Final amount of democrat tweets without dublicates: 0, 0.0 tweets per account, total removed 0
Final amount of republican tweets without dublicates: 147470, 16.143404488232076 tweets per account, total removed 60451


In [36]:
def save(tweets, party):
    """Write tweets to ``<party>.csv`` as a CSV file with one ``text`` column.

    Args:
        tweets: Iterable of tweet strings; each becomes one data row.
        party: File name stem. The file ``f"{party}.csv"`` is created, or
            overwritten if it exists, relative to the current working
            directory unless ``party`` already contains a directory part.
    """
    # newline='' leaves line endings to the csv module. The explicit encoding
    # keeps the output UTF-8 instead of the platform default, so non-ASCII
    # text cannot fail or be written differently depending on the machine.
    with open(f"{party}.csv", 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["text"])
        writer.writerows([tweet] for tweet in tweets)

In [37]:
# Write one CSV per party (democrat.csv and republican.csv) into the current
# working directory, overwriting any existing files.
save(dem_tweets_withoutdublicates, "democrat")
save(rep_tweets_withoutdublicates, "republican")

# Elapsed wall-clock time since start_time was set in the first cell.
print(f"Time taken: {datetime.now() - start_time}")