# Connect to local mongo

In [1]:
import pymongo

# Handle on the MongoDB server at localhost:27017.
client = pymongo.MongoClient(host="mongodb://localhost:27017/")

# Database named "abortion"; the cells below read and write its collections.
db = client.abortion

# Preprocess

In [2]:
from nltk.tokenize import TweetTokenizer
import re
import unidecode

# TweetTokenizer understands arrows, smiley faces and unusual punctuation.
# Configured to keep the original casing and @handles, with length reduction on.
tokenizer = TweetTokenizer(
    strip_handles=False,
    reduce_len=True,
    preserve_case=True,
)


def my_preprocess(text, keep_hashtags=True):
    """Tokenize ``text`` with the module-level ``tokenizer`` and drop noise tokens.

    Tokens removed:
      * the literal markers "#SemST" and "semst",
      * hashtags (tokens starting with "#") when ``keep_hashtags`` is False,
      * tokens whose first four characters are "http",
      * tokens that start with a newline character.

    Args:
        text: Raw tweet text.
        keep_hashtags: When True (the default) hashtag tokens are kept.

    Returns:
        The surviving tokens joined with single spaces.
    """
    def _is_kept(token):
        # Checks run in the same order as the filters listed in the docstring.
        if token in ("#SemST", "semst"):
            return False
        if token[0] == "#" and not keep_hashtags:
            return False
        if token[:4] == "http":
            return False
        return token[0] != '\n'

    return " ".join(token for token in tokenizer.tokenize(text) if _is_kept(token))

In [19]:
# Preview at most 50 documents whose tweet carries a `quoted_status`,
# projecting only the id and the text-related fields.
list(
    db.abortion.find(
        {"tweet.quoted_status": {"$exists": True}},
        {"tweet.id", "tweet.text", "tweet.full_text", "tweet.retweeted_status", "tweet.extended_tweet"},
    ).limit(50)
)

[{'_id': ObjectId('5dc083fbe36ea093ac65d1a8'),
  'tweet': {'id': 1191445651126673412,
   'full_text': "Some anti-abortion leaders claim they're against criminalizing abortion with one side of their mouth , and yet do nothing about it with their actions . Others openly embrace sending women to jail . Watch new video now : #StopProsecutingAbortion"}},
 {'_id': ObjectId('5dc083fbe36ea093ac65d1ba'),
  'tweet': {'id': 1191445571329916928,
   'full_text': 'The really great part is how Dignity Health Care , a Catholic hospital chain , calls themselves " pro-life " because they won\'t provide contraception or abortion care .'}},
 {'_id': ObjectId('5dc083fbe36ea093ac65d208'),
  'tweet': {'id': 1191445190109626369,
   'full_text': '#WakeUpAmerica #WeThePeople #ForThePeople #KamalaHarris2020 #KamalaHarrisForThePeople #MondayMotivation #FairFight2020 #ElectionSecurity #VoteBlueToSaveAmerica #VoteBlue2020 #Vote #California #Iowa #DemocracyOnTrial #Democracy #Justice #ACA #Education #ClimateChange #

In [3]:
# Backfill `tweet.full_text` on every document with the preprocessed tweet text.
#
# The text is taken from the first location that exists, in this priority order:
#   1. tweet.extended_tweet.full_text
#   2. tweet.retweeted_status.extended_tweet.full_text
#   3. tweet.retweeted_status.full_text
#   4. tweet.full_text
#   5. tweet.retweeted_status.text
#   6. tweet.text
# NOTE(review): on a re-run, location 4 holds text this cell already wrote, so
# such tweets are preprocessed a second time - confirm that is harmless.
count = 0  # number of tweets for which none of the locations above exists
for t in db.abortion.find({}, {"tweet.id", "tweet.text", "tweet.full_text", "tweet.retweeted_status", "tweet.extended_tweet"}):
    text = ""
    tweet = t["tweet"]
    if "extended_tweet" in tweet and "full_text" in tweet["extended_tweet"]:
        text = tweet["extended_tweet"]["full_text"]
    elif "retweeted_status" in tweet and "extended_tweet" in tweet["retweeted_status"] and "full_text" in tweet["retweeted_status"]["extended_tweet"]:
        text = tweet["retweeted_status"]["extended_tweet"]["full_text"]
    elif "retweeted_status" in tweet and "full_text" in tweet["retweeted_status"]:
        text = tweet["retweeted_status"]["full_text"]
    elif "full_text" in tweet:
        text = tweet["full_text"]
    elif "retweeted_status" in tweet and "text" in tweet["retweeted_status"]:
        text = tweet["retweeted_status"]["text"]
    elif "text" in tweet:
        text = tweet["text"]
    else:
        # No usable text: report the tweet id and count it. An empty
        # full_text is still written below so later cells can read the field.
        print("--------------------------------------")
        print(tweet["id"])
        count += 1
    # Address the document by its own `_id` (always returned by find()):
    # filtering on "tweet.id" would update only the first match when several
    # documents share a tweet id, and is a full scan if that field is unindexed.
    # update_one() is used because the updated document is not needed here.
    db.abortion.update_one({"_id": t["_id"]}, {"$set": {"tweet.full_text": my_preprocess(text)}})

In [4]:
# Number of tweets for which the loop above found no text field.
count

0

The cell above takes a while to run.

After it has finished, run `db.abortion.count({"tweet.full_text": {$exists: true}})` in the mongo shell; it should report 613550.

In [12]:
# Collect the distinct preprocessed tweet texts, skipping empty texts and
# texts that start with "rt" / " rt".
# NOTE(review): the "rt" test is case-sensitive and prefix-based, so a text
# starting with "RT" is kept while any text starting with "rt" (e.g. "rtfm")
# is dropped - confirm this is the intended retweet filter.
tweets = set()  # unique tweet texts kept for sampling
count = 0       # number of texts skipped by the filter below
for t in db.abortion.find({}, {"tweet.full_text"}):
    # Newlines are stripped first, so the text can never equal "\n" afterwards;
    # only the empty-string case needs checking.
    tweet = t['tweet']['full_text'].replace('\n', '')
    if tweet == "" or tweet.startswith(("rt", " rt")):
        count += 1
    else:
        tweets.add(tweet)

In [13]:
# Skipped texts (printed) and number of unique texts kept (cell output).
print(count)
len(tweets)  # a set already knows its size; no need to copy it into a list

5848


141439

In [14]:
import re
def camel_case_split(identifier):
    """Split a CamelCase hashtag into its component words.

    Args:
        identifier: A single whitespace-free token.

    Returns:
        * ``[]`` for the empty string and for the literal "#SemST";
        * for a token starting with "#", the pieces of the text after the "#",
          split at lower->Upper boundaries and before the last capital of an
          upper-case run that is followed by a lower-case letter
          (e.g. "#HTTPServer" -> ["HTTP", "Server"]);
        * ``[identifier]`` unchanged for any other token.
    """
    if len(identifier) == 0 or identifier == "#SemST":
        return []
    if identifier[0] != '#':
        return [identifier]
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    return [piece.group(0) for piece in re.finditer(boundary_pattern, identifier[1:])]

In [15]:
import random

# Write 20 disjoint samples of 2000 tweets each to
# abortion_unlabeled_sample_<i>.csv (one tweet per line, hashtags split into
# words), removing the sampled tweets from `tweets` as it goes.
# NOTE: this cell mutates `tweets`; re-running it draws from what is left.
random.seed(43)

for i in range(20):
    # random.sample() needs a sequence: passing a set is deprecated since
    # Python 3.9 and raises TypeError from 3.11. Sorting also gives the
    # population a fixed order, so the seed above yields the same samples on
    # every run regardless of string-hash randomization.
    sample = random.sample(sorted(tweets), 2000)

    # Context manager closes the file even if a write fails; explicit UTF-8
    # so emoji and other non-ASCII tweet text do not depend on the locale.
    with open("abortion_unlabeled_sample_{}.csv".format(i), 'w', encoding="utf-8") as w:
        for tweet in sample:
            tweet_prep = " ".join(" ".join(camel_case_split(word)) for word in tweet.split(" "))
            w.write("{}\n".format(tweet_prep))

    # Remove this batch so the next sample cannot contain the same tweets.
    tweets.difference_update(sample)

In [16]:
# Tweets left in the pool after the sampled ones were removed above.
len(tweets)

101439

In [30]:
# Print the stored full_text of the first 10 documents as a quick sanity check.
complete_tweets = set()
preview_cursor = db.abortion.find().limit(10)
for doc in preview_cursor:
    print(doc["tweet"]["full_text"])

National security of the Country is more important than abortion , trans issues and party . Trump is acting like a puppet to Putin , why ? Trump loves all dictators why ? Trump never criticized Saudi King for khashogi murder why ?
The fictitious Labour account approach was used heavily in the #repealthe8th campaign . The purpose here was to lend left wing ' legitimacy ' to the anti-abortion campaign .
This week , the U . S . Ambassador to the United Nations objected to the use of the phrase " sexual and reproductive health " because it could " suggest a right to abortion . " Make no mistake : They are coming for women's freedom on a global scale .
Let ’ s stop calling it “ the fight over abortion ” & start calling it what it really is : The fight to grant pregnant people personhood . The fight to acknowledge pregnant people as deserving basic human rights . The fight to have the audacity to respect pregnant people ’ s autonomy .
The largest 40 Days for Life campaign ever has come to an

In [76]:
# For every document whose stored full_text equals this truncated string,
# print the untruncated text found under retweeted_status.extended_tweet.
truncated_text = "took a closer look and realized darrell jackson a democrat and a pastor from the columbia area is also missing . so ..."
for match in db.abortion.find({"tweet.full_text": truncated_text}):
    retweeted = match["tweet"]["retweeted_status"]
    print(retweeted["extended_tweet"]["full_text"])

Took a closer look and realized Darrell Jackson, a Democrat and a pastor from the Columbia area is also missing. So that's 9 Republicans and 6 Democrats here today at Senate Medical Affairs. https://t.co/SRDyAMENMs
Took a closer look and realized Darrell Jackson, a Democrat and a pastor from the Columbia area is also missing. So that's 9 Republicans and 6 Democrats here today at Senate Medical Affairs. https://t.co/SRDyAMENMs
Took a closer look and realized Darrell Jackson, a Democrat and a pastor from the Columbia area is also missing. So that's 9 Republicans and 6 Democrats here today at Senate Medical Affairs. https://t.co/SRDyAMENMs
Took a closer look and realized Darrell Jackson, a Democrat and a pastor from the Columbia area is also missing. So that's 9 Republicans and 6 Democrats here today at Senate Medical Affairs. https://t.co/SRDyAMENMs
Took a closer look and realized Darrell Jackson, a Democrat and a pastor from the Columbia area is also missing. So that's 9 Republicans and