# Imports

In [1]:
import pandas as pd
import fasttext

from nltk.tokenize import TweetTokenizer
import re
import unidecode

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords

import numpy as np

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score, recall_score, f1_score

from scipy import sparse

# Collect Data

I couldn't import the data as directly downloaded from semeval because it gave me an "unknown not utf-8 character" so I imported it as a csv to an Excel and saved it again as train_utf.csv and it worked

In [116]:
traindata = pd.read_csv("semeval.abortion.train.csv", sep=',', encoding="latin1").fillna(method="ffill")
testdata = pd.read_csv("semeval.abortion.test.csv", sep=',', encoding="latin1").fillna(method="ffill")
validationdata = pd.read_csv("semeval.abortion.validation.csv", sep=',', encoding="latin1").fillna(method="ffill")

is_against = traindata['Stance']=='AGAINST'
is_favor = traindata['Stance']=='FAVOR'
is_none = traindata['Stance']=='NONE'

traindata_against = traindata[is_against]
traindata_favor = traindata[is_favor]
traindata_none = traindata[is_none]

## Keep only the abortion tweets ONLY FOR COMPLETE DATASET
# is_abortion = traindata['Stance']=="Legalization of Abortion"
# is_abortion_test = testdata['Stance']=="Legalization of Abortion"
# is_abortion_valid = validationdata['Stance']=="Legalization of Abortion"

# train_abortion = traindata[is_abortion]
# test_abortion = testdata[is_abortion_test]
# valid_abortion = validationdata[is_abortion_test]

In [131]:
traindata = pd.concat([traindata_against[:99], traindata_none[:99], traindata_favor])

In [132]:
traindata

Unnamed: 0,Tweet,Stance
0,Just laid down the law on abortion in my bioet...,AGAINST
2,"Now that there's marriage equality, can we sta...",AGAINST
3,I'll always put all my focus and energy toward...,AGAINST
4,"@BarackObama celebrates ""equality"" while 3000 ...",AGAINST
6,Now that the govt and The Supremes have saved ...,AGAINST
8,In the aim for equality we have forgotten the ...,AGAINST
12,"True equality"" allows all to be born. #SemST",AGAINST
14,Can we make sure #lovewins for babies too? Or ...,AGAINST
15,What about the equal rights of unborn humans i...,AGAINST
24,Okay so how about declaring and protecting equ...,AGAINST


# Preprocessing

In [133]:

# TweetTokenizer basically unterstands arrows, smiley faces and weird punctuation
tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=False)


def my_preprocess(text, keep_hashtags=True):
    toks = tokenizer.tokenize(text)

    ret = []
    for tok in toks:
        if tok[:4] == "#sem":
            continue
        if tok[0] == "#" and not keep_hashtags:
            continue
        if tok[:4] == "http":
            continue
        if tok[0] == "@":
            continue
        # removing numbers
        if tok.isnumeric():
            continue
        ret.append(unidecode.unidecode(tok.lower()))
    return " ".join(ret)

In [135]:

## Train
traindata['Text'] = traindata['Tweet'].apply(lambda x: my_preprocess(x))

# Shifts the order on the original tweet list
sample_train = traindata.sample(frac=1.0)

text_train, label_train = sample_train['Text'], sample_train['Stance']




## Validation
validationdata['Text'] = validationdata['Tweet'].apply(lambda x: my_preprocess(x))

# Shifts the order on the original tweet list
sample_valid = validationdata.sample(frac=1.0)

text_valid, label_valid = sample_valid['Text'], sample_valid['Stance']



## Test
testdata['Text'] = testdata['Tweet'].apply(lambda x: my_preprocess(x))

sample_test = testdata.sample(frac=1.0)

text_test, label_test = sample_test['Text'], sample_test['Stance']

print(list(zip(text_train, label_train)))

[("manipulation , disguise , & evil is always seen by the out-come . the ugly doors of satan appear in all forms when it's fake love . #semst", 'NONE'), ("don't see the big deal about all this #semst", 'AGAINST'), ("i'm done with the convo . but before i go . #semst", 'NONE'), ('in church for a pro-life meeting . i will be the voice for the unborn . #savethebabiesbumpthemanimals #godislovesolovewins ! ! #semst', 'AGAINST'), ("oh look ! ! ! so not only are antichoice strongly against pregnant people's human rights , they're also homophobic . shocker . ( not ) #semst", 'FAVOR'), ('selfish men everywhere : " #birthcontrolhelpedme cheat on my wife ! but she found out anyway & our marriage was destroyed . #semst', 'NONE'), ('undergirding every important issue today is this : what is truth ? #marriage #racism #gender #semst', 'NONE'), ("can we get a law for the little ones who can't even speak for themselves ? #prolifeyouth #everylifematters #gay #straight #baby #semst", 'AGAINST'), ('what i

# Vectorize

Using a word count Matrix

In [136]:

vectorizer = CountVectorizer(
    binary=True, min_df=0.002, max_df=0.55, ngram_range=(1, 4),
    #stop_words=stopwords.words('spanish')
)

X = vectorizer.fit_transform(pd.concat([text_train, text_valid, text_test]))
X_train = X[:653]
X_test = X[653:]

# Fasttext

### Create a csv file with the label format that fasttext needs

In [137]:
with open("fasttext_train.txt", 'w') as w:
    for (text, label) in zip(text_train, label_train):
        w.write("{} __label__{}\n".format(text, label))
w.close()

### Train

In [164]:
# Find the best number of epochs in order not to overfit
best_epoch = 0
best_f1 = 0
for epoch in [20000, 21000, 22000, 23000]:    
    model = fasttext.train_supervised('fasttext_train.txt', lr=0.001, dim=500, epoch=epoch)
    
    # Evaluate
    tweet_selected = []
    predictions = []
    label_valid_selection = []
    for tweet, label in zip(text_valid, label_valid):
        prediction = model.predict(tweet)
        if prediction[1][0] > 0.95:
            tweet_selected.append(tweet)
            predictions.append(prediction)
            label_valid_selection.append(label)
            print(tweet)
            print(prediction)
            print(label)
    pred_labels = [pred[0][0].replace('__label__','') for pred in predictions]
#     print(list(zip(label_valid_selection, pred_labels)))
    current_f1 = f1_score(label_valid_selection, pred_labels, average="macro", labels=['AGAINST','FAVOR','NONE'])
    if current_f1 > best_f1:
        best_epoch = epoch
        best_f1 = current_f1

#prolifeyouth know that human life = human life , inside the womb or out . #semst
(('__label__AGAINST',), array([0.99932492]))
AGAINST
#love does not delight in #evil , but #rejoices in the #truth ~ cor 3:6 #loving requires always speaking truth #protestchildkilling #semst
(('__label__NONE',), array([0.95608288]))
AGAINST
why don't black lives matter in the womb ? #semst
(('__label__NONE',), array([0.99782288]))
AGAINST
rt : when is abortion a responsible choice ? when a woman chooses it to be #semst
(('__label__FAVOR',), array([0.98527658]))
FAVOR
anti-vaxxers are such an idiotic bunch ... seriously , guys . vaccinate . your . kids . #stopsb277 #conformcomics #marvel #dc #semst
(('__label__FAVOR',), array([0.9846729]))
NONE
the causes & circumstances of pregnancies vary , but one thing is unwavering : a human with a right to life . #prolifeyouth #semst
(('__label__AGAINST',), array([0.99796772]))
AGAINST
only if u want me to smash ur head through a solid wall made of steel each time u

#prolifeyouth know that human life = human life , inside the womb or out . #semst
(('__label__AGAINST',), array([0.99938589]))
AGAINST
#love does not delight in #evil , but #rejoices in the #truth ~ cor 3:6 #loving requires always speaking truth #protestchildkilling #semst
(('__label__NONE',), array([0.95807123]))
AGAINST
why don't black lives matter in the womb ? #semst
(('__label__NONE',), array([0.99800712]))
AGAINST
rt : when is abortion a responsible choice ? when a woman chooses it to be #semst
(('__label__FAVOR',), array([0.98614264]))
FAVOR
anti-vaxxers are such an idiotic bunch ... seriously , guys . vaccinate . your . kids . #stopsb277 #conformcomics #marvel #dc #semst
(('__label__FAVOR',), array([0.9854008]))
NONE
i still love you no matter who you love #semst
(('__label__NONE',), array([0.95017076]))
NONE
the causes & circumstances of pregnancies vary , but one thing is unwavering : a human with a right to life . #prolifeyouth #semst
(('__label__AGAINST',), array([0.9981471

#prolifeyouth know that human life = human life , inside the womb or out . #semst
(('__label__AGAINST',), array([0.99938965]))
AGAINST
#love does not delight in #evil , but #rejoices in the #truth ~ cor 3:6 #loving requires always speaking truth #protestchildkilling #semst
(('__label__NONE',), array([0.96131867]))
AGAINST
why don't black lives matter in the womb ? #semst
(('__label__NONE',), array([0.99816996]))
AGAINST
rt : when is abortion a responsible choice ? when a woman chooses it to be #semst
(('__label__FAVOR',), array([0.98651755]))
FAVOR
anti-vaxxers are such an idiotic bunch ... seriously , guys . vaccinate . your . kids . #stopsb277 #conformcomics #marvel #dc #semst
(('__label__FAVOR',), array([0.98604816]))
NONE
i still love you no matter who you love #semst
(('__label__NONE',), array([0.95151585]))
NONE
the causes & circumstances of pregnancies vary , but one thing is unwavering : a human with a right to life . #prolifeyouth #semst
(('__label__AGAINST',), array([0.998191

#prolifeyouth know that human life = human life , inside the womb or out . #semst
(('__label__AGAINST',), array([0.99942601]))
AGAINST
#love does not delight in #evil , but #rejoices in the #truth ~ cor 3:6 #loving requires always speaking truth #protestchildkilling #semst
(('__label__NONE',), array([0.96263593]))
AGAINST
why don't black lives matter in the womb ? #semst
(('__label__NONE',), array([0.99829727]))
AGAINST
rt : when is abortion a responsible choice ? when a woman chooses it to be #semst
(('__label__FAVOR',), array([0.98716581]))
FAVOR
anti-vaxxers are such an idiotic bunch ... seriously , guys . vaccinate . your . kids . #stopsb277 #conformcomics #marvel #dc #semst
(('__label__FAVOR',), array([0.98668534]))
NONE
i still love you no matter who you love #semst
(('__label__NONE',), array([0.95354956]))
NONE
the causes & circumstances of pregnancies vary , but one thing is unwavering : a human with a right to life . #prolifeyouth #semst
(('__label__AGAINST',), array([0.998326

In [168]:
model = fasttext.train_supervised('fasttext_train.txt', lr=0.001, dim=500, epoch=best_epoch)
best_f1

# list(zip(tweet_selected, predictions, label_valid_selection))

0.7856209150326796

### Test

In [169]:
predictions = []
label_test_selection = []
for tweet, label in zip(text_test, label_test):
    prediction = model.predict(tweet)
    if prediction[1] > 0.95:
        print(tweet)
        print(prediction)
        print(label)
        predictions.append(prediction[0][0].replace('__label__', ''))
        label_test_selection.append(label)

print(predictions)
print(label_test_selection)

would you rather have women taking dangerous concoctions to induce abortions or know they are getting a safe & legal one ? #semst
(('__label__FAVOR',), array([0.99871635]))
FAVOR
i'm curious mary are you a meat eater and an animal hide wearer ? #semst
(('__label__FAVOR',), array([0.98899126]))
NONE
people don't care about things that don't effect men which is the same reason it's sexist we don't cover birth control #semst
(('__label__FAVOR',), array([0.99890709]))
AGAINST
where is #silenceddiscourse for #divisions on #schoolchoice #taxes #doublestandard #biased #semst
(('__label__NONE',), array([0.99900788]))
AGAINST
#rally4life because if they don't want an abortion , then no one can have the choice #semst
(('__label__FAVOR',), array([0.97738177]))
AGAINST
a bundle of cells feels more pain than a fully grown women ? no . #semst
(('__label__FAVOR',), array([0.99326998]))
AGAINST
pregnant people have more than heartbeats . they have feelings , and the ability to make decisions about the

In [170]:
f1_score(label_test_selection, predictions, average="macro", labels=['AGAINST','FAVOR','NONE'])

# With 340 epochs an lr = 0.01, best result so far:
# micro: 0.5607142857142857, macro: 0.4753850446808366, per_class=[0.69411765, 0.41860465, 0.31343284]

0.5153928646160285

In [35]:
for (text, true, pred) in zip(text_test, label_test, pred_labels):
    print("{}... {} vs {}".format(text, true, pred))

mathewsjeanne thank you your very #kind rt . god bless you #patriot . #babies matter #semst... AGAINST vs AGAINST
good thing i don't live in kingston woulda started a riot by now with this ' pro-life ' protest #semst... FAVOR vs AGAINST
so you've blown right past the whole " women stopped dying after rvw " thing . #prolifelie #hashtagitall #semst... AGAINST vs AGAINST
like i said i'm glad this medical procedure is around so that pregnant people stay healthy . #semst... NONE vs FAVOR
. perhaps you want to go to the fertility clinics & rescue them all . how many can you fit in your womb . #clowncar #semst... AGAINST vs AGAINST
not your uterus , not your choice--from conception on , it really is that simple ! #fem2gen #yesallwomen #semst... FAVOR vs NONE
& purely physical enjoyment with no responsibility attached #semst... NONE vs NONE
we do care about the hurt inflicted upon preborn children through #abortion . #truthtour #ottawa #semst... AGAINST vs AGAINST
leftist , progressives love 2

## Fasttext Embeddings

###  Generate training file as needed by the fasttext lib

In [30]:
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")

db = client["abortion"]

### Filter retweets

In [23]:
# w = open("fasttext_train_unsupervised.txt", 'w')
# unique_tweets = set()
# for t in db.abortion.find({},{"tweet.full_text"}):
#     unique_tweets.add(t['tweet']['full_text'])

# for text in unique_tweets:
#     w.write("{}\n".format(text))

# for text in text_train:
#     w.write("{}\n".format(text))
    
# w.close()

In [48]:
len(unique_tweets)

154726

In [None]:
model_vectors = fasttext.train_unsupervised('fasttext_train_unsupervised.txt', lr=0.01, dim=300, epoch=500)

In [239]:
train_embeds = [[0] * 300] * len(text_train)
for idx, tweet in enumerate(text_train):
    for word in tweet:
        train_embeds[idx] += model_vectors[word]


In [240]:
test_embeds = [[0] * 300] * len(text_test)
for idx, tweet in enumerate(text_test):
    for word in tweet:
        test_embeds[idx] += model_vectors[word]


# Logistic Regression

## With word count matrix

In [6]:
clf = LogisticRegression(penalty='l1', class_weight='balanced', C=1.0, n_jobs=-1)

clf.fit(X_train, label_train)

clf.score(X_train, label_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.9127105666156202

In [7]:
clf.score(X_test, label_test)

0.5821428571428572

In [8]:
test_preds = clf.predict(X_test)
list(zip(text_test, test_preds, label_test))[:3]

[('i thought they wanted less unwed mommas and less abortion . #confused #tcot #feminism #semst',
  'AGAINST',
  'AGAINST'),
 ('prayers for babies urgent prayer one in lexington ky & two in dallas tx & in chattanooga tn life begins at conception #semst',
  'AGAINST',
  'AGAINST'),
 ('watch out for censorship in your news and media ! #politicalrevolution #immigration #rednationrising #constitution #corruptmedia #semst',
  'NONE',
  'NONE'),
 ("pro-choice =p ro-life . anti-choice = anti-life . don't play righteous when you're advocating the endangerment of women . #womensrights #semst",
  'FAVOR',
  'FAVOR'),
 ("yes . antis just don't make sense . #waronwomen #semst", 'NONE', 'NONE'),
 ('i hope you all either enjoy the rugby or enjoy not enjoying the rugby #semst',
  'NONE',
  'AGAINST'),
 ("i'm good like this . i've always been #semst", 'NONE', 'NONE'),
 ('mathewsjeanne thank you your very #kind rt . god bless you #patriot . #babies matter #semst',
  'AGAINST',
  'AGAINST'),
 ('yes , an

In [14]:
f1_score(label_test, test_preds, average="macro", labels=['AGAINST','FAVOR','NONE'])

# array([0.67109635, 0.47706422, 0.48      ])
# Micro 0.5821428571428572 Macro 0.5427201885661455

0.5427201885661455

## With fasttext embeddings

In [241]:
clf_lr_fasttext = LogisticRegression(penalty='l1', class_weight='balanced', C=1.0, n_jobs=-1)
clf_lr_fasttext.fit(train_embeds, label_train)
clf_lr_fasttext.score(train_embeds, label_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.557427258805513

In [242]:
clf_lr_fasttext.score(test_embeds, label_test)

0.6928571428571428

In [243]:
test_preds_fasttext = clf_lr_fasttext.predict(test_embeds)
list(zip(text_test, test_preds_fasttext, label_test))[:3]

[("well said . men are playing political football with women's right to have control over our bodies . #fail #semst",
  'AGAINST',
  'FAVOR'),
 ('oh you dont like that im pro-choice . pftt . dont try to force me to be pro-life . fuck off #sorrynotsorry #semst',
  'AGAINST',
  'FAVOR'),
 ('just wanted to say that i appreciate the pro life stance . a lot of celebrities are afraid to show a bold stance #lfo #semst',
  'AGAINST',
  'AGAINST')]

In [249]:
f1_score(label_test, test_preds_fasttext, average='macro', labels=['AGAINST','FAVOR','NONE'])

# array([0.81318681, 0.04255319, 0.27586207])
# Micro 0.6928571428571428 Macro 0.3772006912138974

#It's not predicting AGAINST to everything but it has a clear favouritism for majoritay class

0.3772006912138974

In [None]:
# Next step: check how well they perform accounting only the predictions with most confidence