# Imports

In [1]:
import pandas as pd
import fasttext

from nltk.tokenize import TweetTokenizer
import re
import unidecode

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords

import numpy as np

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score, recall_score, f1_score

from scipy import sparse

# Collect Data

I couldn't import the data as directly downloaded from semeval because it gave me an "unknown not utf-8 character" so I imported it as a csv to an Excel and saved it again as train_utf.csv and it worked

In [252]:
traindata = pd.read_csv("semeval.abortion.train.csv", sep=',', encoding="latin1").fillna(method="ffill")
testdata = pd.read_csv("semeval_test_corrected.csv", sep=',', encoding="latin1").fillna(method="ffill")
validationdata = pd.read_csv("semeval.abortion.validation.csv", sep=',', encoding="latin1").fillna(method="ffill")

is_against = traindata['Stance']=='AGAINST'
is_favor = traindata['Stance']=='FAVOR'
is_none = traindata['Stance']=='NONE'

traindata_against = traindata[is_against]
traindata_favor = traindata[is_favor]
traindata_none = traindata[is_none]

## Keep only the abortion tweets ONLY FOR COMPLETE DATASET
# is_abortion = traindata['Stance']=="Legalization of Abortion"
# is_abortion_test = testdata['Stance']=="Legalization of Abortion"
# is_abortion_valid = validationdata['Stance']=="Legalization of Abortion"

# train_abortion = traindata[is_abortion]
# test_abortion = testdata[is_abortion_test]
# valid_abortion = validationdata[is_abortion_test]

Subsampling to equilibrate the classes

In [253]:
traindata = pd.concat([traindata_against[:99], traindata_none[:99], traindata_favor])

In [254]:
traindata

Unnamed: 0,Tweet,Stance
0,Just laid down the law on abortion in my bioet...,AGAINST
2,"Now that there's marriage equality, can we sta...",AGAINST
3,I'll always put all my focus and energy toward...,AGAINST
4,"@BarackObama celebrates ""equality"" while 3000 ...",AGAINST
6,Now that the govt and The Supremes have saved ...,AGAINST
8,In the aim for equality we have forgotten the ...,AGAINST
12,"True equality"" allows all to be born. #SemST",AGAINST
14,Can we make sure #lovewins for babies too? Or ...,AGAINST
15,What about the equal rights of unborn humans i...,AGAINST
24,Okay so how about declaring and protecting equ...,AGAINST


# Preprocessing

In [255]:

# TweetTokenizer basically unterstands arrows, smiley faces and weird punctuation
tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=False)


def my_preprocess(text, keep_hashtags=True):
    toks = tokenizer.tokenize(text)

    ret = []
    for tok in toks:
        if tok[:4] == "#sem":
            continue
        if tok[0] == "#" and not keep_hashtags:
            continue
        if tok[:4] == "http":
            continue
        if tok[0] == "@":
            continue
        # removing numbers
        if tok.isnumeric():
            continue
        ret.append(unidecode.unidecode(tok.lower()))
    return " ".join(ret)

In [256]:

## Train
traindata['Text'] = traindata['Tweet'].apply(lambda x: my_preprocess(x))

# Shifts the order on the original tweet list
sample_train = traindata.sample(frac=1.0)

text_train, label_train = sample_train['Text'], sample_train['Stance']




## Validation
validationdata['Text'] = validationdata['Tweet'].apply(lambda x: my_preprocess(x))

# Shifts the order on the original tweet list
sample_valid = validationdata.sample(frac=1.0)

text_valid, label_valid = sample_valid['Text'], sample_valid['Stance']



## Test
testdata['Text'] = testdata['Tweet'].apply(lambda x: my_preprocess(x))

sample_test = testdata.sample(frac=1.0)

text_test, label_test = sample_test['Text'], sample_test['MyLabeling']

print(list(zip(text_test, label_test)))

[('some babies die by chance , none should die by choice ! #semst', 'AGAINST'), ("#childrenscentre's in the #uk have stay & play sessions for very young / single parents #childcare #parenting #semst", 'NONE'), ('the idea that a " fetus " isn\'t human is outrageous , i mean what else would it be ? a dog ? a dragon ? the unborn are human beings ! #semst', 'AGAINST'), ("don't get pregnant ! make sure you don't get pregnant if you'd just kill your child if you do ! #semst", 'AGAINST'), ('no one has 5he right to tell any person what they should do with their body #semst', 'FAVOR'), ('i am about to deck these bitches in the fucking mouth . #1a #2a #nra #cos #ccot #tgdn #pjnet #wakeupamerica #semst', 'NONE'), ("not ashamed to say it's me . i'm not in favor of enc itself as not banning enc modules . #semst", 'NONE'), ('as a man , the only body i should have a say in is my own . #repealthe8th #rally4life #semst', 'FAVOR'), ('women have outgrown the common housewife stigma long ago #semst', 'NON

# Vectorize

Using a word count Matrix

In [211]:

vectorizer = CountVectorizer(
    binary=True, min_df=0.002, max_df=0.55, ngram_range=(1, 4),
    #stop_words=stopwords.words('spanish')
)

X = vectorizer.fit_transform(pd.concat([text_train, text_valid, text_test]))
X_train = X[:297]
X_valid = X[297:428]
X_test = X[428:]

# Fasttext

### Create a csv file with the label format that fasttext needs

In [177]:
with open("fasttext_train.txt", 'w') as w:
    for (text, label) in zip(text_train, label_train):
        w.write("{} __label__{}\n".format(text, label))
w.close()

### Train

In [261]:
# Find the best number of epochs in order not to overfit
best_epoch = 0
best_f1 = 0
for epoch in [5000, 7000, 10000, 12000, 15000]:    
    model = fasttext.train_supervised('fasttext_train.txt', lr=0.001, dim=500, epoch=epoch)
    
    # Evaluate
    tweet_selected = []
    predictions = []
    label_valid_selection = []
    for tweet, label in zip(text_valid, label_valid):
        prediction = model.predict(tweet)
        if prediction[1][0] > 0.95:
            tweet_selected.append(tweet)
            predictions.append(prediction)
            label_valid_selection.append(label)
    pred_labels = [pred[0][0].replace('__label__','') for pred in predictions]
#     print(list(zip(label_valid_selection, pred_labels)))
    current_f1 = f1_score(label_valid_selection, pred_labels, average="micro", labels=['AGAINST','FAVOR','NONE'])
    if current_f1 > best_f1:
        best_epoch = epoch
        best_f1 = current_f1

In [262]:
model = fasttext.train_supervised('fasttext_train.txt', lr=0.001, dim=500, epoch=best_epoch)
best_f1

# list(zip(tweet_selected, predictions, label_valid_selection))

0.85

In [263]:
best_epoch

7000

### Test

In [264]:
predictions = []
label_test_selection = []
for tweet, label in zip(text_test, label_test):
    prediction = model.predict(tweet)
    if prediction[1] > 0.95:
        print(tweet)
        print(prediction)
        print(label)
        predictions.append(prediction[0][0].replace('__label__', ''))
        label_test_selection.append(label)

print(predictions)
print(label_test_selection)

fyi : unborn babies diagnosed with a disability in the womb can feel pain just as much as a baby without a diagnosis #theyfeelpain #semst
(('__label__AGAINST',), array([0.99789083]))
AGAINST
where is #silenceddiscourse for #divisions on #schoolchoice #taxes #doublestandard #biased #semst
(('__label__NONE',), array([0.95301467]))
NONE
if it's not a person then why do women get invitro ? to have a gold fish ? #semst
(('__label__FAVOR',), array([0.95905626]))
AGAINST
these dems ' lack of compassion for unborn babies who receive prenatal diagnoses is appalling #theyfeelpain #semst
(('__label__AGAINST',), array([0.99868721]))
AGAINST
anti choice laws are sexist because they only effect women * . #semst
(('__label__FAVOR',), array([0.98048228]))
FAVOR
anti-choice laws are entirely men controlling women #semst
(('__label__FAVOR',), array([0.98600876]))
FAVOR
so anti-choice laws are telling these women they're incapable and shouldn't make decisions that involve their own body . #semst
(('__lab

In [265]:
f1_score(label_test_selection, predictions, average="micro", labels=['AGAINST','FAVOR','NONE'])

# With 340 epochs an lr = 0.01, best result so far:
# micro: 0.5607142857142857, macro: 0.4753850446808366, per_class=[0.69411765, 0.41860465, 0.31343284]

0.92

In [35]:
for (text, true, pred) in zip(text_test, label_test, pred_labels):
    print("{}... {} vs {}".format(text, true, pred))

mathewsjeanne thank you your very #kind rt . god bless you #patriot . #babies matter #semst... AGAINST vs AGAINST
good thing i don't live in kingston woulda started a riot by now with this ' pro-life ' protest #semst... FAVOR vs AGAINST
so you've blown right past the whole " women stopped dying after rvw " thing . #prolifelie #hashtagitall #semst... AGAINST vs AGAINST
like i said i'm glad this medical procedure is around so that pregnant people stay healthy . #semst... NONE vs FAVOR
. perhaps you want to go to the fertility clinics & rescue them all . how many can you fit in your womb . #clowncar #semst... AGAINST vs AGAINST
not your uterus , not your choice--from conception on , it really is that simple ! #fem2gen #yesallwomen #semst... FAVOR vs NONE
& purely physical enjoyment with no responsibility attached #semst... NONE vs NONE
we do care about the hurt inflicted upon preborn children through #abortion . #truthtour #ottawa #semst... AGAINST vs AGAINST
leftist , progressives love 2

## Fasttext Embeddings

###  Generate training file as needed by the fasttext lib

In [30]:
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")

db = client["abortion"]

### Filter retweets

In [23]:
# w = open("fasttext_train_unsupervised.txt", 'w')
# unique_tweets = set()
# for t in db.abortion.find({},{"tweet.full_text"}):
#     unique_tweets.add(t['tweet']['full_text'])

# for text in unique_tweets:
#     w.write("{}\n".format(text))

# for text in text_train:
#     w.write("{}\n".format(text))
    
# w.close()

In [48]:
len(unique_tweets)

154726

In [202]:
model_vectors = fasttext.load_model("fasttext-embeddings-abortion.bin")




In [220]:
train_embeds = [[0] * 300] * len(text_train)
for idx, tweet in enumerate(text_train):
    for word in tweet:
        train_embeds[idx] += model_vectors[word]


In [221]:
test_embeds = [[0] * 300] * len(text_test)
for idx, tweet in enumerate(text_test):
    for word in tweet:
        test_embeds[idx] += model_vectors[word]


# Logistic Regression

## With word count matrix

In [212]:
clf = LogisticRegression(penalty='l1', class_weight='balanced', C=1.0, n_jobs=-1)

clf.fit(X_train, label_train)

clf.score(X_train, label_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.8922558922558923

In [213]:
clf.score(X_test, label_test)

0.625

In [214]:
test_preds = clf.predict(X_test)
list(zip(text_test, test_preds, label_test))[:3]

[('actually , child-murder is far worse these days . we live in more savage times . #semst',
  'NONE',
  'AGAINST'),
 ('would you rather have women taking dangerous concoctions to induce abortions or know they are getting a safe & legal one ? #semst',
  'FAVOR',
  'FAVOR'),
 ('remember that there is no " murder " involed in #abortion . i am #procontraception for less #abortion . #semst',
  'FAVOR',
  'FAVOR')]

In [216]:
f1_score(label_test, test_preds, average="micro", labels=['AGAINST','FAVOR','NONE'])

# array([0.67109635, 0.47706422, 0.48      ])
# Micro 0.5821428571428572 Macro 0.5427201885661455

0.625

## With fasttext embeddings

In [222]:
clf_lr_fasttext = LogisticRegression(penalty='l1', class_weight='balanced', C=1.0, n_jobs=-1)
clf_lr_fasttext.fit(train_embeds, label_train)
clf_lr_fasttext.score(train_embeds, label_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.6026936026936027

In [223]:
clf_lr_fasttext.score(test_embeds, label_test)

0.39285714285714285

In [224]:
test_preds_fasttext = clf_lr_fasttext.predict(test_embeds)
list(zip(text_test, test_preds_fasttext, label_test))[:3]

[('actually , child-murder is far worse these days . we live in more savage times . #semst',
  'NONE',
  'AGAINST'),
 ('would you rather have women taking dangerous concoctions to induce abortions or know they are getting a safe & legal one ? #semst',
  'FAVOR',
  'FAVOR'),
 ('remember that there is no " murder " involed in #abortion . i am #procontraception for less #abortion . #semst',
  'AGAINST',
  'FAVOR')]

In [225]:
f1_score(label_test, test_preds_fasttext, average='macro', labels=['AGAINST','FAVOR','NONE'])

# array([0.81318681, 0.04255319, 0.27586207])
# Micro 0.6928571428571428 Macro 0.3772006912138974

#It's not predicting AGAINST to everything but it has a clear favouritism for majoritay class

0.39594580402310503

In [None]:
# Next step: check how well they perform accounting only the predictions with most confidence

# Enhance training corpus using fasttext classifier

### Predict on unlabeled corpus

In [259]:
f = open("abortion_unlabeled.csv", 'r')
w = open("enhanced_labeled_corpus_using_fasttext", 'w')
for idx, line in enumerate(f):
    tweet = line.replace('\n','')
    prediction = model.predict(tweet)
    if prediction[1] > 0.98 and tweet != "":
        w.write("{}__label__{}\n".format(tweet, prediction))

### Mix train data with predictions

In [260]:
predictions = open("enhanced_labeled_corpus_using_fasttext", 'r')
training = open("fasttext_train.txt", 'r')
w = open("tmp_train_with_predictions_fasttext.txt", 'w')

for idx, line in enumerate(training):
    w.write("{}".format(line))
    
for idx, line in enumerate(predictions):
    w.write("{}".format(line))

### Retrain new model

In [270]:
# Find the best number of epochs in order not to overfit
best_epoch_enhanced = 0
best_f1_enhanced = 0
model_enhanced = None
for epoch in [5000, 7000, 10000, 12000, 15000]:    
    model_enhanced = fasttext.train_supervised('tmp_train_with_predictions_fasttext.txt', lr=0.001, dim=500, epoch=epoch)
    
    # Evaluate
    tweet_selected = []
    predictions = []
    label_valid_selection = []
    for tweet, label in zip(text_valid, label_valid):
        prediction = model_enhanced.predict(tweet)
        tweet_selected.append(tweet)
        predictions.append(prediction)
        label_valid_selection.append(label)
    pred_labels = [pred[0][0].replace('__label__','') for pred in predictions]
#     print(list(zip(label_valid_selection, pred_labels)))
    current_f1_enhanced = f1_score(label_valid_selection, pred_labels, average="macro", labels=['AGAINST','FAVOR','NONE'])
    if current_f1_enhanced > best_f1_enhanced:
        best_epoch_enhanced = epoch
        best_f1_enhanced = current_f1_enhanced

In [271]:
best_f1_enhanced

0.5829489313360282

In [None]:
# Write doc with amount of tweets and confidence used
# Use F1 using only against and favor
# Use a random seed to get the train subsample or save the subsample test corpus and upload it
# Description on the subsumpling