# Imports

In [203]:
import pandas as pd
import fasttext

from nltk.tokenize import TweetTokenizer
import re
import unidecode

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords

import numpy as np

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score, recall_score, f1_score

from scipy import sparse

# Collect Data

I couldn't import the data as directly downloaded from semeval because it gave me an "unknown not utf-8 character" so I imported it as a csv to an Excel and saved it again as train_utf.csv and it worked

In [20]:
traindata = pd.read_csv("train_utf.csv", sep=',', encoding="latin1").fillna(method="ffill")
testdata = pd.read_csv("test_utf.csv", sep=',', encoding="latin1").fillna(method="ffill")

## Keep only the abortion tweets
is_abortion = traindata['Target']=="Legalization of Abortion"
is_abortion_test = testdata['Target']=="Legalization of Abortion"
train_abortion = traindata[is_abortion]
test_abortion = testdata[is_abortion_test]

# Preprocessing

In [21]:

# TweetTokenizer basically unterstands arrows, smiley faces and weird punctuation
tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=False)


def my_preprocess(text, keep_hashtags=True):
    toks = tokenizer.tokenize(text)

    ret = []
    for tok in toks:
        if tok[0] == "#" and not keep_hashtags:
            continue
        if tok[:4] == "http":
            continue
        if tok[0] == "@":
            continue
        # removing numbers
        if tok.isnumeric():
            continue
        ret.append(unidecode.unidecode(tok.lower()))
    return " ".join(ret)

In [22]:

## Train
train_abortion['Text'] = train_abortion['Tweet'].apply(lambda x: my_preprocess(x))

# Shifts the order on the original tweet list
sample_train = train_abortion.sample(frac=1.0)

text_train, label_train = sample_train['Text'], sample_train['Stance']

## Test
test_abortion['Text'] = test_abortion['Tweet'].apply(lambda x: my_preprocess(x))

sample_test = test_abortion.sample(frac=1.0)

text_test, label_test = sample_test['Text'], sample_test['Stance']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


# Vectorize

Using a word count Matrix

In [176]:

vectorizer = CountVectorizer(
    binary=True, min_df=0.002, max_df=0.55, ngram_range=(1, 4),
    #stop_words=stopwords.words('spanish')
)

X = vectorizer.fit_transform(pd.concat([text_train,text_test]))
X_train = X[:653]
X_test = X[653:]

<1x3659 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

# Fasttext

### Create a csv file with the label format that fasttext needs

In [24]:
with open("fasttext_train.txt", 'w') as w:
    for (text, label) in zip(text_train, label_train):
        w.write("{} __label__{}\n".format(text, label))
w.close()

### Train

In [160]:
# Find the best number of epochs in order not to overfit
best_epoch = 0
best_f1 = 0
for epoch in [410, 420,430,440,450,460,470,480,490,500]:    
    model = fasttext.train_supervised('fasttext_train.txt', lr=0.01, dim=300, epoch=epoch)
    
    # Evaluate
    predictions = []
    for tweet in text_test:
        predictions.append(model.predict(tweet))
    pred_labels = [pred[0][0].replace('__label__','') for pred in predictions]
    current_f1 = f1_score(label_test, pred_labels, average="macro", labels=['AGAINST','FAVOR','NONE'])
    if current_f1 > best_f1:
        best_epoch = epoch
        best_f1 = current_f1

In [161]:
model = fasttext.train_supervised('fasttext_train.txt', lr=0.01, dim=300, epoch=best_epoch)
best_epoch

490

### Test

In [162]:
predictions = []
for tweet in text_test:
    predictions.append(model.predict(tweet))

pred_labels = [pred[0][0].replace('__label__','') for pred in predictions]

In [166]:
f1_score(label_test, pred_labels, average=None, labels=['AGAINST','FAVOR','NONE'])

# With 490 epochs an lr = 0.01, best result so far:
# micro: 0.6071428571428571, macro: 0.5256226053639846, per_class=[0.7183908 , 0.47916667, 0.37931034]

array([0.7183908 , 0.47916667, 0.37931034])

## Fasttext Embeddings

###  Generate training file as needed by the fasttext lib

In [167]:
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")

db = client["abortion"]

### Filter retweets

In [170]:
w = open("fasttext_train_unsupervised.txt", 'w')
unique_tweets = set()
for t in db.abortion.find({},{"tweet.full_text"}):
    unique_tweets.add(t['tweet']['full_text'])

for text in unique_tweets:
    w.write("{}\n".format(text))

for text in text_train:
    w.write("{}\n".format(text))
    
w.close()

In [171]:
len(unique_tweets)

154726

In [231]:
model_vectors = fasttext.train_supervised('fasttext_train_unsupervised.txt', lr=0.01, dim=300, epoch=500)

In [239]:
train_embeds = [[0] * 300] * len(text_train)
for idx, tweet in enumerate(text_train):
    for word in tweet:
        train_embeds[idx] += model_vectors[word]


In [240]:
test_embeds = [[0] * 300] * len(text_test)
for idx, tweet in enumerate(text_test):
    for word in tweet:
        test_embeds[idx] += model_vectors[word]


<1x3659 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

# Logistic Regression

## With word count matrix

In [6]:
clf = LogisticRegression(penalty='l1', class_weight='balanced', C=1.0, n_jobs=-1)

clf.fit(X_train, label_train)

clf.score(X_train, label_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.9127105666156202

In [7]:
clf.score(X_test, label_test)

0.5821428571428572

In [8]:
test_preds = clf.predict(X_test)
list(zip(text_test, test_preds, label_test))[:3]

[('i thought they wanted less unwed mommas and less abortion . #confused #tcot #feminism #semst',
  'AGAINST',
  'AGAINST'),
 ('prayers for babies urgent prayer one in lexington ky & two in dallas tx & in chattanooga tn life begins at conception #semst',
  'AGAINST',
  'AGAINST'),
 ('watch out for censorship in your news and media ! #politicalrevolution #immigration #rednationrising #constitution #corruptmedia #semst',
  'NONE',
  'NONE'),
 ("pro-choice =p ro-life . anti-choice = anti-life . don't play righteous when you're advocating the endangerment of women . #womensrights #semst",
  'FAVOR',
  'FAVOR'),
 ("yes . antis just don't make sense . #waronwomen #semst", 'NONE', 'NONE'),
 ('i hope you all either enjoy the rugby or enjoy not enjoying the rugby #semst',
  'NONE',
  'AGAINST'),
 ("i'm good like this . i've always been #semst", 'NONE', 'NONE'),
 ('mathewsjeanne thank you your very #kind rt . god bless you #patriot . #babies matter #semst',
  'AGAINST',
  'AGAINST'),
 ('yes , an

In [14]:
f1_score(label_test, test_preds, average="macro", labels=['AGAINST','FAVOR','NONE'])

# array([0.67109635, 0.47706422, 0.48      ])
# Micro 0.5821428571428572 Macro 0.5427201885661455

0.5427201885661455

## With fasttext embeddings

In [241]:
clf_lr_fasttext = LogisticRegression(penalty='l1', class_weight='balanced', C=1.0, n_jobs=-1)
clf_lr_fasttext.fit(train_embeds, label_train)
clf_lr_fasttext.score(train_embeds, label_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.557427258805513

In [242]:
clf_lr_fasttext.score(test_embeds, label_test)

0.6928571428571428

In [243]:
test_preds_fasttext = clf_lr_fasttext.predict(test_embeds)
list(zip(text_test, test_preds_fasttext, label_test))[:3]

[("well said . men are playing political football with women's right to have control over our bodies . #fail #semst",
  'AGAINST',
  'FAVOR'),
 ('oh you dont like that im pro-choice . pftt . dont try to force me to be pro-life . fuck off #sorrynotsorry #semst',
  'AGAINST',
  'FAVOR'),
 ('just wanted to say that i appreciate the pro life stance . a lot of celebrities are afraid to show a bold stance #lfo #semst',
  'AGAINST',
  'AGAINST')]

In [249]:
f1_score(label_test, test_preds_fasttext, average='macro', labels=['AGAINST','FAVOR','NONE'])

# array([0.81318681, 0.04255319, 0.27586207])
# Micro 0.6928571428571428 Macro 0.3772006912138974

#It's not predicting AGAINST to everything but it has a clear favouritism for majoritay class

0.3772006912138974

In [None]:
# Next step: check how well they perform accounting only the predictions with most confidence