# Imports

In [31]:
import pandas as pd
import fasttext

from nltk.tokenize import TweetTokenizer
import re
import unidecode

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords

import numpy as np

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from scipy import sparse

import random

from sklearn import svm

# Collect Data

I couldn't import the data as directly downloaded from semeval because it gave me an "unknown not utf-8 character" so I imported it as a csv to an Excel and saved it again as train_utf.csv and it worked

In [32]:
traindata = pd.read_csv("semeval.abortion.train.csv", sep=',', encoding="latin1").fillna(method="ffill")
testdata = pd.read_csv("semeval_test_corrected.csv", sep=',', encoding="latin1").fillna(method="ffill")
testdata_wrong = pd.read_csv("semeval.abortion.test.csv", sep=',', encoding="latin1").fillna(method="ffill")
validationdata = pd.read_csv("semeval.abortion.validation.csv", sep=',', encoding="latin1").fillna(method="ffill")

is_against = traindata['Stance']=='AGAINST'
is_favor = traindata['Stance']=='FAVOR'
is_none = traindata['Stance']=='NONE'

traindata_against = traindata[is_against]
traindata_favor = traindata[is_favor]
traindata_none = traindata[is_none]

Subsampling to equilibrate the classes

In [33]:
seed = 43
# traindata_balanced = pd.concat([traindata_against.sample(n=99, random_state=seed), traindata_none.sample(n=99, random_state=seed), traindata_favor.sample(frac=1.0, random_state=seed)])

# Preprocessing

In [34]:
# TweetTokenizer basically unterstands arrows, smiley faces and weird punctuation
tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=False)


def my_preprocess(text, keep_hashtags=True):
    toks = tokenizer.tokenize(text)

    ret = []
    for tok in toks:
#         if tok[:4] == "#sem":
#             continue
        if tok[0] == "#" and not keep_hashtags:
            continue
        if tok[:4] == "http":
            continue
        if tok[0] == "@":
            continue
        # removing numbers
#         if tok.isnumeric():
#             continue
        ret.append(tok)
    return " ".join(ret)

In [35]:
## Train
traindata['Text'] = traindata['Tweet'].apply(lambda x: my_preprocess(x))

# Shifts the order on the original tweet list
sample_train = traindata.sample(frac=1.0)

text_train, label_train = sample_train['Text'], sample_train['Stance']


# # ## Train Balanced (with sub-sampling)
# traindata_balanced['Text'] = traindata_balanced['Tweet'].apply(lambda x: my_preprocess(x))

# # Shifts the order on the original tweet list
# sample_train_balanced = traindata_balanced.sample(frac=1.0)

# text_train_balanced, label_train_balanced = sample_train_balanced['Text'], sample_train_balanced['Stance']


## Validation
validationdata['Text'] = validationdata['Tweet'].apply(lambda x: my_preprocess(x))

# Shifts the order on the original tweet list
sample_valid = validationdata.sample(frac=1.0)

text_valid, label_valid = sample_valid['Text'], sample_valid['Stance']



## Test
testdata['Text'] = testdata['Tweet'].apply(lambda x: my_preprocess(x))

sample_test = testdata.sample(frac=1.0)

text_test, label_test = sample_test['Text'], sample_test['MyLabeling']


## Test with original badly anotated dataset

testdata_wrong['Text'] = testdata_wrong['Tweet'].apply(lambda x: my_preprocess(x))

sample_test_wrong = testdata_wrong.sample(frac=1.0)

text_test_wrong, label_test_wrong = sample_test_wrong['Text'], sample_test_wrong['Stance']

# print(list(zip(text_test, label_test)))

# Choose Train and Test

In [36]:
# Add or remove a '_balanced' after the text_train and label_train
texttrain = text_train
labeltrain = label_train

In [87]:
import re
def camel_case_split(identifier):
    if identifier == "#SemST":
        return []
    if identifier[0] == '#':
        matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier[1:])
        return [m.group(0) for m in matches]
    else:
        return [identifier]

# add '_wrong' or not after the variables
texttest, labeltest = [" ".join([" ".join(camel_case_split(word)) for word in text.split(" ")]) for text in text_test_wrong], label_test_wrong
textvalid, labelvalid = [" ".join([" ".join(camel_case_split(word)) for word in text.split(" ")]) for text in text_valid], label_valid

# Representation of tweet

## Vectorize

In [38]:

vectorizer = CountVectorizer(
    binary=True, min_df=0.0075, max_df=0.75, ngram_range=(1, 5),
    #stop_words=stopwords.words('spanish')
)

X = vectorizer.fit_transform([*texttrain, *textvalid, *texttest])
# VEC_train = vectorizer.fit_transform([*texttrain])
# VEC_valid = vectorizer.transform([*textvalid])
# VEC_test = vectorizer.transform([*texttest])

# print(X.shape)

VEC_train = X[:len(texttrain)]
VEC_valid = X[len(texttrain):len(texttrain) + len(textvalid)]
VEC_test = X[len(texttrain) + len(textvalid):len(texttrain) + len(textvalid) + len(texttest)]

In [39]:
VEC_train.shape

(522, 426)

## Fasttext embedding

In [None]:
model_vectors = fasttext.load_model("fasttext-embeddings-abortion.bin")

# SVM's

In [52]:
svc = svm.SVC(probability=True, kernel='polynomial', gamma='scale')
svc.fit(VEC_train, labeltrain)

ValueError: 'polynomial' is not in list

In [42]:
svc.score(VEC_train, labeltrain)

0.9846743295019157

### Test

In [43]:

svc.score(VEC_test, labeltest)

predictions = []
for vec_test, label, tweet in zip(VEC_test, labeltest, texttest):
    predictions.append(svc.predict(vec_test))
    
f1_score(labeltest, predictions, average=None, labels=["AGAINST", "FAVOR", "NONE"])
accuracy_score(labeltest, predictions)

0.6071428571428571

In [44]:
predictions = []
for vec_test, label, tweet in zip(VEC_test, labeltest, texttest):
    predictions.append(svc.predict(vec_test))
#     print(label)
#     print(tweet)

In [48]:
f1_score(labeltest, predictions, average=None, labels=["AGAINST", "FAVOR", "NONE"])

array([0.65217391, 0.52554745, 0.61139896])

In [49]:
accuracy_score(labeltest, predictions)

0.6071428571428571

In [18]:
def highest_prob(probs):
    highest = 0
    for i in range(3):
        if probs[i] > highest:
            highest = probs[i]
    return highest

def count_above_threshold(probs, threshold):
    count = 0
    for i in range(3):
        if probs[i] > threshold:
            count += 1
    return count

In [19]:
predictions_probs = []
tweets_selected = []
label_valid_selection = []
predictions = []
for vec_test, label, tweet in zip(VEC_test, labeltest, texttest):
    predictions_probs.append(svc.predict_proba(vec_test)[0])
    tweets_selected.append(tweet)
    label_valid_selection.append(label)
    predictions.append(svc.predict(vec_test)[0])
tweet_truth_prob_pred = zip(tweets_selected, label_valid_selection, predictions_probs, predictions)
tweet_truth_prob_pred = sorted(tweet_truth_prob_pred, key=lambda x: -highest_prob(x[2]) if(count_above_threshold(x[2], 0.25) == 1) else 1)

predictions_sorted = []
truth_sorted = []
for tweet, truth, probs, pred in tweet_truth_prob_pred:
    predictions_sorted.append(pred)
    truth_sorted.append(truth)

f1_score(truth_sorted[:10], predictions_sorted[:10], average="micro", labels=["AGAINST", "FAVOR", "NONE"])

#     if highest_prob(probs[0]) > 0.8 and count_above_threshold(probs[0], 0.3) == 1:
#         print(tweet)
#         print(svc.predict(vec_test)[0])
#         print(label)
#         print(svc.predict_proba(vec_test))
#     print(label)
#     print(tweet)

0.9

# Retrain

In [20]:
# Initial value
enhanced_text_train = list(texttrain)
enhanced_label_train = list(labeltrain)

# Predict on unlabeled data

In [22]:
corpus_number = 10

In [23]:
corpus_number = 10

f = open("random_partitions_of_unlabeled_corpus/abortion_unlabeled_sample_{}.csv".format(corpus_number), 'r')
predictions = []
predictions_probs = []
tweets = []
for idx, line in enumerate(f):
    tweet = line.replace('\n','')
    tweets.append(tweet)
    Y = vectorizer.transform([tweet])
    predictions.append(svc.predict(Y)[0])
    predictions_probs.append(svc.predict_proba(Y)[0])

tweet_prob_pred = zip(tweets, predictions_probs, predictions)
tweet_prob_pred = sorted(tweet_prob_pred, key=lambda x: -highest_prob(x[1]) if(count_above_threshold(x[1], 0.25) == 1) else 1)

#     prediction = svc.predict(Y)[0]
#     tweets.append(tweet)
#     predictions.append(prediction)
    
# tweet_pred = zip(tweets, predictions)
# tweet_pred = sorted(tweet_pred, key=lambda p: -p[1][1][0])

tweet_prob_pred[:20]

[("Aren ’ t politicians meant to be intelligent ? Politicians debate changing ' pregnant women ' to ' pregnant people '",
  array([0.03379076, 0.94322589, 0.02298335]),
  'FAVOR'),
 ('I support Pro Life . Abortion is the murder of an innocent human',
  array([0.92886476, 0.06179444, 0.0093408 ]),
  'AGAINST'),
 ('Really ? False ? Are you genuinely unaware that abortion is the murder of a baby ?',
  array([0.91668567, 0.04754896, 0.03576537]),
  'AGAINST'),
 ('ATP : Abortion : GOP Virginia Candidate – Ankle Bracelets for Pregnant Women abortion Virginia prochoice',
  array([0.06152358, 0.91428169, 0.02419473]),
  'FAVOR'),
 ('Wow , Virginia is about to enter the 21st century . Hold on to your tricorner hats .',
  array([0.06933862, 0.02262946, 0.90803192]),
  'NONE'),
 ('This is a bad judge .',
  array([0.05758508, 0.03451112, 0.9079038 ]),
  'NONE'),
 ('This is Democrats', array([0.05758508, 0.03451112, 0.9079038 ]), 'NONE'),
 ('This is fucking disgusting .',
  array([0.05758508, 0.034

# Retrain Checkpoint

## Balance the predicted data

In [101]:
ag_count = 0
fv_count = 0
none_count = 0
pred_limit = 20
against_limit = 50 * 0.5
favor_limit = 26 * 0.5
none_limit = 24 * 0.5
enhanced_text = []
enhanced_label = []
for tweet, prob, pred in tweet_prob_pred:
    if pred == "AGAINST" and ag_count < against_limit:
#         train_ft.append("{} - {} - {}".format(tweet, pred, prob))
        enhanced_text.append(tweet)
        enhanced_label.append(pred)
        ag_count += 1
    if pred == "FAVOR" and fv_count < favor_limit:
#         train_ft.append("{} - {} - {}".format(tweet, pred, prob))
        enhanced_text.append(tweet)
        enhanced_label.append(pred)
        fv_count += 1
    if pred == "NONE" and none_count < none_limit:
#         train_ft.append("{} - {} - {}".format(tweet, pred, prob))
        enhanced_text.append(tweet)
        enhanced_label.append(pred)
        none_count += 1


In [102]:
enhanced_text_train = enhanced_text_train + enhanced_text
enhanced_label_train = enhanced_label_train + enhanced_label
len(enhanced_text_train)

922

In [103]:

vectorizer = CountVectorizer(
    binary=True, min_df=0.0075, max_df=0.75, ngram_range=(1, 5),
    #stop_words=stopwords.words('spanish')
)

X_enhanced = vectorizer.fit_transform([*enhanced_text_train, *textvalid, *texttest])
# VEC_train = vectorizer.fit_transform([*texttrain])
# VEC_valid = vectorizer.transform([*textvalid])
# VEC_test = vectorizer.transform([*texttest])

# print(X.shape)

VEC_train_enhanced = X_enhanced[:len(enhanced_text_train)]
VEC_valid_enhanced = X_enhanced[len(enhanced_text_train):len(enhanced_text_train) + len(textvalid)]
VEC_test_enhanced = X_enhanced[len(enhanced_text_train) + len(textvalid):len(enhanced_text_train) + len(textvalid) + len(texttest)]

In [104]:
svc_enhanced = svc = svm.SVC(probability=True, kernel='rbf')#, gamma='scale')
svc_enhanced.fit(VEC_train_enhanced, enhanced_label_train)
svc_enhanced.score(VEC_train_enhanced, enhanced_label_train)

0.9620390455531453

# Test

In [105]:
svc_enhanced.score(VEC_test_enhanced, labeltest)

0.6535714285714286

In [106]:
svc_enhanced.score(VEC_test_enhanced, labeltest)
predictions = []
labels = []
for vec_test, label in zip(VEC_test_enhanced, labeltest):
    predictions.append(svc_enhanced.predict(vec_test)[0])
    labels.append(label)

In [107]:
list(zip(predictions, labels))

[('AGAINST', 'NONE'),
 ('NONE', 'NONE'),
 ('FAVOR', 'NONE'),
 ('NONE', 'FAVOR'),
 ('NONE', 'NONE'),
 ('NONE', 'NONE'),
 ('AGAINST', 'NONE'),
 ('AGAINST', 'FAVOR'),
 ('AGAINST', 'AGAINST'),
 ('AGAINST', 'AGAINST'),
 ('AGAINST', 'AGAINST'),
 ('NONE', 'FAVOR'),
 ('NONE', 'NONE'),
 ('AGAINST', 'NONE'),
 ('AGAINST', 'AGAINST'),
 ('NONE', 'NONE'),
 ('AGAINST', 'AGAINST'),
 ('NONE', 'FAVOR'),
 ('FAVOR', 'FAVOR'),
 ('AGAINST', 'NONE'),
 ('AGAINST', 'AGAINST'),
 ('AGAINST', 'NONE'),
 ('AGAINST', 'FAVOR'),
 ('NONE', 'FAVOR'),
 ('AGAINST', 'NONE'),
 ('FAVOR', 'NONE'),
 ('NONE', 'NONE'),
 ('FAVOR', 'FAVOR'),
 ('AGAINST', 'AGAINST'),
 ('AGAINST', 'AGAINST'),
 ('AGAINST', 'NONE'),
 ('AGAINST', 'AGAINST'),
 ('AGAINST', 'FAVOR'),
 ('AGAINST', 'AGAINST'),
 ('AGAINST', 'AGAINST'),
 ('AGAINST', 'AGAINST'),
 ('AGAINST', 'AGAINST'),
 ('AGAINST', 'FAVOR'),
 ('AGAINST', 'AGAINST'),
 ('AGAINST', 'AGAINST'),
 ('AGAINST', 'FAVOR'),
 ('AGAINST', 'FAVOR'),
 ('FAVOR', 'FAVOR'),
 ('NONE', 'NONE'),
 ('AGAINST', 'AGA

In [108]:
f1_score(labels, predictions, average=None, labels=["AGAINST", "FAVOR", "NONE"])
accuracy_score(labeltest, predictions)

array([0.6953125 , 0.55714286, 0.67073171])

In [109]:
accuracy_score(labeltest, predictions)

0.6535714285714286

# Predict on unlabeled data

In [110]:
corpus_number += 1
corpus_number

18

In [111]:
f = open("random_partitions_of_unlabeled_corpus/abortion_unlabeled_sample_{}.csv".format(corpus_number), 'r')
predictions = []
predictions_probs = []
tweets = []
for idx, line in enumerate(f):
    tweet = line.replace('\n','')
    tweets.append(tweet)
    Y = vectorizer.transform([tweet])
    predictions.append(svc_enhanced.predict(Y)[0])
    predictions_probs.append(svc_enhanced.predict_proba(Y)[0])

tweet_prob_pred = zip(tweets, predictions_probs, predictions)
tweet_prob_pred = sorted(tweet_prob_pred, key=lambda x: -highest_prob(x[1]) if(count_above_threshold(x[1], 0.25) == 1) else 1)

tweet_prob_pred[:20]

[('" Women Denied Abortion May Face Long-Lasting Health Problems " - Study',
  array([0.00198839, 0.99307007, 0.00494154]),
  'FAVOR'),
 ("Please support women's rights to reproductive health and access to services .",
  array([0.00294184, 0.98778602, 0.00927214]),
  'FAVOR'),
 ('Factors influencing decision-making power regarding reproductive health and rights among married women in Mettu rural district , south-west , Ethiopia :',
  array([0.00263875, 0.98769769, 0.00966356]),
  'FAVOR'),
 ('Nairobi summit on ICPD 25 to push for strong commitments to sexual and reproductive health and rights -',
  array([0.00401114, 0.98442427, 0.01156459]),
  'FAVOR'),
 ('Who Speaks For The Child Abortion Is Murder Heartbeat Is Life',
  array([0.97631116, 0.01566921, 0.00801963]),
  'AGAINST'),
 ('Pray The Rosary . First Friday . Ave Maria . Blessed Virgin Mary . Our Lady . Hail Holy Queen . Sacred Heart Of Jesus . Catholic . Ave Maria Gratia Plena . Catholics . Pray To End Abortion . Pro Life . Cath

In [None]:
# Hiperparametros a testear:
#     Top K parameters: fijo en 200
#     Kernel functions
#     C
#     3 iteraciones de bootstrap. Se puede testear hasta 10 en dev
#     Agregar según la distribución del training o segun los primeros q aparezcan

# Alltogether

In [50]:
def highest_prob(probs):
    highest = 0
    for i in range(3):
        if probs[i] > highest:
            highest = probs[i]
    return highest

def count_above_threshold(probs, threshold):
    count = 0
    for i in range(3):
        if probs[i] > threshold:
            count += 1
    return count

In [88]:

kernel = 'rbf'

random.seed(43)

corpus_number = random.randrange(0,20,1)

print("initial corpus: {}".format(corpus_number))

pred_limit = 200
against_limit = 50 * 2
favor_limit = 26 * 2
none_limit = 24 * 2

vectorizer = CountVectorizer(
    binary=True, min_df=0.0075, max_df=0.75, ngram_range=(1, 5),
    #stop_words=stopwords.words('spanish')
)

X = vectorizer.fit_transform([*texttrain, *textvalid, *texttest])

VEC_train = X[:len(texttrain)]
VEC_valid = X[len(texttrain):len(texttrain) + len(textvalid)]
VEC_test = X[len(texttrain) + len(textvalid):len(texttrain) + len(textvalid) + len(texttest)]


svc = svm.SVC(probability=True, kernel=kernel, gamma='scale')#, class_weight='balanced')
svc.fit(VEC_train, labeltrain)

# svc.score(VEC_train, labeltrain)
# svc.score(VEC_test, labeltest)

predictions = []
for vec_test, label, tweet in zip(VEC_test, labeltest, texttest):
    predictions.append(svc.predict(vec_test))

print("Metrics on test on initial classifier")
print(f1_score(labeltest, predictions, average=None, labels=["AGAINST", "FAVOR", "NONE"]))
print("macro: {}".format(f1_score(labeltest, predictions, average="macro", labels=["AGAINST", "FAVOR", "NONE"])))
print("micro: {}".format(f1_score(labeltest, predictions, average="micro", labels=["AGAINST", "FAVOR", "NONE"])))
print("precision")
print(precision_score(labeltest, predictions, average=None, labels=["AGAINST", "FAVOR", "NONE"]))
print("macro: {}".format(precision_score(labeltest, predictions, average="macro", labels=["AGAINST", "FAVOR", "NONE"])))
print("micro: {}".format(precision_score(labeltest, predictions, average="micro", labels=["AGAINST", "FAVOR", "NONE"])))
print("recall")
print(recall_score(labeltest, predictions, average=None, labels=["AGAINST", "FAVOR", "NONE"]))
print("macro: {}".format(recall_score(labeltest, predictions, average="macro", labels=["AGAINST", "FAVOR", "NONE"])))
print("micro: {}".format(recall_score(labeltest, predictions, average="micro", labels=["AGAINST", "FAVOR", "NONE"])))

print("accuracy: {}".format(accuracy_score(labeltest, predictions)))

# Initial value
enhanced_text_train = list(texttrain)
enhanced_label_train = list(labeltrain)

f1_av = [0] * 5
f1_against = [0] * 5
f1_favor = [0] * 5
macro = [0] * 5
micro = [0] * 5
prec_against = [0] * 5
prec_favor = [0] * 5
prec_av = [0] * 5
prec_macro = [0] * 5
rec_against = [0] * 5
rec_favor = [0] * 5
rec_av = [0] * 5
rec_macro = [0] * 5
acc = [0] * 5
for j in range(5):
    random.seed(43 + j)
    corpus_number = random.randrange(0,20,1)
    f = open("random_partitions_of_unlabeled_corpus/abortion_unlabeled_sample_{}.csv".format(corpus_number), 'r')
    predictions = []
    predictions_probs = []
    tweets = []
    for idx, line in enumerate(f):
        tweet = line.replace('\n','')
        tweets.append(tweet)
        Y = vectorizer.transform([tweet])
        predictions.append(svc.predict(Y)[0])
        predictions_probs.append(svc.predict_proba(Y)[0])

    tweet_prob_pred = zip(tweets, predictions_probs, predictions)
    tweet_prob_pred = sorted(tweet_prob_pred, key=lambda x: -highest_prob(x[1]) if(count_above_threshold(x[1], 0.25) == 1) else 1)

    # Initial value
    enhanced_text_train = list(texttrain)
    enhanced_label_train = list(labeltrain)
    print("Training set original size: {}".format(len(enhanced_text_train)))


    for i in range(5):

        # Balance the predicted data
        ag_count = 0
        fv_count = 0
        none_count = 0
        enhanced_text = []
        enhanced_label = []
        for tweet, prob, pred in tweet_prob_pred:
#             enhanced_text.append(tweet)
#             enhanced_label.append(pred)
            if pred == "AGAINST" and ag_count < against_limit:
        #         train_ft.append("{} - {} - {}".format(tweet, pred, prob))
                enhanced_text.append(tweet)
                enhanced_label.append(pred)
                ag_count += 1
            if pred == "FAVOR" and fv_count < favor_limit:
        #         train_ft.append("{} - {} - {}".format(tweet, pred, prob))
                enhanced_text.append(tweet)
                enhanced_label.append(pred)
                fv_count += 1
            if pred == "NONE" and none_count < none_limit:
        #         train_ft.append("{} - {} - {}".format(tweet, pred, prob))
                enhanced_text.append(tweet)
                enhanced_label.append(pred)
                none_count += 1

        # Expand the training set
        enhanced_text_train = enhanced_text_train + enhanced_text
        enhanced_label_train = enhanced_label_train + enhanced_label
        print("Training set new size: {}".format(len(enhanced_text_train)))


        vectorizer_enhanced = CountVectorizer(
            binary=True, min_df=0.0075, max_df=0.75, ngram_range=(1, 5),
        )

        X_enhanced = vectorizer_enhanced.fit_transform([*enhanced_text_train, *textvalid, *texttest])
        # VEC_train = vectorizer.fit_transform([*texttrain])
        # VEC_valid = vectorizer.transform([*textvalid])
        # VEC_test = vectorizer.transform([*texttest])

        # print(X.shape)

        VEC_train_enhanced = X_enhanced[:len(enhanced_text_train)]
        VEC_valid_enhanced = X_enhanced[len(enhanced_text_train):len(enhanced_text_train) + len(textvalid)]
        VEC_test_enhanced = X_enhanced[len(enhanced_text_train) + len(textvalid):len(enhanced_text_train) + len(textvalid) + len(texttest)]

        svc_enhanced = svm.SVC(probability=True, kernel=kernel, gamma='scale')#, class_weight='balanced')
        svc_enhanced.fit(VEC_train_enhanced, enhanced_label_train)
        svc_enhanced.score(VEC_train_enhanced, enhanced_label_train)

        svc_enhanced.score(VEC_test_enhanced, labeltest)
        predictions = []
        labels = []
        for vec_test, label in zip(VEC_test_enhanced, labeltest):
            predictions.append(svc_enhanced.predict(vec_test)[0])
            labels.append(label)

    #     print("Iteration {}".format(i))
        f1 = f1_score(labels, predictions, average=None, labels=["AGAINST", "FAVOR", "NONE"])
        f1_av[i] += (f1[0] + f1[1]) / 2
        f1_against[i] += f1[0]
        f1_favor[i] += f1[1]
        macro[i] += f1_score(labels, predictions, average="macro", labels=["AGAINST", "FAVOR", "NONE"])
        micro[i] += f1_score(labels, predictions, average="micro", labels=["AGAINST", "FAVOR", "NONE"])
        acc[i] += accuracy_score(labeltest, predictions)
        prec = precision_score(labels, predictions, average=None, labels=["AGAINST", "FAVOR", "NONE"])
        prec_against[i] += prec[0]
        prec_favor[i] += prec[1]
        prec_av[i] += (prec[0] + prec[1]) / 2
        prec_macro[i] += precision_score(labels, predictions, average="macro", labels=["AGAINST", "FAVOR", "NONE"])
        rec = recall_score(labels, predictions, average=None, labels=["AGAINST", "FAVOR", "NONE"])
        rec_against[i] += rec[0]
        rec_favor[i] += rec[1]
        rec_av[i] += (rec[0] + rec[1]) / 2
        rec_macro[i] += recall_score(labels, predictions, average="macro", labels=["AGAINST", "FAVOR", "NONE"])

        corpus_number = (corpus_number + 1) % 20
        f = open("random_partitions_of_unlabeled_corpus/abortion_unlabeled_sample_{}.csv".format(corpus_number), 'r')
        predictions = []
        predictions_probs = []
        tweets = []
        for idx, line in enumerate(f):
            tweet = line.replace('\n','')
            tweets.append(tweet)
            Y = vectorizer_enhanced.transform([tweet])
            predictions.append(svc_enhanced.predict(Y)[0])
            predictions_probs.append(svc_enhanced.predict_proba(Y)[0])

        tweet_prob_pred = zip(tweets, predictions_probs, predictions)
        tweet_prob_pred = sorted(tweet_prob_pred, key=lambda x: -highest_prob(x[1]) if(count_above_threshold(x[1], 0.25) == 1) else 1)

for i in range(5):
    print("{},{},{},{},{},{},{},{},{},{},{},{},{},{}".format(macro[i] / 5, f1_av[i] / 5, micro[i] / 5, f1_against[i] / 5, f1_favor[i] / 5, acc[i] / 5, prec_against[i] / 5, prec_favor[i] / 5, prec_av[i] / 5, prec_macro[i] / 5, rec_against[i] / 5, rec_favor[i] / 5, rec_av[i] / 5, rec_macro[i] / 5))

#     tweet_prob_pred[:20]

initial corpus: 1
Metrics on test on initial classifier
[0.75718016 0.33333333 0.43243243]
macro: 0.5076486408079098
micro: 0.6428571428571429
precision
[0.74742268 0.55       0.36363636]
macro: 0.553686348016245
micro: 0.6428571428571429
recall
[0.76719577 0.23913043 0.53333333]
macro: 0.5132198451039031
micro: 0.6428571428571429
accuracy: 0.6428571428571429
Training set original size: 522
Training set new size: 698
Training set new size: 898
Training set new size: 1098
Training set new size: 1298
Training set new size: 1498
Training set original size: 522
Training set new size: 695
Training set new size: 895
Training set new size: 1095
Training set new size: 1295
Training set new size: 1495
Training set original size: 522
Training set new size: 699
Training set new size: 899
Training set new size: 1099
Training set new size: 1299
Training set new size: 1499
Training set original size: 522
Training set new size: 696
Training set new size: 896
Training set new size: 1096
Training set n

In [None]:
 # Get precision and recall metrics