In [16]:
import pandas as pd
import numpy as np
import os
import fasttext
import json
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

PATH_TO_DATASETS = "../datasets"
PATH_TO_POLEMO_CONLL = "../datasets/polemo/dataset_conll"

In [17]:
files = {
    "train": os.path.join(PATH_TO_POLEMO_CONLL,"all.sentence.train_processed.csv"),
    "dev": os.path.join(PATH_TO_POLEMO_CONLL,"all.sentence.dev_processed.csv"),
    "test": os.path.join(PATH_TO_POLEMO_CONLL,"all.sentence.test_processed.csv"),
    "annotation": os.path.join(PATH_TO_DATASETS, "sentiment_data", "political_tweets_annotations.csv")
}

In [18]:
with open(os.path.join("..", "datasets", "emojis.json"), encoding="utf-8") as f:
    emoji_mapping = json.load(f)

emoji_mapping_items = emoji_mapping.items()
def emoji2text_tweet(tweet: str) -> str:
    text = tweet
    for emoji, emoji_text in emoji_mapping_items:
        text = text.replace(emoji, f"<{emoji_text}>")
    return text

In [19]:
def remove_quotes_from_saved_file(txt_path: str):
    text = ""
    with open(txt_path, "r", encoding="utf-8") as f:
        for line in f:
            if line[0] == "\"" and line[-2] == "\"":
                line = line[1:]
                line = line[:-2] + "\n"
            text += line

    os.remove(txt_path)

    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)

In [20]:
data_for_fasttext = {}
for dataset, file_path in files.items():
    df = pd.read_csv(file_path)
    df = df[['label','text']]
    df['label'] = "__label__" +  df['label']
    df['text'] = df['text'].apply(emoji2text_tweet)
    df['text'] = df['text'].apply(lambda string: string.lower())
    df['text'] = df['text'].apply(lambda string: string.replace("#",""))
    df['row'] = df['label'] + " " + df['text']
    path = os.path.join(PATH_TO_DATASETS, "sentiment_data", f"{dataset}_data.txt")
    df['row'].to_csv(path, index=False, header=False)
    remove_quotes_from_saved_file(path)
    data_for_fasttext[dataset] = {}
    data_for_fasttext[dataset]["labels"] = list(df['label'].values)
    data_for_fasttext[dataset]["texts"] = list(df['text'].values)
    data_for_fasttext[dataset]["dataframe"] = df

In [21]:
tweets_data = data_for_fasttext['annotation']
texts_train_val, texts_test, labels_train_val, labels_test = train_test_split(tweets_data['texts'], tweets_data['labels'], test_size=0.1, random_state=42)
texts_train, texts_val, labels_train, labels_val = train_test_split(texts_train_val, labels_train_val, test_size=1/9, random_state=42)

train_polemo = data_for_fasttext['train']['dataframe'][["text", "label"]]
val_polemo = data_for_fasttext['dev']['dataframe'][["text", "label"]]
test_polemo = data_for_fasttext['test']['dataframe'][["text", "label"]]

train_tweets = pd.DataFrame(data={"text": texts_train, "label": labels_train})
val_tweets = pd.DataFrame(data={"text": texts_val, "label": labels_val})
test_tweets = pd.DataFrame(data={"text": texts_test, "label": labels_test})

train = train_polemo.append(train_tweets)
val = val_polemo.append(val_tweets)
test = test_polemo.append(test_tweets)

train['row'] = train['label'] + " " + train['text']
val['row'] = val['label'] + " " + val['text']
test['row'] = test['label'] + " " + test['text']

train['row'].to_csv(os.path.join(PATH_TO_DATASETS, "sentiment_data", f"full_train_data.txt"), index=False, header=False)
val['row'].to_csv(os.path.join(PATH_TO_DATASETS, "sentiment_data", f"full_val_data.txt"), index=False, header=False)
test['row'].to_csv(os.path.join(PATH_TO_DATASETS, "sentiment_data", f"full_test_data.txt"), index=False, header=False)

In [22]:
print("Training classifier only on PolEmo training data...")
model =fasttext.train_supervised(input=os.path.join(PATH_TO_DATASETS, "sentiment_data", "train_data.txt"), wordNgrams=1, neg=5,dim=300, lr=0.005, epoch=500, loss="ns", verbose=1, label_prefix='__label__')
test_results = model.predict(data_for_fasttext["test"]["texts"])
annotation_results = model.predict(data_for_fasttext["annotation"]["texts"])

print(f"F1-score for all tweets: {f1_score(annotation_results[0],data_for_fasttext['annotation']['labels'],average='macro')}")
print(f"F1-score for test set of PolEmo: {f1_score(test_results[0],data_for_fasttext['test']['labels'],average='macro')}")
print()

print("Training classifier on PolEmo training data and 80% of political tweets")
model =fasttext.train_supervised(input=os.path.join(PATH_TO_DATASETS, "sentiment_data", "full_train_data.txt"), wordNgrams=1, neg=5,dim=300, lr=0.005, epoch=500, loss="ns", verbose=1, label_prefix='__label__')
test_results = model.predict(list(test_polemo["text"].values))
annotation_results = model.predict(list(test_tweets['text'].values))

print(f"F1-score for test set of political tweets: {f1_score(annotation_results[0],list(test_tweets['label'].values),average='macro')}")
print(f"F1-score for test set of PolEmo: {f1_score(test_results[0],list(test_polemo['label'].values),average='macro')}")
print()

Training classifier only on PolEmo training data...
F1-score for all tweets: 0.29466757589906445
F1-score for test set of PolEmo: 0.5874823306117167

Training classifier on PolEmo training data and 80% of political tweets
F1-score for test set of political tweets: 0.3812850008377031
F1-score for test set of PolEmo: 0.5610325791091721



In [None]:
dims = []
ngrams = []
losses = []
negs = []
tweets_f1_scores = []
dev_f1_scores = []
for dim in [300, 500]:
    for ngram in [1,2,3,4,5]:
        for method in ["hs", "ns", "softmax"]:
            for neg in [5,10,15,20]:
                model =fasttext.train_supervised(input=os.path.join(PATH_TO_DATASETS, "sentiment_data", "full_train_data.txt"), wordNgrams=ngram, neg=neg,dim=dim, lr=0.005, epoch=500, loss=method, verbose=1, label_prefix='__label__')
                dev_results = model.predict(list(val_polemo["text"].values))
                annotation_results = model.predict(list(val_tweets["text"].values))
                dims.append(dim)
                ngrams.append(ngram)
                losses.append(method)
                negs.append(neg)
                tweets_f1_scores.append(f1_score(annotation_results[0],list(val_tweets['label'].values),average='macro'))
                dev_f1_scores.append(f1_score(dev_results[0],list(val_polemo['label'].values),average='macro'))
                print(f"Loss method - {method}")
                print(f"Dim - {dim}")
                print(f"Ngram - {ngram}")
                print(f"Negative samples - {neg}")
                print(f"F1-score for all tweets: {tweets_f1_scores[-1]}")
                print(f"F1-score for dev set: {dev_f1_scores[-1]}")
                print()

results = pd.DataFrame(data={"dim": dims,
                             "ngram" : ngrams,
                             "loss": losses,
                             "neg": negs,
                             "tweets_f1_score": tweets_f1_scores,
                             "dev_f1_score": dev_f1_scores})

results.to_csv(os.path.join("..","reports","sentiment_classification_results.csv"))

Loss method - hs
Dim - 300
Ngram - 1
Negative samples - 5
F1-score for all tweets: 0.34984710896960713
F1-score for dev set: 0.5623942811063802

Loss method - hs
Dim - 300
Ngram - 1
Negative samples - 10
F1-score for all tweets: 0.34984710896960713
F1-score for dev set: 0.5629298826329951

Loss method - hs
Dim - 300
Ngram - 1
Negative samples - 15
F1-score for all tweets: 0.34984710896960713
F1-score for dev set: 0.562571859518963

Loss method - hs
Dim - 300
Ngram - 1
Negative samples - 20
F1-score for all tweets: 0.3500344840528849
F1-score for dev set: 0.5629021398226923

Loss method - ns
Dim - 300
Ngram - 1
Negative samples - 5
F1-score for all tweets: 0.36152677445247106
F1-score for dev set: 0.5582780004991592

Loss method - ns
Dim - 300
Ngram - 1
Negative samples - 10
F1-score for all tweets: 0.3601970461953892
F1-score for dev set: 0.5592922691828756

Loss method - ns
Dim - 300
Ngram - 1
Negative samples - 15
F1-score for all tweets: 0.367239299716516
F1-score for dev set: 0.556

In [25]:
lrs = []
epochs = []
tweets_f1_scores = []
dev_f1_scores = []
for lr in [0.001, 0.005, 0.0001, 0.0005]:
    for epoch in [100,250,500,1000]:
        model =fasttext.train_supervised(input=os.path.join(PATH_TO_DATASETS, "sentiment_data", "full_train_data.txt"), wordNgrams=5, neg=5,dim=300, lr=lr, epoch=epoch, loss="ns", verbose=1, label_prefix='__label__')
        dev_results = model.predict(list(val_polemo["text"].values))
        annotation_results = model.predict(list(val_tweets["text"].values))
        lrs.append(lr)
        epochs.append(epoch)
        tweets_f1_scores.append(f1_score(annotation_results[0],list(val_tweets['label'].values),average='macro'))
        dev_f1_scores.append(f1_score(dev_results[0],list(val_polemo['label'].values),average='macro'))
        print(f"Learning rate - {lr}")
        print(f"Epochs - {epoch}")
        print(f"F1-score for all tweets: {tweets_f1_scores[-1]}")
        print(f"F1-score for dev set: {dev_f1_scores[-1]}")
        print()

Learning rate - 0.001
Epochs - 100
F1-score for all tweets: 0.07723577235772357
F1-score for dev set: 0.13483288855000636

Learning rate - 0.001
Epochs - 250
F1-score for all tweets: 0.28369486793399834
F1-score for dev set: 0.28733619777780717

Learning rate - 0.001
Epochs - 500
F1-score for all tweets: 0.3450963718820861
F1-score for dev set: 0.4875128174787702

Learning rate - 0.001
Epochs - 1000
F1-score for all tweets: 0.44096459096459095
F1-score for dev set: 0.5607791230695027

Learning rate - 0.005
Epochs - 100
F1-score for all tweets: 0.3450963718820861
F1-score for dev set: 0.4872309231673328

Learning rate - 0.005
Epochs - 250
F1-score for all tweets: 0.4589617898441427
F1-score for dev set: 0.5740606491264695

Learning rate - 0.005
Epochs - 500
F1-score for all tweets: 0.4989115474077759
F1-score for dev set: 0.5780589677968582

Learning rate - 0.005
Epochs - 1000
F1-score for all tweets: 0.4441432764630343
F1-score for dev set: 0.5730638765390956

Learning rate - 0.0001
Ep

In [28]:
lr_epoch_results = pd.DataFrame(data={"lr": lrs,
                             "epoch" : epochs,
                             "tweets_f1_score": tweets_f1_scores,
                             "dev_f1_score": dev_f1_scores})

lr_epoch_results.to_csv(os.path.join("..","reports","sentiment_classification_lr_epoch_results.csv"), index=False)

In [29]:
model = fasttext.train_supervised(input=os.path.join(PATH_TO_DATASETS, "sentiment_data", "full_train_data.txt"), wordNgrams=5, neg=5,dim=300, lr=0.005, epoch=500, loss="ns", verbose=1, label_prefix='__label__')
test_results = model.predict(list(test_polemo["text"].values))
annotation_results = model.predict(list(test_tweets["text"].values))


In [32]:
print(f"F1-score for all tweets: {f1_score(annotation_results[0],list(test_tweets['label'].values),average='macro')}")
print(f"F1-score for dev set: {f1_score(test_results[0],list(test_polemo['label'].values),average='macro')}")
print()

F1-score for all tweets: 0.42461047925843143
F1-score for dev set: 0.5924934125716439



In [33]:
print("Classification report for political tweets")
print(classification_report(list(test_tweets['label'].values), annotation_results[0]))

print("Classification report for polemo data")
print(classification_report(list(test_polemo['label'].values), test_results[0]))


Classification report for political tweets
                    precision    recall  f1-score   support

__label__ambiguous       0.00      0.00      0.00         5
 __label__negative       0.52      0.62      0.57        24
  __label__neutral       0.49      0.69      0.57        26
 __label__positive       0.70      0.47      0.56        49

          accuracy                           0.54       104
         macro avg       0.43      0.45      0.42       104
      weighted avg       0.57      0.54      0.54       104

Classification report for polemo data
                    precision    recall  f1-score   support

__label__ambiguous       0.48      0.23      0.31       681
 __label__negative       0.63      0.84      0.72      2123
  __label__neutral       0.69      0.61      0.65      1419
 __label__positive       0.75      0.66      0.70      1522

          accuracy                           0.66      5745
         macro avg       0.64      0.58      0.59      5745
      weighted

In [43]:
test_texts = list(test_tweets["text"].values)
test_labels = list(test_tweets["label"].values)
print("Tweets which were incorrrectly predicted:")
for i in range(len(test_texts)):
    pred = model.predict([test_texts[i]])
    if test_labels[i] != pred[0][0][0]:
        print(f"Tweet text: {test_texts[i]}")
        print(f"\t True label: {test_labels[i]}, predicted label: {pred[0][0][0]}, probability: {pred[1][0][0]}")
        print()

Tweets which were incorrrectly predicted:
Tweet text: dzisiaj spotkanie śzgip z parlamentarzystami z woj. śl. dobra frekwencja. nikt z posłów nie bronił krytykowanych przez samorząd projektów.
	 True label: __label__positive, predicted label: __label__negative, probability: 0.14805719256401062

Tweet text: ✔️projekt gotowy! <dłoń z palcem wskazującym w prawo> dotrzymuję słowa. 18 maja 2019 r. podczas spotkania z emerytowanymi górnikami w wałbrzychu obiecałam przygotować nowelizację ustawy z dnia 17 grudnia 1998 r. o emeryturach i rentach z funduszu ubezpieczeń społecznych. sejmrp nowelizacja emeryci górnicy
	 True label: __label__positive, predicted label: __label__neutral, probability: 0.44553956389427185

Tweet text: premier jest już w pszczyna dotrzymujemysłowa
	 True label: __label__positive, predicted label: __label__neutral, probability: 0.11597072333097458

Tweet text: to jest skandal!
	 True label: __label__negative, predicted label: __label__ambiguous, probability: 0.001558761