In [63]:
import jsonlines
import json
import os
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

In [64]:
DATA_PATH = '../data'


def _read_jsonl(file_name):
    """Return every record of the JSON-lines file `file_name` under DATA_PATH as a list."""
    with jsonlines.open(os.path.join(DATA_PATH, file_name), 'r') as reader:
        return list(reader)


# One list of records per split, read eagerly so the files are closed right away.
train_data = _read_jsonl('train.jsonl')
val_data = _read_jsonl('validation.jsonl')
test_data = _read_jsonl('test.jsonl')

# Report how many records each split file contained.
for size_label, split_records in (
    ("Train data size:", train_data),
    ("Validation data size:", val_data),
    ("Test data size:", test_data),
):
    print(size_label, len(split_records))

Train data size: 189
Validation data size: 21
Test data size: 42


In [65]:
# Show which game ids occur in each split (train, validation, test — in that order).
for split_records in (train_data, val_data, test_data):
    print({record['game_id'] for record in split_records})

{1, 2, 3, 5, 6, 7, 8, 9, 10}
{11}
{12, 4}


In [66]:
def format_dataset(dataset):
    """Flatten game-level records into one record per message.

    Each item of `dataset` is expected to carry four parallel lists under the
    keys 'messages', 'sender_labels', 'receiver_labels' and 'game_score_delta'.
    The n-th entries of those lists are combined into a single dict with the
    keys 'message', 'sender_annotation', 'receiver_annotation' and
    'score_delta'.

    Returns a new list; the input records are not modified.
    """
    return [
        {
            'message': text,
            # The other lists are indexed by message position, so a list
            # shorter than 'messages' raises IndexError.
            'sender_annotation': game['sender_labels'][position],
            'receiver_annotation': game['receiver_labels'][position],
            'score_delta': game['game_score_delta'][position],
        }
        for game in dataset
        for position, text in enumerate(game['messages'])
    ]

In [67]:
# Flatten every split from game-level records to one record per message.
# The split names are rebound to the flattened lists, so this cell must not be
# run a second time on a live kernel: format_dataset reads the game-level
# 'messages' key, which the flattened records no longer have.
train_data, val_data, test_data = (
    format_dataset(train_data),
    format_dataset(val_data),
    format_dataset(test_data),
)

In [68]:
# Shared spaCy English pipeline object, called by spacy_tokenizer to split text
# into tokens. It is built directly from the language class; no trained model
# package is loaded in this notebook.
nlp = English()

def spacy_tokenizer(text):
    """Tokenize `text` with the module-level `nlp` pipeline for bag-of-words use.

    Tokens flagged as stop words or punctuation are dropped. Every remaining
    token that looks like a number is replaced by the placeholder "_NUM_";
    all other tokens are kept as their raw text.

    Returns a list of strings in document order.
    """
    return [
        "_NUM_" if tok.like_num else tok.text
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]

# Bag-of-words features. The vocabulary is learned from the training messages
# only; tokenization (stop-word/punctuation removal, number masking) is
# delegated to spacy_tokenizer.
vectorizer = CountVectorizer(
    tokenizer=spacy_tokenizer,
    stop_words=list(spacy.lang.en.stop_words.STOP_WORDS),
    strip_accents='unicode',
)
train_vectors = vectorizer.fit_transform([i['message'].lower() for i in train_data])

# Project the test messages onto the training vocabulary with the already
# fitted vectorizer. transform() only counts tokens against the learned
# vocabulary, so there is no need to build a second vectorizer and call
# fit_transform on the test split; the resulting count matrix is the same.
test_vectors = vectorizer.transform([i['message'].lower() for i in test_data])

# Alias kept so any later cell that still refers to `test_vectorizer` keeps working.
test_vectorizer = vectorizer



In [87]:
def _sender_labels(rows):
    """Encode each row's 'sender_annotation' as 0 when it equals False, otherwise 1."""
    # An equality comparison (not truthiness) is used on purpose: values such
    # as None or an empty string are therefore encoded as 1, not 0.
    return [int(row['sender_annotation'] != False) for row in rows]


# Class weights are balanced because the two label values are far from equally frequent.
log_model = LogisticRegression(max_iter=1000, class_weight='balanced')
log_model.fit(train_vectors, _sender_labels(train_data))

predictions = log_model.predict(test_vectors)
print(classification_report(_sender_labels(test_data), predictions, zero_division=0, digits=3))

              precision    recall  f1-score   support

           0      0.142     0.242     0.179       240
           1      0.922     0.860     0.890      2501

    accuracy                          0.806      2741
   macro avg      0.532     0.551     0.534      2741
weighted avg      0.854     0.806     0.828      2741

