In [1]:
import os

# Install sentence-transformers (provides the SentenceTransformer embedding models)
!pip install -U sentence-transformers

# Install Flair (provides the Sentence / SequenceTagger classes used for NER)
!pip install flair

# Install BLEURT from a fresh clone of its GitHub repository
!git clone https://github.com/google-research/bleurt.git
!pip install bleurt/

# Download the NLTK 'punkt' tokenizer data
# (presumably what nltk.sent_tokenize / nltk.word_tokenize rely on -- confirm)
import nltk
nltk.download('punkt')


Collecting sentence-transformers
  Downloading sentence-transformers-2.0.0.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.4 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 25.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 42.2 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.0.15-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 2.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 45.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |███

True

In [3]:
import os
import sys
import nltk
import json
import csv 
import operator
import logging
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from sklearn.metrics.pairwise import cosine_similarity
from bleurt import score
from flair.data import Sentence
from flair.models import SequenceTagger

# Silence the 'tensorflow' Python logger. Two alternative ways of reducing
# TensorFlow's verbosity are kept below (disabled) for reference.
# tf.get_logger().setLevel(logging.ERROR)
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
logging.getLogger('tensorflow').disabled = True

# Mount Google Drive at /content/gdrive; main() reads its input files from there.
drive.mount('/content/gdrive')

# Flair NER taggers, one per report language (Dutch, German, English).
# They stay None unless the SequenceTagger.load(...) lines inside the string
# block below are enabled; that block is a bare string literal, so it has no effect.
tagger_NL = None
tagger_DE = None
tagger_EN = None

'''
tagger_NL = SequenceTagger.load("flair/ner-dutch")
tagger_DE = SequenceTagger.load("flair/ner-german")
tagger_EN = SequenceTagger.load("flair/ner-english")
'''

def get_similarity(sum, rep):
    """Return the pairwise cosine similarity between two sets of embeddings.

    Args:
        sum: Embeddings that form the rows of the result. The name shadows
            the builtin ``sum`` but is kept so existing callers keep working.
        rep: Embeddings that form the columns of the result.

    Returns:
        The value of ``cosine_similarity(sum, rep)``; entry ``[i][j]`` compares
        ``sum[i]`` with ``rep[j]``.

    Note:
        The parameter names do not reflect how ``main`` uses this function:
        it passes the report embeddings as ``sum`` and the summary (or
        report) embeddings as ``rep``.
    """
    # calculate the similarity between every embedding pair
    return cosine_similarity(sum, rep)

def get_similarity_bleurt(rep_sentence, sum_sentences_index, df_sum_text):
    """Score one report sentence against selected summary sentences with BLEURT.

    Args:
        rep_sentence: The report sentence, used as the BLEURT reference.
        sum_sentences_index: Iterable of keys into ``df_sum_text`` selecting
            the summary sentences to use as candidates.
        df_sum_text: Indexable collection of summary sentences.

    Returns:
        A list with one BLEURT score per entry of ``sum_sentences_index``, in
        iteration order. Empty when no indices are given.
    """
    indices = list(sum_sentences_index)
    if not indices:
        # Nothing to score: avoid building the BLEURT ops at all.
        return []

    # The scoring ops and the reference tensor are identical for every
    # candidate, so build them once instead of once per loop iteration
    # (create_bleurt_ops() presumably loads the model each time it is called).
    bleurt_ops = score.create_bleurt_ops()
    references = tf.constant([rep_sentence])

    sim_mat = []
    for sum_sent_index in indices:
        candidates = tf.constant([df_sum_text[sum_sent_index]])

        # Perform scoring of this single reference/candidate pair
        bleurt_out = bleurt_ops(references=references, candidates=candidates)
        assert bleurt_out["predictions"].shape == (1,)

        # Collect the scalar score
        sim_mat.append(bleurt_out["predictions"].numpy()[0])

    return sim_mat


def get_similarity_bleurt_string(sentence_1, sentence_2):
    """Return the BLEURT score of ``sentence_2`` (candidate) against ``sentence_1`` (reference).

    New BLEURT scoring ops are created on every call.
    """
    reference_tensor = tf.constant([sentence_1])
    candidate_tensor = tf.constant([sentence_2])

    # Build the scorer and run it on the single sentence pair
    scorer = score.create_bleurt_ops()
    predictions = scorer(references=reference_tensor, candidates=candidate_tensor)["predictions"]
    assert predictions.shape == (1,)

    # Unwrap the only prediction into a scalar
    return predictions.numpy()[0]


def get_similarity_embeddings_string(sentence_1, sentence_2, embedding_model):
    """Return the cosine similarity between the embeddings of two sentences.

    Each sentence is wrapped in a one-row ``text`` column, encoded with
    ``embedding_model.encode`` and the single entry of the resulting
    similarity matrix is returned.
    """
    text_columns = [
        pd.DataFrame({"text": [text]}).text
        for text in (sentence_1, sentence_2)
    ]
    first_embedding, second_embedding = (
        embedding_model.encode(column) for column in text_columns
    )

    # The matrix is 1x1 here; return its only entry
    similarity_matrix = cosine_similarity(first_embedding, second_embedding)
    return similarity_matrix[0][0]


def get_entities(json_path):
    """Build a lookup of player names from a match event file.

    Args:
        json_path: Path to a JSON file containing an iterable of events.
            Events that have a ``'primaryPlayer'`` entry contribute a player;
            that entry must contain an ``'id'``.

    Returns:
        dict mapping each player id to a ``(first_name, last_name)`` tuple,
        taken from ``'displayFirstName'`` / ``'displayLastName'``. A name part
        is ``None`` when no event provided it. Once a name part is known,
        later events for the same player do not overwrite it.

    Raises:
        KeyError: If a ``'primaryPlayer'`` entry has no ``'id'``.
    """
    # Read explicitly as UTF-8 so names with non-ASCII characters do not
    # depend on the platform's default encoding.
    with open(json_path, encoding="utf-8") as f:
        match_data = json.load(f)

    entity_dic = {}

    for event in match_data:
        if 'primaryPlayer' not in event:
            continue

        # Get player and ID
        player = event['primaryPlayer']
        player_id = player['id']

        # Start from what earlier events already told us about this player
        first_name, last_name = entity_dic.get(player_id, (None, None))

        # Only fill in the parts that are still unknown
        if first_name is None and 'displayFirstName' in player:
            first_name = player['displayFirstName']
        if last_name is None and 'displayLastName' in player:
            last_name = player['displayLastName']

        # Create or refresh the entity
        entity_dic[player_id] = (first_name, last_name)

    return entity_dic

def get_sentence_entities(sentence, entity_dic):
    """Return the ids of the entities whose name occurs in a sentence.

    Args:
        sentence: Text to search; it is tokenized with ``nltk.word_tokenize``.
        entity_dic: Mapping of entity id to a tuple of name parts (parts may
            be ``None``), as produced by ``get_entities``.

    Returns:
        List of entity ids, in ``entity_dic`` order, for which at least one
        name part equals a token of the sentence (case-insensitive). Each id
        appears at most once, even when several name parts match.

    Note:
        A name part is compared against single tokens, so a part that the
        tokenizer splits into several tokens cannot match.
    """
    # A set gives constant-time membership tests for every name part
    tokens = {token.lower() for token in nltk.word_tokenize(sentence)}

    entity_list = []
    for entity_id, entity_tuple in entity_dic.items():
        mentioned = any(
            entity_part is not None and entity_part.lower() in tokens
            for entity_part in entity_tuple
        )
        if mentioned:
            entity_list.append(entity_id)

    return entity_list

def get_sentence_time(sentence, ):
    """Extract the minute mentioned in a sentence, if any.

    The sentence is tokenized with ``nltk.word_tokenize`` and lower-cased.
    For every token that contains ``"min"``, the token itself and up to two
    tokens on either side are inspected for a number.

    Args:
        sentence: Text to inspect.

    Returns:
        The last decimal number found in such a window as an ``int``, or
        ``None`` when no token contains ``"min"`` or no number is nearby.

    Note:
        ``"min"`` is matched as a substring, so unrelated words containing it
        also open a search window.
    """
    tokens = [token.lower() for token in nltk.word_tokenize(sentence)]
    sentence_minute = None

    for token_index, token in enumerate(tokens):
        if "min" not in token:
            continue

        # Window of two tokens before and two after the match. The slice end
        # is exclusive and slicing clamps it to the list length, so the last
        # token of the sentence is included.
        window_start = max(token_index - 2, 0)
        window_end = token_index + 3

        for current_token in tokens[window_start:window_end]:
            # isdecimal() only accepts characters int() can parse
            # (unlike isnumeric(), which also accepts e.g. fractions).
            if current_token.isdecimal():
                sentence_minute = int(current_token)

    return sentence_minute


def get_bleurt_score_for_set(rep_sent, sum_sent_index_set, df_sum_text):
    """Sum the BLEURT scores of one report sentence against a set of summary sentences.

    Args:
        rep_sent: The report sentence, used as the BLEURT reference.
        sum_sent_index_set: Iterable of keys into ``df_sum_text`` selecting
            the summary sentences to use as candidates.
        df_sum_text: Indexable collection of summary sentences.

    Returns:
        The sum of the individual BLEURT scores; ``0`` when the set is empty.
    """
    total_score = 0

    indices = list(sum_sent_index_set)
    if not indices:
        # Nothing to score: avoid building the BLEURT ops at all.
        return total_score

    # The scoring ops and the reference tensor are identical for every
    # candidate, so build them once instead of once per loop iteration
    # (create_bleurt_ops() presumably loads the model each time it is called).
    bleurt_ops = score.create_bleurt_ops()
    references = tf.constant([rep_sent])

    for sum_sent_index in indices:
        candidates = tf.constant([df_sum_text[sum_sent_index]])

        # Perform scoring of this single reference/candidate pair
        bleurt_out = bleurt_ops(references=references, candidates=candidates)
        assert bleurt_out["predictions"].shape == (1,)

        # Add the scalar score to the running total
        total_score += bleurt_out["predictions"].numpy()[0]

    return total_score


def get_NER_tags(sentence_string, tagger_model):
    """Run an NER tagger over a sentence and return the entities it found.

    The text is wrapped in a Flair ``Sentence``, annotated in place by
    ``tagger_model.predict`` and the ``'entities'`` entry of the sentence's
    ``'ner'`` dictionary is returned as a new list.
    """
    flair_sentence = Sentence(sentence_string)
    tagger_model.predict(flair_sentence)

    # Copy the entities into a fresh list
    return list(flair_sentence.to_dict('ner')['entities'])


def check_NER_tags(ner_entity_list):
    """Tell whether any entity's first label is a person (``'PER'``).

    Only the first label of each entity is considered. Returns ``False`` for
    an empty list.
    """
    # Evaluate every entity (no short-circuit), then combine the flags
    first_label_is_person = [
        tagged_entity['labels'][0].value == 'PER'
        for tagged_entity in ner_entity_list
    ]
    return any(first_label_is_person)


def save_annotations(annotation_file, annotation_dict):
    """Write annotation pairs to a tab-separated UTF-8 text file.

    One line is written per dictionary entry, with the columns:
        0 - the dictionary key
        1 - values[0] (first report id)
        2 - values[1] (first report sentence, wrapped in double quotes)
        3 - values[2] (second report id)
        4 - values[3] (second report sentence, wrapped in double quotes)

    The file is overwritten if it already exists.
    """
    row_template = '{}\t{}\t"{}"\t{}\t"{}"\n'

    # Format every row before touching the file
    rows = [
        row_template.format(ann_key, fields[0], fields[1], fields[2], fields[3])
        for ann_key, fields in annotation_dict.items()
    ]

    with open(annotation_file, "w", encoding='utf-8') as out_file:
        out_file.write("".join(rows))


def main():
    """Find aligned sentence pairs across the reports of one match and save them.

    For the match directory whose name equals ``match_id_input``:
      1. Load the player entities (JSON) and the summary sentences (CSV) and
         embed the summary sentences.
      2. For every sentence of every report, collect the summary sentences
         that are among its ``n_amount`` most cosine-similar ones, share a
         player entity with it, and whose minute does not exceed the minute
         mentioned in the sentence (``default_minutes_passed`` if none).
      3. Pair sentences from different reports whose collected sets overlap
         in more than ``summary_sentence_match_minimum`` summary sentences,
         score each pair (BLEURT or embedding cosine similarity) and keep
         the ``k_amount`` best partners per sentence.
      4. Write the kept pairs to ``<match_id>_pred_<scoring>_<k_amount>.tsv``
         in the current working directory.
    """
    # Model parameters, feel free to change if required.
    summary_sentence_amount = 10
    summary_sentence_match_minimum = 3

    # Number of most similar summary sentences inspected per report sentence
    n_amount = 10

    # Number of best-scoring partner sentences kept per report sentence.
    # Defined up front because the output file name needs it even when no
    # report sentence was collected.
    k_amount = 5

    # Minute assumed for report sentences that mention no time
    default_minutes_passed = 120

    # When True, report sentences are not filtered on NER person tags.
    # Setting this to False requires the module-level Flair taggers to be loaded.
    NER_disabled = True

    # Whether to use embeddings or BLEURT for similarity scoring
    use_embeddings_for_scoring = False

    # Change this to the match id that you want to extract alignment pairs from
    match_id_input = "2002833"

    # Load the sentence-embedding model.
    # NOTE(review): this was labelled "multilingual", yet Dutch and German
    # reports are embedded with it too -- confirm this checkpoint covers them.
    from sentence_transformers import SentenceTransformer
    embedding_model = SentenceTransformer('paraphrase-distilroberta-base-v1')

    # Load data from Google Drive (Change these paths to where your match report dataset and structured data files are)
    txt_directory = "/content/gdrive/MyDrive/Master_Scriptie/colab/match_reports_dataset"
    sum_directory = "/content/gdrive/MyDrive/Master_Scriptie/colab/events_all"
    for dirname in os.listdir(txt_directory):
        match_id = dirname
        if match_id != match_id_input:
            continue

        print()
        print("Match ID:", match_id)

        # get file paths
        txt_file_dir = os.path.join(txt_directory, dirname)
        json_file = os.path.join(sum_directory, match_id + ".json")
        csv_file = os.path.join(sum_directory, match_id + "_summary.csv")

        # retrieve entities and ID's
        entity_dic = get_entities(json_file)

        # get dataframe
        df_sum = pd.read_csv(csv_file)

        # get entities in every summary sentence
        df_sum["entity_list"] = [
            get_sentence_entities(sentence, entity_dic)
            for sentence in df_sum.autoText
        ]
        df_sum.rename(columns = {'autoText':'text'}, inplace = True)

        # get summary embeddings
        summary_embeddings = embedding_model.encode(df_sum.text)

        # Save each report sentence with the sum_sent_index
        rep_sent_list = []

        # Go over each report for a match
        all_report_enabled = True
        for report_name in os.listdir(txt_file_dir):
            report_path = os.path.join(txt_file_dir, report_name)

            # Pick the tagger for the report language (English by default)
            tagger_model = tagger_EN
            if "NL-" in report_path:
                tagger_model = tagger_NL
            if "DE-" in report_path:
                tagger_model = tagger_DE

            if not ("EN-" in report_path or all_report_enabled):
                continue

            print("Report Name:", report_name)

            # Get sentences
            with open(report_path, "r", encoding="UTF-8") as f:
                report_string = f.read()
            report_sentences = nltk.sent_tokenize(report_string)
            df_rep = pd.DataFrame({"text": report_sentences})

            # Get embeddings
            report_embeddings = embedding_model.encode(df_rep.text)

            # Get embedding similarity (rows: report sentences, columns: summary sentences)
            sim_mat = get_similarity(report_embeddings, summary_embeddings)

            # Go over every sentence in report
            for rep_sent_index, rep_sent_row in enumerate(sim_mat):
                # Get report sentence and the minute it mentions, if any
                rep_sent = df_rep.text[rep_sent_index]
                rep_sent_minute = get_sentence_time(rep_sent)
                minutes_passed = default_minutes_passed
                if rep_sent_minute is not None:
                    minutes_passed = rep_sent_minute

                # set summary sentence index set
                rep_sent_sum_sent_index_set = set()

                # check if the sentence has any NER person tags
                ner_entity_list = []
                if not NER_disabled:
                    ner_entity_list = get_NER_tags(rep_sent, tagger_model)

                if check_NER_tags(ner_entity_list) or NER_disabled:
                    # Check for entities
                    rep_entity_list = get_sentence_entities(rep_sent, entity_dic)

                    # Get highest indices for a report sentence, most similar first
                    highest_cos_score_list = list(reversed(sorted(range(len(rep_sent_row)), key=lambda i: rep_sent_row[i])[-n_amount:]))

                    # Get entities for summary sentence and match accordingly
                    for index in highest_cos_score_list:
                        sum_minute = df_sum.minute[index]

                        # Entities of the summary sentence were computed once above
                        sum_entity_list = df_sum["entity_list"][index]

                        # Keep the summary sentence when it shares an entity with the
                        # report sentence and does not lie after the report's minute
                        for rep_entity in rep_entity_list:
                            if (rep_entity in sum_entity_list) and (sum_minute <= minutes_passed):
                                rep_sent_sum_sent_index_set.add(index)

                        # Stop adding new indices once the configured amount has been reached
                        if len(rep_sent_sum_sent_index_set) >= summary_sentence_amount:
                            break

                # Save report sentence with sum_sentences_index
                rep_sent_tuple = (rep_sent, rep_sent_sum_sent_index_set, report_name, rep_sent_index)
                rep_sent_list.append(rep_sent_tuple)

        # Get report sentences with most similar sum sentences
        matched_report_sentences = {}

        for rep_sent_tuple in rep_sent_list:
            # Extract sentence information
            rep_sent_string = rep_sent_tuple[0]
            rep_sent_sum_set = rep_sent_tuple[1]
            rep_sent_report_name = rep_sent_tuple[2]

            rep_sent_score_list = []

            # Compare report sentence to the sentences of all other reports
            for rep_sent_tuple_compare in rep_sent_list:
                rep_sent_compare_string = rep_sent_tuple_compare[0]
                rep_sent_compare_sum_set = rep_sent_tuple_compare[1]
                rep_sent_compare_report_name = rep_sent_tuple_compare[2]

                if rep_sent_report_name == rep_sent_compare_report_name:
                    continue

                # Only allow sentence pairs that share enough summary sentences.
                # NOTE(review): this requires strictly MORE than the configured
                # "minimum"; confirm whether ">=" was intended.
                set_intersection = rep_sent_sum_set & rep_sent_compare_sum_set
                if len(set_intersection) > summary_sentence_match_minimum:
                    # score the sentence pair to allow comparison with other matches
                    if use_embeddings_for_scoring:
                        total_score = get_similarity_embeddings_string(rep_sent_string, rep_sent_compare_string, embedding_model)
                    else:
                        total_score = get_similarity_bleurt_string(rep_sent_string, rep_sent_compare_string)

                    ann_id = "{}_{}_{}_{}".format(rep_sent_tuple[2], rep_sent_tuple[3], rep_sent_tuple_compare[2], rep_sent_tuple_compare[3])
                    ann_report_1 = rep_sent_tuple[2]
                    ann_sent_1 = rep_sent_tuple[0]
                    ann_report_2 = rep_sent_tuple_compare[2]
                    ann_sent_2 = rep_sent_tuple_compare[0]
                    matched_report_sentence = [ann_report_1, ann_sent_1, ann_report_2, ann_sent_2]

                    rep_sent_score_list.append((total_score, matched_report_sentence, ann_id))

            # Sort list with scores, best first (sort + reverse keeps the
            # original ordering of equal scores)
            rep_sent_score_list.sort(key=lambda x:x[0])
            rep_sent_score_list.reverse()

            # Add top matches
            for score_tuple in rep_sent_score_list[:k_amount]:
                matched_report_sentence = score_tuple[1]
                ann_id = score_tuple[2]
                matched_report_sentences[ann_id] = matched_report_sentence

        # Save matched report sentences
        scoring_name_list = ["bleurt", "embeddings"]
        pred_file_name = "{}_pred_{}_{}.tsv".format(match_id, scoring_name_list[use_embeddings_for_scoring], k_amount)
        save_annotations(pred_file_name, matched_report_sentences)


# Run the alignment pipeline when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).

Match ID: 2002833
Report Name: EN-2.txt
Report Name: NL-2.txt
Report Name: EN-3.txt
Report Name: NL-1.txt
Report Name: NL-3.txt
Report Name: DE-1.txt
Report Name: DE-3.txt
