In [158]:
import sys
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
from swisscom import launch
import en_core_web_sm as english
from tqdm.notebook import tqdm
from string import punctuation
import warnings
warnings.filterwarnings("ignore")

In [159]:
sys.path.append("..")


data = pd.read_csv('./dataset.csv', dtype=str)
nlp = english.load()

In [160]:
X = data['text']
y = data['loc'].astype(str)

In [161]:
BACKGROUND_TAGS = ["valley", "hill", "mountain", "city", "sea", "bay", "beach", "forest", "field", "road", "urban", "rural", "highway", "modern building", "historical building", "ancient ruins", "tropics", "desert", "swamp", "lake", "outskirts", "luxury"]

In [162]:
def extract_keywords(nlp, sequence, special_tags : list = None):
    result = []

    pos_tag = ['PROPN','NOUN','ADJ']

    doc = nlp(sequence.lower())

    if special_tags:
        tags = [tag.lower() for tag in special_tags]
        for token in doc:
            if token.text in tags:
                result.append(token.text)

    for chunk in doc.noun_chunks:
        final_chunk = ""
        for token in chunk:
            if token.pos_ in pos_tag:
                final_chunk =  final_chunk + token.text + " "
        if final_chunk:
            result.append(final_chunk.strip())


    for token in doc:
        if token.text in nlp.Defaults.stop_words or token.text in punctuation:
            continue
        if token.pos_ in pos_tag:
            result.append(token.text)

    return list(set(result))

In [163]:
def predict_spacy(nlp, X):
    doc = list(map(lambda text: extract_keywords(nlp, text, BACKGROUND_TAGS), X))
    return doc

In [164]:
class KeyPhraseExtractionModel(BaseEstimator):
    def __init__(self, embedding_model='roberta-large-nli-stsb-mean-tokens', beta=0.8, alias_threshold=0.7, n=1):
        self.X = None
        self.y = None
        self.beta = beta
        self.n = n
        self.alias_threshold = alias_threshold
        self.embedding_distributor = launch.load_local_embedding_distributor(embedding_model)
        self.pos_tagger = launch.load_local_corenlp_pos_tagger()


    def fit(self, X, y):
        self.X = X
        self.y = y


    def predict(self, X):
        return list(map(lambda text: launch.extract_keyphrases(self.embedding_distributor, self.pos_tagger, text, self.n, 'en', self.beta, self.alias_threshold)[0][0], X))

In [165]:
models = [
    'sent2vec',
    'roberta-large-nli-stsb-mean-tokens',
    'roberta-base-nli-stsb-mean-tokens',
    'distilbert-base-nli-stsb-mean-tokens',
    'paraphrase-distilroberta-base-v1',
    'distilroberta-base-msmarco-v2',
    'LaBSE',
]

model_map_name = {
    'sent2vec': 'EmbedRank with sent2vec',
    'roberta-large-nli-stsb-mean-tokens': 'RoBERTa Large ',
    'roberta-base-nli-stsb-mean-tokens': 'RoBERTa Base',
    'distilbert-base-nli-stsb-mean-tokens': 'DistilBERT Base',
    'paraphrase-distilroberta-base-v1': 'DistilBERT Paraphrase v1',
    'distilroberta-base-msmarco-v2': 'DistilRoBERTa Base msmarco',
    'LaBSE': 'LaBSE',
}

In [166]:
model_results = {}

In [167]:
def spacy_collect(y_true, y_pred):
    result = []
    for y_t, y_p in list(zip(y_true, y_pred)):
        if y_t in y_p:
            result.append(y_t)
        else:
            result.append(y_p[0])
    return result

In [168]:
spacy_score = accuracy_score(y_true=y, y_pred=spacy_collect(y, predict_spacy(nlp, X)))
model_results['Spacy BERT'] = '{}%'.format(int(100. * spacy_score))

for embedding_model in tqdm(list(models)):
    model = KeyPhraseExtractionModel(embedding_model)
    model.fit(X, y)
    score = accuracy_score(y_true=y, y_pred=model.predict(X))
    model_results[model_map_name[embedding_model]] = '{}%'.format(int(100. * score))

  0%|          | 0/7 [00:00<?, ?it/s]

In [169]:
data = {
    'Model': model_results.keys(),
    'Accuracy': model_results.values()
}

df = pd.DataFrame(data, columns=['Model', 'Accuracy'])
df.style.hide_index()

Model,Accuracy
Spacy BERT,64%
EmbedRank with sent2vec,78%
RoBERTa Large,47%
RoBERTa Base,44%
DistilBERT Base,46%
DistilBERT Paraphrase v1,60%
DistilRoBERTa Base msmarco,64%
LaBSE,57%
