### Importing packages

In [1]:
# Libraries

# Reading in files
import pandas as pd
import numpy as np

import spacy
import scipy.sparse as sp
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

from keybert import KeyBERT
from flair.embeddings import TransformerDocumentEmbeddings

import yake

import sys
import time

from tqdm import tqdm

tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'keybert'

In [None]:
# Widen the per-cell character limit so long abstracts are displayed in full
# rather than being cut off with an ellipsis.
pd.set_option("display.max_colwidth", 5000)

### Reading in data

In [None]:
# Load the BART prediction exports for both venues.
# NOTE(review): the ACL frame is named "2020" but is read from an "ACL_2022" file —
# confirm that this path is the intended one.
df_ACL_2020 = pd.read_csv("./Data/Pred/BART/ACL_2022_bart_pred_231122.csv")
df_EMNLP_2020 = pd.read_csv("./Data/Pred/BART/EMNLP_2020_bart_pred_231122.csv")

# Keep only the three columns used downstream and give them uniform names.
column_renames = {"Labels": "Label", "Paper Name": "Title", "abstract": "Abstract"}
df_ACL_2020 = df_ACL_2020[list(column_renames)].rename(columns=column_renames)
df_EMNLP_2020 = df_EMNLP_2020[list(column_renames)].rename(columns=column_renames)

# Drop the tracks that are excluded from the experiment.
# .copy() makes the filtered frame independent of its parent, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
excluded_labels = ["Student Research Workshop", "Theme", "NLP Applications", "System Demonstrations"]
df_ACL_2020 = df_ACL_2020.loc[~df_ACL_2020["Label"].isin(excluded_labels), :].copy()

df_ACL_2020

In [None]:
# Drop the excluded tracks from the EMNLP frame (same list as for ACL).
# .copy() detaches the result from its parent so that adding columns in later
# cells does not raise SettingWithCopyWarning.
df_EMNLP_2020 = df_EMNLP_2020.loc[
    ~df_EMNLP_2020["Label"].isin(["Student Research Workshop", "Theme", "NLP Applications", "System Demonstrations"]), :
].copy()

In [None]:
# Model input is "<title> <abstract>". A missing title or abstract is treated as
# an empty string: string concatenation with NaN would make the whole Text NaN,
# which the spaCy pipeline cannot process.
df_ACL_2020['Text'] = df_ACL_2020['Title'].fillna("") + " " + df_ACL_2020['Abstract'].fillna("")
df_EMNLP_2020['Text'] = df_EMNLP_2020['Title'].fillna("") + " " + df_EMNLP_2020['Abstract'].fillna("")

# Parser and NER are disabled when loading the pipeline; only lemmas and the
# stop-word flag are read from the tokens downstream.
nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
stemmer = SnowballStemmer("english")

def lemmatiser_stemmer_stopword(text, nlp, stemmer):
    """Remove stop words, lemmatise and stem ``text``.

    Args:
        text: Raw input string.
        nlp: Callable that turns ``text`` into an iterable of tokens exposing
            ``lemma_`` and ``is_stop`` (e.g. a loaded spaCy pipeline).
        stemmer: Object with a ``stem(word)`` method (e.g. a Snowball stemmer).

    Returns:
        The lower-cased, space-joined stems of all non-stop-word lemmas.
    """
    doc = nlp(text)

    # stem() is a single-word API: calling it on the whole joined sentence only
    # touches the tail of the string, so every lemma is stemmed on its own.
    stemmed_tokens = [stemmer.stem(token.lemma_) for token in doc if not token.is_stop]

    return " ".join(stemmed_tokens).lower()

In [None]:
# Pre-process every ACL "Text" entry (stop-word removal, lemmatisation, stemming);
# progress_apply shows a tqdm progress bar while the rows are processed.
df_ACL_2020['Lemm Stemmed Text'] = df_ACL_2020['Text'].progress_apply(lemmatiser_stemmer_stopword, args=(nlp, stemmer))

In [None]:
# Same pre-processing for the EMNLP frame; the keyword-matching cell later reads
# this "Lemm Stemmed Text" column from the test frame.
df_EMNLP_2020['Lemm Stemmed Text'] = df_EMNLP_2020['Text'].progress_apply(lemmatiser_stemmer_stopword, args=(nlp, stemmer))

### Keyword extraction

In [None]:
def keybert_keyword_extraction(input_text, kw_model, use_maxsum=True, use_mmr=False, ngram=3, topn=10, nr_cand=20, div=0.5):
    """Extract keyphrases with a KeyBERT-style model and join them with '#'.

    Args:
        input_text: Document to extract keyphrases from.
        kw_model: Object exposing ``extract_keywords`` (e.g. a ``KeyBERT`` instance).
        use_maxsum: Use max-sum selection; takes precedence over ``use_mmr``.
        use_mmr: Use maximal-marginal-relevance selection.
        ngram: Upper bound of the keyphrase n-gram range ``(1, ngram)``.
        topn: Number of keyphrases to request.
        nr_cand: Candidate pool size, forwarded only in max-sum mode.
        div: Diversity, forwarded only in MMR mode.

    Returns:
        The extracted keyphrases (first element of each result) joined by ``'#'``.
    """
    shared_kwargs = dict(keyphrase_ngram_range=(1, ngram), stop_words='english', top_n=topn)

    if use_maxsum:
        keywords_res = kw_model.extract_keywords(input_text, use_maxsum=True, nr_candidates=nr_cand,
                                                 **shared_kwargs)
    elif use_mmr:
        keywords_res = kw_model.extract_keywords(input_text, use_mmr=True, diversity=div,
                                                 **shared_kwargs)
    else:
        # Neither strategy requested: previously keywords_res was never bound and
        # the join below raised UnboundLocalError. Fall back to the model's
        # default selection with the shared arguments.
        keywords_res = kw_model.extract_keywords(input_text, **shared_kwargs)

    return "#".join(kw[0] for kw in keywords_res)

In [None]:
def yake_keyword_extraction(input_text, kw_model, useless=None):
    """Extract keyphrases with a YAKE-style extractor and join them with '#'.

    Args:
        input_text: Document to extract keyphrases from.
        kw_model: Object exposing ``extract_keywords(text)`` that returns
            sequences whose first element is the keyphrase.
        useless: Ignored. Kept (now optional) so existing calls that pass a
            second extra argument keep working.

    Returns:
        The extracted keyphrases joined by ``'#'``.
    """
    keywords_res = kw_model.extract_keywords(input_text)
    return "#".join(kw[0] for kw in keywords_res)

In [None]:
def format_str(keyword_str):
    """Turn a '#'-separated keyphrase string into space-separated single tokens.

    Spaces inside a keyphrase become underscores and the '#' separators become
    spaces, e.g. ``'kc kc#kw2#kc kc3'`` -> ``'kc_kc kw2 kc_kc3'``.
    """
    # Both substitutions are single-character swaps, so one translation pass
    # gives the same result as replacing " " first and "#" second.
    swap_table = str.maketrans({" ": "_", "#": " "})
    return keyword_str.translate(swap_table)

#format_str(df_ACL_2020["Keyword"][0])

### Different setting of keyword extraction

In [None]:
# KeyBERT run on the raw "Text" column with the SciBERT encoder.
# plm_name options tried: roberta-base, allenai/scibert_scivocab_uncased, allenai/specter
plm_name = "allenai/scibert_scivocab_uncased"
plm = TransformerDocumentEmbeddings(plm_name)
kw_model = KeyBERT(model=plm)
    
# Arguments forwarded positionally to keybert_keyword_extraction. With
# use_maxsum=False and use_mmr=True the MMR branch runs; nr_cand is only used by
# the max-sum branch, and the diversity keeps the function's default because it
# is not passed here.
use_maxsum = False
topn = 10
nr_cand = 20
use_mmr = True
ngram = 1
df_ACL_2020['Keyword'] = df_ACL_2020['Text'].progress_apply(keybert_keyword_extraction, args=(kw_model, use_maxsum, use_mmr, ngram, topn, nr_cand))

# Persist the frame; note that the 'Keyword' column is overwritten by the other
# extraction cells in this section when they are run.
df_ACL_2020.to_csv("./Data/ACL_2020_keywords10_mmr_unigram_nostem_261222.csv", index = False)

In [None]:
# KeyBERT run on the pre-processed "Lemm Stemmed Text" column with the SPECTER encoder.
# plm_name options tried: roberta-base, allenai/scibert_scivocab_uncased, allenai/specter
plm_name = "allenai/specter"
plm = TransformerDocumentEmbeddings(plm_name)
kw_model = KeyBERT(model=plm)
    
# Arguments forwarded positionally to keybert_keyword_extraction. With
# use_maxsum=False and use_mmr=True the MMR branch runs; nr_cand is only used by
# the max-sum branch, and the diversity keeps the function's default because it
# is not passed here.
use_maxsum = False
topn = 10
nr_cand = 20
use_mmr = True
ngram = 1
df_ACL_2020['Keyword'] = df_ACL_2020['Lemm Stemmed Text'].progress_apply(keybert_keyword_extraction, args=(kw_model, use_maxsum, use_mmr, ngram, topn, nr_cand))

# Persist the frame, including the freshly overwritten 'Keyword' column.
df_ACL_2020.to_csv("./Data/ACL_2020_keywords10_mmr_unigram_specter_261222.csv", index = False)

In [None]:
# YAKE configuration; each value is forwarded to yake.KeywordExtractor below.
dedup_func='seqm'
dedup_thred=0.7
ngram=2
wind_size=1
top_n=20
# Rebinds kw_model, replacing whatever model an earlier extraction cell bound to it.
kw_model = yake.KeywordExtractor(n=ngram, dedupLim=dedup_thred, dedupFunc=dedup_func, windowsSize=wind_size, top=top_n)
# Placeholder for the extra positional parameter in yake_keyword_extraction's
# signature; the function never reads it.
useless=True

df_ACL_2020['Keyword'] = df_ACL_2020['Lemm Stemmed Text'].progress_apply(yake_keyword_extraction, args=(kw_model, useless))

# Persist the frame; the classification section reads this same path back in.
df_ACL_2020.to_csv("./Data/ACL_2020_keywords_yake_030123.csv", index = False)

In [None]:
# Extract keywords for the EMNLP frame with whatever kw_model / useless are
# currently bound in the kernel (the YAKE cell above sets both). The result is
# kept in memory only; this cell does not write a CSV.
df_EMNLP_2020['Keyword'] = df_EMNLP_2020['Lemm Stemmed Text'].progress_apply(yake_keyword_extraction, args=(kw_model, useless))

### Keyword matching based topic classification

In [None]:
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords_mmr_161222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords_mmr_bigram_161222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords_maxsum_231222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords_maxsum_bigram_231222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords_maxsum_unigram_251222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_maxsum_unigram_251222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_maxsum_unigram_specter_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_maxsum_bigram_specter_261222.csv")

#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_unigram_specter_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_bigram_specter_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords20_mmr_unigram_specter_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords20_mmr_bigram_specter_261222.csv")

#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords30_mmr_unigram_specter_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords40_mmr_unigram_specter_261222.csv")

#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords20_mmr_unigram_specter_div7_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_unigram_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_unigram_roberta_261222.csv")

#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_unigram_specter_nostem_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_unigram_nostem_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_unigram_roberta_nostem_261222.csv")
# Active choice: the file written by the YAKE extraction cell (same path as its
# to_csv call). The commented-out lines above are alternative keyword files.
df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords_yake_030123.csv")

In [None]:
# Display the loaded keyword frame for a visual sanity check.
df_ACL_load

### Training on ACL2020

Build `topic_keywords`, a dictionary that maps each topic to its list of keywords: `{topic: [keywords]}`.

In [None]:
# df_train is bound to the same object as df_ACL_load (no copy is made), so
# in-place edits made through one name are visible through the other.
df_train = df_ACL_load
text_label = "Keyword" # Column the topic keywords are derived from: "Keyword", "Lemm Stemmed Text" or "Text"
topk = 10 # Number of top-scoring terms kept per topic; values tried: 10, 20, 30, 40

In [None]:
# used for keyword only, convert from 'kc kc#kw2#kc kc3' to "kc_kc kw2 kc_kc3"
# format_str is not idempotent: a second pass over an already formatted string
# turns the separating spaces into underscores and merges all keywords into one
# token. Keep the untouched '#'-separated string in "Keyword Raw" and always
# format from it, so that re-running this cell gives the same result.
if "Keyword Raw" not in df_train.columns:
    df_train["Keyword Raw"] = df_train["Keyword"]
df_train["Keyword"] = df_train["Keyword Raw"].apply(format_str)

In [None]:
# Build one "document" per topic: all text_label strings that share a Label are
# joined with single spaces. as_index=False keeps Label as a regular column.
topic_docs = df_train.groupby(['Label'], as_index=False).agg({text_label: ' '.join})

In [None]:
# Plain TF-IDF over the per-topic documents.
tfidf_model = TfidfVectorizer()
tfidf_vals = tfidf_model.fit_transform(topic_docs[text_label])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back only on releases that predate it.
if hasattr(tfidf_model, "get_feature_names_out"):
    keyword_feas = tfidf_model.get_feature_names_out().tolist()
else:
    keyword_feas = tfidf_model.get_feature_names()

In [None]:
# Keyphrases were already collapsed into single underscore-joined tokens, so
# unigrams suffice for the "Keyword" column; free-text columns also get bigrams.
# ngram_range must be a (min_n, max_n) tuple and is now defined for every
# text_label value (previously "Text" left it unbound).
ngram_range = (1, 1) if text_label == "Keyword" else (1, 2)

# Pass ngram_range by keyword: as a positional argument it is either rejected
# (keyword-only signature) or bound to the vectoriser's first parameter, `input`.
count_vectorizer = CountVectorizer(ngram_range=ngram_range).fit(topic_docs[text_label])
count = count_vectorizer.transform(topic_docs[text_label])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back only on releases that predate it.
if hasattr(count_vectorizer, "get_feature_names_out"):
    words = count_vectorizer.get_feature_names_out().tolist()
else:
    words = count_vectorizer.get_feature_names()

In [None]:
class CTFIDFVectorizer(TfidfTransformer):
    """Class-based TF-IDF weighting applied to a count matrix.

    ``fit`` derives one log-scaled weight per term from the term's total count
    and a caller-supplied ``n_samples``; ``transform`` applies those weights and
    L1-normalises every row.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def fit(self, X: sp.csr_matrix, n_samples: int):
        """Compute the per-term weights log(n_samples / total count of the term)."""
        n_terms = X.shape[1]
        # Column sums of X, flattened to a 1-D array with one entry per term.
        term_totals = np.asarray(X.sum(axis=0)).squeeze()
        log_weights = np.log(n_samples / term_totals)
        # Stored as a sparse diagonal matrix so transform() can apply the
        # weights with a single matrix product.
        self._idf_diag = sp.diags(
            log_weights,
            offsets=0,
            shape=(n_terms, n_terms),
            format='csr',
            dtype=np.float64,
        )
        return self

    def transform(self, X: sp.csr_matrix) -> sp.csr_matrix:
        """Weight the counts in X by the fitted term weights and L1-normalise each row."""
        weighted = X * self._idf_diag
        return normalize(weighted, axis=1, norm='l1', copy=False)

In [None]:
# Extract top _topk_ words per class
ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(df_train)).toarray()

# For every topic row: indices of the topk highest-scoring terms in ascending
# score order, keeping only terms whose score is strictly positive.
top_term_ids_per_topic = [
    [term_id for term_id in row_scores.argsort()[-topk:] if row_scores[term_id] > 0]
    for row_scores in ctfidf
]

# topic -> [(keyword, score), ...]; underscores are turned back into spaces.
topic_keywords_val = {
    topic_name: [(words[term_id].replace("_", " "), row_scores[term_id]) for term_id in term_ids]
    for topic_name, row_scores, term_ids in zip(topic_docs['Label'], ctfidf, top_term_ids_per_topic)
}

# topic -> [keyword, ...] in the same order, without the scores.
topic_keywords = {
    topic_name: [keyword for keyword, _ in scored_keywords]
    for topic_name, scored_keywords in topic_keywords_val.items()
}

#topic_keywords = {topic_docs['Label'].iloc[label]: [lemmatiser_stemmer_stopword(words[index].replace("_", " "), nlp, stemmer) for index in ctfidf[label].argsort()[-topk:]  if ctfidf[label][index]>0] for label in range(0,len(topic_docs['Label']))}
#topic_keywords_val = {topic_docs['Label'].iloc[label]: [(lemmatiser_stemmer_stopword(words[index].replace("_", " "), nlp, stemmer), ctfidf[label][index]) for index in ctfidf[label].argsort()[-topk:] if ctfidf[label][index]>0] for label in range(0,len(topic_docs['Label']))}



In [None]:
# Inspect the (keyword, score) pairs selected for each topic.
topic_keywords_val

### Testing on ACL2020/EMNLP2020

In [None]:
# Evaluate on the EMNLP frame. This binds a second name to the same object; no copy is made.
df_test = df_EMNLP_2020

In [None]:
def keyword_count(text, keywords_by_topic=None):
    """Score ``text`` against every topic's keyword list by substring matching.

    Args:
        text: String to search. Matching uses ``str.count``, i.e. non-overlapping
            substring occurrences, so a keyword also matches inside longer words.
        keywords_by_topic: Mapping ``{topic: [keyword, ...]}``. When omitted, the
            notebook-level ``topic_keywords`` dictionary is used, as before.

    Returns:
        A 3-tuple ``(matched, counts, predicted)``:
        ``matched`` maps each topic to the keywords found in ``text``,
        ``counts`` maps each topic to the total number of keyword occurrences,
        ``predicted`` is the topic with the highest count (the first such topic
        in mapping order when several tie, including when all counts are zero).

    Raises:
        ValueError: If the keyword mapping is empty (``max`` of an empty mapping).
    """
    if keywords_by_topic is None:
        keywords_by_topic = topic_keywords

    keyword_count_dict = {}
    keyword_list_dict = {}

    for label, list_of_key_words in keywords_by_topic.items():
        # Scan the text once per keyword; a positive count is equivalent to
        # the former separate `keyword in text` membership test.
        occurrences = [(keyword, text.count(keyword)) for keyword in list_of_key_words]

        keyword_count_dict[label] = sum(n_hits for _, n_hits in occurrences)
        keyword_list_dict[label] = [keyword for keyword, n_hits in occurrences if n_hits > 0]

    return keyword_list_dict, keyword_count_dict, max(keyword_count_dict, key=keyword_count_dict.get)

In [None]:
# count keywords from "Text" or "Lemm Stemmed Text" or "Keyword", according to which column to extract keyword
prediction_columns = ["Matched Keywords", "Dictionary Output", "Predicted Label"]

# keyword_count returns a 3-tuple; result_type='expand' spreads it over three columns.
df_pred = df_test.apply(lambda row: keyword_count(row['Lemm Stemmed Text']), axis='columns', result_type='expand')
df_pred.columns = prediction_columns

# Drop predictions left over from a previous run of this cell before attaching
# the new ones; otherwise every re-run appends another set of identically named
# columns to df_test.
df_test = pd.concat([df_test.drop(columns=prediction_columns, errors='ignore'), df_pred], axis='columns')

In [None]:
# Display the test frame together with the attached prediction columns.
df_test

In [None]:
# Flag every paper as correctly (True) or incorrectly (False) classified.
df_test["Label Outcome"] = df_test["Label"] == df_test["Predicted Label"]

# Count hits and misses per gold label; within each label the hits come first.
df_test_outcome = (
    df_test[['Label', 'Label Outcome']]
    .groupby(['Label', 'Label Outcome'])
    .size()
    .reset_index(name='Counts')
    .sort_values(by=['Label', 'Label Outcome'], ascending=[True, False])
)

import plotly.express as px

# NOTE(review): the title is hard-coded to "ACL" while df_test can be bound to
# the EMNLP frame — confirm which wording is intended.
fig = px.bar(
    df_test_outcome,
    x="Label",
    y="Counts",
    color="Label Outcome",
    title="Predictions for ACL Dataset",
    width=900,
    height=800,
)
fig.show()

In [None]:
# Fix the label order once so that matrix rows, columns and tick labels agree.
idx_labels = list(df_test["Label"].unique())

cm = confusion_matrix(df_test['Label'], df_test['Predicted Label'], labels = idx_labels)
cm_df = pd.DataFrame(cm,
                     index = idx_labels, 
                     columns = idx_labels)

#Plotting the confusion matrix
plt.figure(figsize=(12,12))
# fmt="d": the cells are integer counts; seaborn's default annotation format
# (".2g") would print counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues")
plt.title('Confusion Matrix')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()

In [None]:
# Print scikit-learn's text classification report for gold vs. predicted labels.
print(classification_report(df_test['Label'], df_test['Predicted Label']))

In [None]:
# One row per label with the number of misses (False) and hits (True).
# reindex guarantees that both outcome columns exist even when a run contains
# only hits or only misses; without it the lookup of the missing column below
# raises KeyError.
df_test_accuracy = (
    df_test_outcome
    .pivot(index="Label", columns="Label Outcome", values="Counts")
    .reindex(columns=[False, True])
    .fillna(0)
    .reset_index()
)

# Share of correctly classified papers per label, in percent.
df_test_accuracy["Accuracy"] = df_test_accuracy[True] / (df_test_accuracy[False] + df_test_accuracy[True]) * 100
df_test_accuracy = df_test_accuracy.sort_values(by = 'Accuracy', axis=0, ascending=False)
df_test_accuracy