### Importing packages

In [1]:
# Libraries

# Reading in files
import pandas as pd
import numpy as np

import spacy
import scipy.sparse as sp
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

from keybert import KeyBERT
from flair.embeddings import TransformerDocumentEmbeddings

import yake

import sys
import time

from tqdm import tqdm

# Enable the `progress_apply` method that the preprocessing and keyword-extraction cells call on pandas Series.
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'keybert'

In [None]:
# Let displayed cells show up to 5000 characters so long text columns are not truncated.
pd.set_option("display.max_colwidth", 5000)

### Reading in data

In [None]:
# Load the BART prediction exports for the two conferences.
# NOTE(review): the first path says "ACL_2022" while the variable is named df_ACL_2020 — confirm the intended file.
df_ACL_2020 = pd.read_csv("./Data/Pred/BART/ACL_2022_bart_pred_231122.csv")
df_EMNLP_2020 = pd.read_csv("./Data/Pred/BART/EMNLP_2020_bart_pred_231122.csv")

# Keep only the three columns used downstream and give them shorter names.
df_ACL_2020 = df_ACL_2020[["Labels", "Paper Name", "abstract"]]
df_EMNLP_2020 = df_EMNLP_2020[["Labels", "Paper Name", "abstract"]]

df_ACL_2020.columns = ["Label", "Title", "Abstract"]
df_EMNLP_2020.columns = ["Label", "Title", "Abstract"]

# Drop the tracks that are excluded from the experiment. `.copy()` makes the result an
# independent frame, so adding columns to it later does not trigger SettingWithCopyWarning.
df_ACL_2020 = df_ACL_2020.loc[
    ~df_ACL_2020["Label"].isin(["Student Research Workshop", "Theme", "NLP Applications", "System Demonstrations"]), :
].copy()

df_ACL_2020

In [None]:
# Same track exclusion as for the ACL frame. `.copy()` makes the result an independent
# frame, so adding columns to it later does not trigger SettingWithCopyWarning.
df_EMNLP_2020 = df_EMNLP_2020.loc[
    ~df_EMNLP_2020["Label"].isin(["Student Research Workshop", "Theme", "NLP Applications", "System Demonstrations"]), :
].copy()

In [None]:
# Tooling handed to the preprocessing function: an English Snowball stemmer and the
# spaCy pipeline loaded with its parser and NER components disabled.
stemmer = SnowballStemmer("english")
nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

# One text field per paper: title, a single space, then the abstract.
for papers in (df_ACL_2020, df_EMNLP_2020):
    papers['Text'] = papers['Title'] + " " + papers['Abstract']

def lemmatiser_stemmer_stopword(text, nlp, stemmer):
    """Lemmatise `text`, drop stop words, stem every remaining token and lower-case the result.

    Parameters
    ----------
    text : str
        Raw text to normalise.
    nlp : callable
        Called as `nlp(text)`; must return an iterable of tokens exposing
        `lemma_` and `is_stop`.
    stemmer : object
        Must expose `stem(word) -> str`.

    Returns
    -------
    str
        Space-separated, lower-cased stems of the non-stop-word lemmas.
    """
    doc = nlp(text)
    # Stem token by token: the stemmer works on single words, so stemming the joined
    # sentence would treat the whole string as one word.
    stemmed_tokens = [stemmer.stem(token.lemma_) for token in doc if not token.is_stop]

    return " ".join(stemmed_tokens).lower()

In [None]:
# Normalise every ACL text with the spaCy pipeline and stemmer defined above (shows a progress bar).
df_ACL_2020['Lemm Stemmed Text'] = df_ACL_2020['Text'].progress_apply(
    lambda raw_text: lemmatiser_stemmer_stopword(raw_text, nlp, stemmer)
)

In [None]:
# Normalise every EMNLP text with the spaCy pipeline and stemmer defined above (shows a progress bar).
df_EMNLP_2020['Lemm Stemmed Text'] = df_EMNLP_2020['Text'].progress_apply(
    lambda raw_text: lemmatiser_stemmer_stopword(raw_text, nlp, stemmer)
)

In [28]:
# One CSV per conference; the order of the paths matches the order of the target names.
df_P20, df_D20, df_E21, df_P21, df_N21 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Data/ACL_2020.csv",
        "./Data/EMNLP_2020.csv",
        "./Data/EACL_2021.csv",
        "./Data/ACL_JCNLP_2021.csv",
        "./Data/NAACL_2021.csv",
    )
)

In [29]:
# Total number of papers, then per conference: rows, label entries, distinct labels.
print(sum(map(len, (df_P20, df_D20, df_E21, df_P21, df_N21))))
for conference_df in (df_P20, df_D20, df_E21, df_P21, df_N21):
    print(len(conference_df), len(conference_df.Labels), len(conference_df.Labels.unique()))

2894
791 791 23
781 781 20
320 320 21
528 528 22
474 474 22


In [30]:
# Remove every paper labelled "NLP Applications" from each conference frame.
df_P20 = df_P20.loc[df_P20["Labels"].ne("NLP Applications")]
df_D20 = df_D20.loc[df_D20["Labels"].ne("NLP Applications")]
df_E21 = df_E21.loc[df_E21["Labels"].ne("NLP Applications")]
df_P21 = df_P21.loc[df_P21["Labels"].ne("NLP Applications")]
df_N21 = df_N21.loc[df_N21["Labels"].ne("NLP Applications")]

In [31]:
# Same summary as before the filter: total papers, then rows, label entries and distinct labels per conference.
print(sum(map(len, (df_P20, df_D20, df_E21, df_P21, df_N21))))
for conference_df in (df_P20, df_D20, df_E21, df_P21, df_N21):
    print(len(conference_df), len(conference_df.Labels), len(conference_df.Labels.unique()))

2705
742 742 22
715 715 19
317 317 20
502 502 21
429 429 21


In [32]:
# Inspect the ACL 2020 frame after the "NLP Applications" rows were removed.
df_P20

Unnamed: 0,Labels,Paper Name,Author Names
0,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Inflecting When There's No Majority: Limitations of Encoder-Decoder Neural Networks as Cognitive Models for German Plurals,"Kate McCurdy, Sharon Goldwater, Adam Lopez"
1,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Learning to Understand Child-directed and Adult-directed Speech,"Lieke Gelderloos, Grzegorz Chrupała, Afra Alishahi"
2,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Recurrent Neural Network Language Models Always Learn English-Like Relative Clause Attachment,"Forrest Davis, Marten van Schijndel"
3,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",You Don't Have Time to Read This: An Exploration of Document Reading Time Prediction,"Orion Weller, Jordan Hildebrandt, Ilya Reznik, Christopher Challis, E. Shannon Tass, Quinn Snell, Kevin Seppi"
4,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Predicting Depression in Screening Interviews from Latent Categorization of Interview Prompts,"Alex Rinaldi, Jean Fox Tree, Snigdha Chaturvedi"
...,...,...,...
786,Theory and Formalism in NLP (Linguistic and Mathematical),Emergence of Syntax Needs Minimal Supervision,"Raphaël Bailly, Kata Gábor"
787,Theory and Formalism in NLP (Linguistic and Mathematical),A Three-Parameter Rank-Frequency Relation in Natural Languages,"Chenchen Ding, Masao Utiyama, Eiichiro Sumita"
788,Theory and Formalism in NLP (Linguistic and Mathematical),Language Models as an Alternative Evaluator of Word Order Hypotheses: A Case Study in Japanese,"Tatsuki Kuribayashi, Takumi Ito, Jun Suzuki, Kentaro Inui"
789,Theory and Formalism in NLP (Linguistic and Mathematical),Dice Loss for Data-imbalanced NLP Tasks,"Xiaoya Li, Xiaofei Sun, Yuxian Meng, Junjun Liang, Fei Wu, Jiwei Li"


In [14]:
# Collated dataset of all papers, again without the "NLP Applications" label.
df_whole = pd.read_csv("./Data/Collated_dataset_for_scientific_papers.csv")
df_whole = df_whole.loc[df_whole['Labels'].ne('NLP Applications')]
df_whole

Unnamed: 0,Labels,Paper Name,abstract
0,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Inflecting When There's No Majority: Limitations of Encoder-Decoder Neural Networks as Cognitive Models for German Plurals,"Can artificial neural networks learn to represent inflectional morphology and generalize to new words as human speakers do? Kirov and Cotterell (2018) argue that the answer is yes: modern Encoder-Decoder (ED) architectures learn human-like behavior when inflecting English verbs, such as extending the regular past tense form /-(e)d/ to novel words. However, their work does not address the criticism raised by Marcus et al. (1995) : that neural models may learn to extend not the regular, but the most frequent class -and thus fail on tasks like German number inflection, where infrequent suffixes like /-s/ can still be productively generalized. To investigate this question, we first collect a new dataset from German speakers (production and ratings of plural forms for novel nouns) that is designed to avoid sources of information unavailable to the ED model. The speaker data show high variability, and two suffixes evince 'regular' behavior, appearing more often with phonologically atypical inputs. Encoder-decoder models do generalize the most frequently produced plural class, but do not show human-like variability or 'regular' extension of these other plural markers. We conclude that modern neural models may still struggle with minority-class generalization."
1,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Learning to Understand Child-directed and Adult-directed Speech,"Speech directed to children differs from adultdirected speech in linguistic aspects such as repetition, word choice, and sentence length, as well as in aspects of the speech signal itself, such as prosodic and phonemic variation. Human language acquisition research indicates that child-directed speech helps language learners. This study explores the effect of child-directed speech when learning to extract semantic information from speech directly. We compare the task performance of models trained on adult-directed speech (ADS) and child-directed speech (CDS). We find indications that CDS helps in the initial stages of learning, but eventually, models trained on ADS reach comparable task performance, and generalize better. The results suggest that this is at least partially due to linguistic rather than acoustic properties of the two registers, as we see the same pattern when looking at models trained on acoustically comparable synthetic speech."
2,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Recurrent Neural Network Language Models Always Learn English-Like Relative Clause Attachment,"A standard approach to evaluating language models analyzes how models assign probabilities to valid versus invalid syntactic constructions (i.e. is a grammatical sentence more probable than an ungrammatical sentence). Our work uses ambiguous relative clause attachment to extend such evaluations to cases of multiple simultaneous valid interpretations, where stark grammaticality differences are absent. We compare model performance in English and Spanish to show that non-linguistic biases in RNN LMs advantageously overlap with syntactic structure in English but not Spanish. Thus, English models may appear to acquire human-like syntactic preferences, while models trained on Spanish fail to acquire comparable human-like preferences. We conclude by relating these results to broader concerns about the relationship between comprehension (i.e. typical language model use cases) and production (which generates the training data for language models), suggesting that necessary linguistic biases are not present in the training signal at all."
3,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",You Don't Have Time to Read This: An Exploration of Document Reading Time Prediction,"Predicting reading time has been a subject of much previous work, focusing on how different words affect human processing, measured by reading time. However, previous work has dealt with a limited number of participants as well as word level only predictions (i.e. predicting the time to read a single word). We seek to extend these works by examining whether or not document level predictions are effective, given additional information such as subject matter, font characteristics, and readability metrics. We perform a novel experiment to examine how different features of text contribute to the time it takes to read, distributing and collecting data from over a thousand participants. We then employ a large number of machine learning methods to predict a user's reading time. We find that despite extensive research showing that word level reading time can be most effectively predicted by neural networks, larger scale text can be easily and most accurately predicted by one factor, the number of words."
4,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Predicting Depression in Screening Interviews from Latent Categorization of Interview Prompts,"Despite the pervasiveness of clinical depression in modern society, professional help remains highly stigmatized, inaccessible, and expensive. Accurately diagnosing depression is difficult-requiring time-intensive interviews, assessments, and analysis. Hence, automated methods that can assess linguistic patterns in these interviews could help psychiatric professionals make faster, more informed decisions about diagnosis. We propose JLPC, a method that analyzes interview transcripts to identify depression while jointly categorizing interview prompts into latent categories. This latent categorization allows the model to identify high-level conversational contexts that influence patterns of language in depressed individuals. We show that the proposed model not only outperforms competitive baselines, but that its latent prompt categories provide psycholinguistic insights about depression."
...,...,...,...
2710,Summarization,QMSum: A New Benchmark for Query-based Multi-domain Meeting Summarization,"Meetings are a key component of human collaboration. As increasing numbers of meetings are recorded and transcribed, meeting summaries have become essential to remind those who may or may not have attended the meetings about the key decisions made and the tasks to be completed. However, it is hard to create a single short summary that covers all the content of a long meeting involving multiple people and topics. In order to satisfy the needs of different types of users, we define a new query-based multi-domain meeting summarization task, where models have to select and summarize relevant spans of meetings in response to a query, and we introduce QMSum, a new benchmark for this task. QMSum consists of 1,808 query-summary pairs over 232 meetings in multiple domains. Besides, we investigate a locate-then-summarize method and evaluate a set of strong summarization baselines on the task. Experimental results and manual analysis reveal that QMSum presents significant challenges in long meeting summarization for future research. Dataset is available at https://github.com/Yale-LILY/ QMSum."
2711,Summarization,MM-AVS: A Full-Scale Dataset for Multi-modal Summarization,"Multimodal summarization becomes increasingly significant as it is the basis for question answering, Web search, and many other downstream tasks. However, its learning materials have been lacking a holistic organization by integrating resources from various modalities, thereby lagging behind the research progress of this field. In this study, we present a full-scale multimodal dataset comprehensively gathering documents, summaries, images, captions, videos, audios, transcripts, and titles in English from CNN and Daily Mail. To our best knowledge, this is the first collection that spans all modalities and nearly comprises all types of materials available in this community. In addition, we devise a baseline model based on the novel dataset, which employs a newly proposed Jump-Attention mechanism based on transcripts. The experimental results validate the important assistance role of the external information for multimodal summarization."
2712,Summarization,MediaSum: A Large-scale Media Interview Dataset for Dialogue Summarization,"This paper introduces MEDIASUM 1 , a largescale media interview dataset consisting of 463.6K transcripts with abstractive summaries. To create this dataset, we collect interview transcripts from NPR and CNN and employ the overview and topic descriptions as summaries. Compared with existing public corpora for dialogue summarization, our dataset is an order of magnitude larger and contains complex multi-party conversations from multiple domains. We conduct statistical analysis to demonstrate the unique positional bias exhibited in the transcripts of televised and radioed interviews. We also show that MEDIASUM can be used in transfer learning to improve a model's performance on other dialogue summarization tasks. * Equal contribution 1 https://github.com/zcgzcgzcg1/ MediaSum/"
2713,Summarization,Improving Faithfulness in Abstractive Summarization with Contrast Candidate Generation and Selection,"Despite significant progress in neural abstractive summarization, recent studies have shown that the current models are prone to generating summaries that are unfaithful to the original context. To address the issue, we study contrast candidate generation and selection as a model-agnostic post-processing technique to correct the extrinsic hallucinations (i.e. information not present in the source text) in unfaithful summaries. We learn a discriminative correction model by generating alternative candidate summaries where named entities and quantities in the generated summary are replaced with ones with compatible semantic types from the source document. This model is then used to select the best candidate as the final output summary. Our experiments and analysis across a number of neural summarization systems show that our proposed method is effective in identifying and correcting extrinsic hallucinations. We analyze the typical hallucination phenomenon by different types of neural summarization systems, in hope to provide insights for future work on the direction."


In [15]:
# List the distinct topic labels that remain in the collated dataset.
df_whole.Labels.unique()

array(['Linguistic Theories, Cognitive Modeling and Psycholinguistics',
       'Computational Social Science and Social Media',
       'Dialogue and Interactive Systems', 'Discourse and Pragmatics',
       'Ethics and NLP', 'Generation', 'Information Extraction',
       'Information Retrieval and Text Mining',
       'Interpretability and Analysis of Models for NLP',
       'Language Grounding to Vision, Robotics and Beyond',
       'Machine Learning for NLP',
       'Machine Translation and Multilinguality',
       'Phonology, Morphology and Word Segmentation',
       'Question Answering', 'Resources and Evaluation',
       'Semantics: Lexical Semantics',
       'Semantics: Sentence-level Semantics, Textual Inference and Other areas',
       'Sentiment Analysis, Stylistic Analysis, and Argument Mining',
       'Speech and Multimodality', 'Summarization',
       'Syntax: Tagging, Chunking and Parsing',
       'Theory and Formalism in NLP (Linguistic and Mathematical)'],
      dtype=object)

### Keyword extraction

In [None]:
def keybert_keyword_extraction(input_text, kw_model, use_maxsum=True, use_mmr=False, ngram=3, topn=10, nr_cand=20, div=0.5):
    """Extract keywords from `input_text` with a KeyBERT-style model and join them with "#".

    Parameters
    ----------
    input_text : str
        Document to extract keywords from.
    kw_model : object
        Must expose `extract_keywords(text, **options)` returning a sequence whose
        items carry the keyword string at index 0.
    use_maxsum : bool
        Select keywords with max-sum (takes precedence over `use_mmr`); uses `nr_cand`.
    use_mmr : bool
        Select keywords with MMR when `use_maxsum` is False; uses `div`.
    ngram : int
        Upper bound of the keyphrase n-gram range `(1, ngram)`.
    topn : int
        Number of keywords requested from the model.
    nr_cand : int
        Candidate pool size, only passed on in the max-sum branch.
    div : float
        Diversity, only passed on in the MMR branch.

    Returns
    -------
    str
        The extracted keywords joined by "#" (empty string when none are returned).
    """
    shared_options = dict(keyphrase_ngram_range=(1, ngram), stop_words='english', top_n=topn)

    if use_maxsum:
        keywords_res = kw_model.extract_keywords(input_text, use_maxsum=True, nr_candidates=nr_cand,
                                                 **shared_options)
    elif use_mmr:
        keywords_res = kw_model.extract_keywords(input_text, use_mmr=True, diversity=div,
                                                 **shared_options)
    else:
        # Neither strategy requested: use the model's default ranking instead of
        # failing on an unbound `keywords_res`.
        keywords_res = kw_model.extract_keywords(input_text, **shared_options)

    return "#".join(kw[0] for kw in keywords_res)

In [None]:
def yake_keyword_extraction(input_text, kw_model, useless):
    """Run `kw_model.extract_keywords` on `input_text` and join the keywords with "#".

    `useless` is accepted but never read. Each item returned by the extractor is
    expected to carry the keyword string at index 0.
    """
    return "#".join(entry[0] for entry in kw_model.extract_keywords(input_text))

In [None]:
def format_str(keyword_str):
    """Turn a "#"-separated keyword string into space-separated single tokens.

    Spaces inside a keyword become "_" and the "#" separators become spaces,
    e.g. 'kc kc#kw2#kc kc3' -> 'kc_kc kw2 kc_kc3'.
    """
    # Single pass over the string: each character is mapped at most once.
    return keyword_str.translate(str.maketrans({" ": "_", "#": " "}))

#format_str(df_ACL_2020["Keyword"][0])

### Different settings for keyword extraction

In [None]:
# Extraction settings for this run; they are passed positionally to
# keybert_keyword_extraction after the model.
use_maxsum = False
use_mmr = True
ngram = 1
topn = 10
nr_cand = 20  # only read by the max-sum branch of keybert_keyword_extraction

# Embedding backbone. Options: roberta-base, allenai/scibert_scivocab_uncased, allenai/specter
plm_name = "allenai/scibert_scivocab_uncased"
plm = TransformerDocumentEmbeddings(plm_name)
kw_model = KeyBERT(model=plm)

# Keywords are extracted from the raw (unstemmed) text in this configuration.
df_ACL_2020['Keyword'] = df_ACL_2020['Text'].progress_apply(
    keybert_keyword_extraction,
    args=(kw_model, use_maxsum, use_mmr, ngram, topn, nr_cand),
)

df_ACL_2020.to_csv("./Data/ACL_2020_keywords10_mmr_unigram_nostem_261222.csv", index = False)

In [None]:
# Extraction settings for this run; they are passed positionally to
# keybert_keyword_extraction after the model.
use_maxsum = False
use_mmr = True
ngram = 1
topn = 10
nr_cand = 20  # only read by the max-sum branch of keybert_keyword_extraction

# Embedding backbone. Options: roberta-base, allenai/scibert_scivocab_uncased, allenai/specter
plm_name = "allenai/specter"
plm = TransformerDocumentEmbeddings(plm_name)
kw_model = KeyBERT(model=plm)

# Keywords are extracted from the lemmatised and stemmed text in this configuration.
df_ACL_2020['Keyword'] = df_ACL_2020['Lemm Stemmed Text'].progress_apply(
    keybert_keyword_extraction,
    args=(kw_model, use_maxsum, use_mmr, ngram, topn, nr_cand),
)

df_ACL_2020.to_csv("./Data/ACL_2020_keywords10_mmr_unigram_specter_261222.csv", index = False)

In [None]:
# YAKE settings for this run.
ngram = 2
top_n = 20
wind_size = 1
dedup_func = 'seqm'
dedup_thred = 0.7
useless = True  # placeholder third argument required by yake_keyword_extraction; never read there

kw_model = yake.KeywordExtractor(
    n=ngram,
    dedupLim=dedup_thred,
    dedupFunc=dedup_func,
    windowsSize=wind_size,
    top=top_n,
)

# Overwrites the 'Keyword' column with YAKE keywords taken from the lemmatised and stemmed text.
df_ACL_2020['Keyword'] = df_ACL_2020['Lemm Stemmed Text'].progress_apply(
    yake_keyword_extraction,
    args=(kw_model, useless),
)

df_ACL_2020.to_csv("./Data/ACL_2020_keywords_yake_030123.csv", index = False)

In [None]:
# Same YAKE extractor applied to the EMNLP texts; this cell does not write the result to disk.
df_EMNLP_2020['Keyword'] = df_EMNLP_2020['Lemm Stemmed Text'].progress_apply(
    lambda stemmed_text: yake_keyword_extraction(stemmed_text, kw_model, useless)
)

### Keyword matching based topic classification

In [None]:
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords_mmr_161222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords_mmr_bigram_161222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords_maxsum_231222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords_maxsum_bigram_231222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords_maxsum_unigram_251222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_maxsum_unigram_251222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_maxsum_unigram_specter_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_maxsum_bigram_specter_261222.csv")

#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_unigram_specter_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_bigram_specter_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords20_mmr_unigram_specter_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords20_mmr_bigram_specter_261222.csv")

#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords30_mmr_unigram_specter_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords40_mmr_unigram_specter_261222.csv")

#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords20_mmr_unigram_specter_div7_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_unigram_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_unigram_roberta_261222.csv")

#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_unigram_specter_nostem_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_unigram_nostem_261222.csv")
#df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords10_mmr_unigram_roberta_nostem_261222.csv")
# Active choice: the YAKE keyword export written by the YAKE extraction cell above.
df_ACL_load = pd.read_csv("./Data/ACL_2020_keywords_yake_030123.csv")

In [None]:
# Inspect the keyword file that was just loaded.
df_ACL_load

### Training on ACL2020

Obtain `topic_keywords`, a dictionary that maps each topic to its list of keywords: `{topic: [keywords]}`.

In [None]:
# Work on an independent copy so that later edits to df_train (e.g. reformatting the
# "Keyword" column) do not silently change df_ACL_load as well.
df_train = df_ACL_load.copy()
text_label = "Keyword" # Extract keyword from which column, "Keyword" or "Lemm Stemmed Text", or "Text"
topk = 10 # 10, 20, 30, 40

In [None]:
# Used for the "Keyword" column only: convert 'kc kc#kw2#kc kc3' to 'kc_kc kw2 kc_kc3'.
# Always format from the keywords as loaded in df_ACL_load (df_train is derived from it) and
# rebind df_train to a new frame. Re-running this cell then gives the same result; formatting
# an already formatted string would turn the separating spaces into "_" and merge all keywords.
df_train = df_train.assign(Keyword=df_ACL_load["Keyword"].apply(format_str))

In [None]:
# One pseudo-document per topic: all texts of that label joined with single spaces.
topic_docs = df_train.groupby('Label', as_index=False)[text_label].agg(' '.join)

In [None]:
# Plain TF-IDF over the per-topic documents.
tfidf_model = TfidfVectorizer()
tfidf_vals = tfidf_model.fit_transform(topic_docs[text_label])

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement when available.
if hasattr(tfidf_model, "get_feature_names_out"):
    keyword_feas = list(tfidf_model.get_feature_names_out())
else:
    keyword_feas = tfidf_model.get_feature_names()

In [None]:
# Keyword strings are already single tokens (multi-word keywords were joined with "_"),
# so unigrams suffice there; running text additionally uses bigrams.
# `ngram_range` must be a (min_n, max_n) tuple and is always defined, whatever `text_label` is.
ngram_range = (1, 1) if text_label == "Keyword" else (1, 2)

# Pass `ngram_range` by keyword: the first positional parameter of CountVectorizer is `input`,
# so a positional value never reached `ngram_range` (and raises on keyword-only versions).
count_vectorizer = CountVectorizer(ngram_range=ngram_range).fit(topic_docs[text_label])
count = count_vectorizer.transform(topic_docs[text_label])

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement when available.
if hasattr(count_vectorizer, "get_feature_names_out"):
    words = list(count_vectorizer.get_feature_names_out())
else:
    words = count_vectorizer.get_feature_names()

In [None]:
class CTFIDFVectorizer(TfidfTransformer):
    """Class-based TF-IDF: weights the term counts of per-class documents.

    `fit` stores a diagonal matrix of `log(n_samples / term_total)` weights;
    `transform` applies it to a count matrix and L1-normalises every row.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def fit(self, X: sp.csr_matrix, n_samples: int):
        """Compute one global weight per term from the class-by-term count matrix `X`.

        `n_samples` is the numerator of the log ratio. Returns `self`.
        """
        n_terms = X.shape[1]
        # Total count of each term summed over all rows of X.
        term_totals = np.asarray(X.sum(axis=0)).squeeze()
        term_weights = np.log(n_samples / term_totals)
        self._idf_diag = sp.diags(
            term_weights,
            offsets=0,
            shape=(n_terms, n_terms),
            format='csr',
            dtype=np.float64,
        )
        return self

    def transform(self, X: sp.csr_matrix) -> sp.csr_matrix:
        """Scale the counts in `X` by the fitted term weights and L1-normalise each row."""
        weighted_counts = X * self._idf_diag
        return normalize(weighted_counts, axis=1, norm='l1', copy=False)

In [None]:
# Extract the top `topk` words per class from the c-TF-IDF matrix (one row per topic).
ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(df_train)).toarray()

topic_keywords = {}
topic_keywords_val = {}
for topic_row, topic_name in enumerate(topic_docs['Label']):
    topic_scores = ctfidf[topic_row]
    # argsort is ascending, so the slice holds the `topk` best terms from weakest to strongest;
    # terms with a non-positive weight are skipped. "_" is turned back into a space.
    scored_terms = [
        (words[term_idx].replace("_", " "), topic_scores[term_idx])
        for term_idx in topic_scores.argsort()[-topk:]
        if topic_scores[term_idx] > 0
    ]
    topic_keywords[topic_name] = [term for term, _ in scored_terms]
    topic_keywords_val[topic_name] = scored_terms

#topic_keywords = {topic_docs['Label'].iloc[label]: [lemmatiser_stemmer_stopword(words[index].replace("_", " "), nlp, stemmer) for index in ctfidf[label].argsort()[-topk:]  if ctfidf[label][index]>0] for label in range(0,len(topic_docs['Label']))}
#topic_keywords_val = {topic_docs['Label'].iloc[label]: [(lemmatiser_stemmer_stopword(words[index].replace("_", " "), nlp, stemmer), ctfidf[label][index]) for index in ctfidf[label].argsort()[-topk:] if ctfidf[label][index]>0] for label in range(0,len(topic_docs['Label']))}



In [None]:
# Inspect the selected keywords per topic together with their c-TF-IDF weights.
topic_keywords_val

### Testing on ACL2020/EMNLP2020

In [None]:
# Choose the evaluation set; this binds df_test to the same object as df_EMNLP_2020 (no copy is made).
df_test = df_EMNLP_2020

In [None]:
def keyword_count(text, topic_keyword_map=None):
    """Score `text` against every topic's keyword list and predict the best-matching topic.

    Matching is plain substring matching (`in` / `str.count`), so a keyword is also
    found when it occurs inside a longer word.

    Parameters
    ----------
    text : str
        Text to classify.
    topic_keyword_map : dict[str, list[str]] | None
        Mapping from topic label to its keywords. Defaults to the notebook-level
        `topic_keywords` dictionary.

    Returns
    -------
    tuple
        `(matched, counts, predicted)`: `matched` maps each label to the keywords found
        in `text`, `counts` maps each label to the total number of keyword occurrences,
        and `predicted` is the label with the highest count (the first one on ties), or
        `None` when the mapping is empty.
    """
    if topic_keyword_map is None:
        topic_keyword_map = topic_keywords

    keyword_list_dict = {}
    keyword_count_dict = {}

    for label, label_keywords in topic_keyword_map.items():
        keyword_list_dict[label] = [keyword for keyword in label_keywords if keyword in text]
        keyword_count_dict[label] = sum(text.count(keyword) for keyword in label_keywords)

    # `default=None` avoids a ValueError from max() on an empty mapping.
    predicted_label = max(keyword_count_dict, key=keyword_count_dict.get, default=None)

    return keyword_list_dict, keyword_count_dict, predicted_label

In [None]:
# Count keywords in "Text", "Lemm Stemmed Text" or "Keyword", matching the column the keywords were extracted from.
df_pred = df_test.apply(lambda row: keyword_count(row['Lemm Stemmed Text']), axis='columns', result_type='expand')
df_pred.columns = ["Matched Keywords", "Dictionary Output", "Predicted Label"]

# Drop prediction columns left over from an earlier run of this cell before concatenating;
# otherwise a re-run would produce duplicate column names and break the evaluation cells.
df_test = pd.concat([df_test.drop(columns=df_pred.columns, errors='ignore'), df_pred], axis='columns')

In [None]:
# Inspect the test frame with the matched keywords, per-topic counts and predicted label appended.
df_test

In [None]:
import plotly.express as px

# True where the predicted label equals the gold label.
df_test["Label Outcome"] = df_test["Label"].eq(df_test["Predicted Label"])

# Number of correct / incorrect predictions per gold label, correct ones listed first.
df_test_outcome = (
    df_test
    .groupby(['Label', 'Label Outcome'])
    .size()
    .reset_index(name='Counts')
    .sort_values(by=['Label', 'Label Outcome'], ascending=[True, False])
)

# NOTE(review): the title says "ACL" while df_test is set to df_EMNLP_2020 above — confirm which is intended.
fig = px.bar(
    df_test_outcome,
    x="Label",
    y="Counts",
    color="Label Outcome",
    title="Predictions for ACL Dataset",
    width=900,
    height=800,
)
fig.show()

In [None]:
# Rows and columns follow the order in which the gold labels first appear in df_test.
idx_labels = list(df_test["Label"].unique())

cm = confusion_matrix(df_test['Label'], df_test['Predicted Label'], labels = idx_labels)
cm_df = pd.DataFrame(cm,
                     index = idx_labels, 
                     columns = idx_labels)

#Plotting the confusion matrix
plt.figure(figsize=(12,12))
# fmt="d" prints the integer counts as-is; the default annotation format ('.2g')
# would show counts of 100 or more in scientific notation.
sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues")
plt.title('Confusion Matrix')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()

In [None]:
# Per-label precision, recall and F1 of the keyword-matching predictions against the gold labels.
print(classification_report(df_test['Label'], df_test['Predicted Label']))

In [None]:
# Wide table: one row per label, one column per outcome (False / True) holding the counts.
# `reindex` guarantees both outcome columns exist even when every prediction is correct
# (or every prediction is wrong), which would otherwise raise a KeyError below.
df_test_accuracy = (
    df_test_outcome
    .pivot(index="Label", columns="Label Outcome", values="Counts")
    .reindex(columns=[False, True])
    .fillna(0)
    .reset_index()
)

# Share of correctly predicted papers per label, in percent.
df_test_accuracy["Accuracy"] = df_test_accuracy[True] / (df_test_accuracy[False] + df_test_accuracy[True]) * 100
df_test_accuracy = df_test_accuracy.sort_values(by = 'Accuracy', axis=0, ascending=False)
df_test_accuracy