In [None]:
# Colab-only setup: mount the user's Google Drive at /content/drive so the
# project files stored there are reachable from the notebook's file system.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd drive/My\ Drive/Colab\ Notebooks/apex-codes/citation_sum

/content/drive/My Drive/Colab Notebooks/apex-codes/citation_sum


In [None]:
# Install (or upgrade, -U) the sentence-transformers package quietly (-q).
# NOTE(review): the version is not pinned, so results may vary between runs
# as new releases appear — consider pinning.
!pip3 install -q -U sentence-transformers

In [None]:
# Fetch the NLTK resources 'punkt' and 'stopwords'.
# NOTE(review): nothing in the visible cells uses nltk — confirm these
# downloads are still needed.
import nltk
nltk.download('punkt')  
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd
import os
from sklearn.cluster import KMeans
from sklearn import metrics
from collections import defaultdict
import json
from pprint import pprint
from sklearn.metrics import silhouette_score
import json
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any warnings raised by the clustering code
# (e.g. about non-converged fits) — consider narrowing the filter.
warnings.filterwarnings("ignore")

### Load the Sentence-BERT model

In [None]:
# Build the sentence-embedding model from the pretrained checkpoint name
# 'distilbert-base-nli-mean-tokens'. The resulting module-level `model` is
# read as a global by _generate_clusters_acc (via model.encode).
# NOTE(review): presumably this downloads the weights on first use — confirm.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

## Function to cluster the abstractive citation contexts of each cited id

In [None]:
def _generate_clusters_acc(abstractive_citation_contexts_lst, OUTPUT_PATH, cited_id_wo_txt, w_abstract=False, random_state=None):
  """Cluster the citation contexts of one cited id with K-means and dump them to JSON.

  The contexts are embedded with the module-level ``model`` (``model.encode``),
  K-means is run for a range of ``k`` values, and the labelling with the highest
  average silhouette score is kept. The result is written to
  ``{OUTPUT_PATH}/{cited_id_wo_txt}.json`` as a mapping from cluster id (as a
  string) to the list of contexts assigned to that cluster.

  When there are too few contexts for the search to run (fewer than four), no
  clustering is attempted and all contexts are written as the single cluster
  ``"0"``; an empty input produces an empty JSON object. This replaces the
  previous behaviour of raising after the output file had already been
  created, which left an empty, unparsable file behind.

  Args:
    abstractive_citation_contexts_lst: list of context strings to cluster.
    OUTPUT_PATH: directory the JSON file is written into (must already exist).
    cited_id_wo_txt: base name (no extension) of the output JSON file.
    w_abstract: accepted for interface compatibility; not used by this function.
    random_state: optional seed forwarded to ``KMeans`` so runs can be made
      reproducible. ``None`` (default) keeps the unseeded behaviour.

  Returns:
    None. The only effect is the JSON file written to disk.
  """
  n_contexts = len(abstractive_citation_contexts_lst)

  # Bounds of the search over k (the number of clusters).
  min_clusters = 2
  max_clusters = n_contexts - 1

  if max_clusters > min_clusters:
    # Embed all contexts in one go: one embedding per citation context.
    abstractive_context_embeddings_np = model.encode(abstractive_citation_contexts_lst)

    best_silhouette = None
    best_labels = None
    # NOTE(review): range() excludes max_clusters itself, so the largest k tried
    # is n_contexts - 2. Kept as in the original search — confirm this is intended.
    for k in range(min_clusters, max_clusters):
      kmeans = KMeans(n_clusters=k, max_iter=10, init='random', random_state=random_state)
      labels = kmeans.fit(abstractive_context_embeddings_np).labels_
      silhouette_avg = silhouette_score(abstractive_context_embeddings_np, labels)
      # Always accept the first candidate so a best labelling exists even when
      # every silhouette score is <= 0.
      if best_labels is None or silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        # Keep the labels that were actually scored; re-fitting with a random
        # initialisation would not necessarily reproduce this clustering.
        best_labels = labels
  else:
    # Too few contexts to search over k: treat them as one cluster.
    best_labels = [0] * n_contexts

  # Group the contexts by the cluster id they were assigned to.
  dict_cluster_abstractive_contexts = defaultdict(list)
  for context, cluster_id in zip(abstractive_citation_contexts_lst, best_labels):
    dict_cluster_abstractive_contexts[str(cluster_id)].append(context)

  # Write the clusters to the file system; the context manager closes the file.
  with open(f'{OUTPUT_PATH}/{cited_id_wo_txt}.json', 'w') as fp:
    json.dump(dict_cluster_abstractive_contexts, fp, indent=4)


In [None]:
def main():
  """Cluster the citation contexts of every file in the input directory.

  Each regular file in ``ABSTRACTIVE_CONTEXT_PATH`` is read as one context per
  line, duplicate lines are dropped (keeping first-seen order so runs are
  repeatable), and the contexts are handed to ``_generate_clusters_acc``, which
  writes one JSON file per input file into ``OUTPUT_PATH``.

  A failure while clustering one file is reported and that file is skipped, so
  a single bad input does not abort the whole run.
  """
  w_rp_abstract = False  # changes based on whether rp_abstract

  # Both directory names differ only in their suffix, selected by the flag.
  if w_rp_abstract:
    variant = "CITATIONS_AND_RP_ABSTRACT"
  else:
    variant = "CITATIONS_ONLY"
  ABSTRACTIVE_CONTEXT_PATH = f"abstractive_citation_contexts_MAGSumm/ABSTRACTIVE_CITATION_CONTEXTS_FROM_{variant}"
  OUTPUT_PATH = f"CLUSTERING_MAGSumm_Abstractive_Citation_Contexts/CLUSTERS_FROM_{variant}"
  os.makedirs(OUTPUT_PATH, exist_ok=True)

  for count, cited_id in enumerate(os.listdir(ABSTRACTIVE_CONTEXT_PATH)):
    if count % 100 == 0:
      print("Iteration: {}".format(count))

    context_file = os.path.join(ABSTRACTIVE_CONTEXT_PATH, cited_id)
    # os.listdir also returns sub-directories; those cannot be read as a
    # context file, so skip them.
    if not os.path.isfile(context_file):
      continue

    with open(context_file, 'r') as fp:
      # dict.fromkeys de-duplicates while preserving line order, unlike set(),
      # whose iteration order for strings can differ between runs.
      abstractive_citation_contexts_lst = list(dict.fromkeys(fp.read().splitlines()))

    # Strip only a trailing '.txt' extension, not an occurrence inside the name.
    if cited_id.endswith('.txt'):
      cited_id_wo_txt = cited_id[:-len('.txt')]
    else:
      cited_id_wo_txt = cited_id

    try:
      _generate_clusters_acc(abstractive_citation_contexts_lst, OUTPUT_PATH, cited_id_wo_txt, w_abstract=w_rp_abstract)
    except Exception as exc:
      # Best effort: report the problem and move on to the next cited id.
      # Exception (not a bare except) lets KeyboardInterrupt stop the run.
      print("Skipping {}: {}: {}".format(cited_id, type(exc).__name__, exc))
      continue

In [None]:
# Entry point: run the full clustering pass when this cell/script is executed
# as the main module.
if __name__ == "__main__":
  main()

Iteration: 0
