In [1]:
# Colab-only setup: mount Google Drive at /content/drive so the notebook can
# read and write project files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/My\ Drive/Colab\ Notebooks/apex-codes/citation_sum

/content/drive/My Drive/Colab Notebooks/apex-codes/citation_sum


In [3]:
!pip3 install -q -U sentence-transformers

[K     |████████████████████████████████| 78 kB 5.8 MB/s 
[K     |████████████████████████████████| 3.1 MB 32.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 49.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 38.4 MB/s 
[K     |████████████████████████████████| 59 kB 8.6 MB/s 
[K     |████████████████████████████████| 895 kB 38.0 MB/s 
[K     |████████████████████████████████| 596 kB 56.5 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [4]:
# Fetch the NLTK 'punkt' and 'stopwords' packages into the local nltk_data dir.
# NOTE(review): no cell visible in this notebook uses nltk afterwards — confirm
# whether these downloads are still required.
import nltk
nltk.download('punkt')  
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd
import os
from sklearn.cluster import KMeans
from sklearn import metrics
from collections import defaultdict
import json
from pprint import pprint
from sklearn.metrics import silhouette_score
import json
import warnings
warnings.filterwarnings("ignore")

### Load the Sentence-BERT model

In [7]:
# Load the pretrained sentence-embedding checkpoint once. `model` is a
# module-level global that `_generate_clusters_acc` reads via `model.encode(...)`.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Function to generate clusters of abstractive contexts corresponding to each cited id

In [19]:
def _generate_clusters_acc(abstractive_citation_contexts_lst, OUTPUT_PATH, cited_id_wo_txt, w_abstract=False):
  # All citation contexts will be embedded here in one go---the size of 'abstractive_context_embeddings_np' is same as
  # the number of abstractive citation contexts associated with a cited paper
  abstractive_context_embeddings_np = model.encode(abstractive_citation_contexts_lst)

  # Find the best value for k-number of clusters, iteratively
  min_clusters = 2
  max_clusters = len(abstractive_context_embeddings_np) - 1
  max_calinski_score = 0.0    # we want to maximize the calinski harabasz score
  max_silhouette_score = 0.0
  
  if max_clusters > min_clusters:
    for k in range(min_clusters, max_clusters):
      kmeans = KMeans(n_clusters=k, max_iter=100, init='random').fit(abstractive_context_embeddings_np)
      labels = kmeans.labels_
      silhouette_avg = silhouette_score(abstractive_context_embeddings_np, labels)
      if silhouette_avg > max_silhouette_score:
        max_silhouette_score = silhouette_avg
        K_max = k
  
    # Run K-means with k that has given the best calinski score
    kmeans = KMeans(n_clusters=K_max, max_iter=100, init='random').fit(abstractive_context_embeddings_np)
    labels = kmeans.labels_

    # Dump the abstractive contexts into their respective clusters after K-means has run and converged with the best 'K'
    dict_cluster_abstractive_contexts = defaultdict(list)
    for idx, cluster_id in enumerate(labels):
      #dict_cluster_abstractive_contexts[str(cluster_id)].append(' '.join(abstractive_contexts_list[idx]))
      dict_cluster_abstractive_contexts[str(cluster_id)].append(abstractive_citation_contexts_lst[idx])

  # write the defaultdict to a json file to the file system
  with open(f'{OUTPUT_PATH}/{cited_id_wo_txt}.json', 'w') as fp:
    json.dump(dict_cluster_abstractive_contexts, fp, indent=4)
  fp.close()


In [22]:
def main():
  """Cluster the abstractive citation contexts of every cited paper in the input directory.

  Each file in the selected input directory holds the contexts for one cited
  paper, one context per line. For every file, `_generate_clusters_acc` is
  called to write one JSON file of clusters into the matching output directory.
  A failure on one cited paper is reported and that paper is skipped, so the
  remaining papers are still processed.
  """
  w_rp_abstract = True  # changes based on whether rp_abstract
  if w_rp_abstract:
    ABSTRACTIVE_CONTEXT_PATH = "abstractive_citation_contexts_SciSummNet/ABSTRACTIVE_CITATION_CONTEXTS_FROM_CITATIONS_AND_RP_ABSTRACT"
    OUTPUT_PATH = "CLUSTERING_SciSummNet_Abstractive_Citation_Contexts/CLUSTERS_FROM_CITATIONS_AND_RP_ABSTRACT"
  else:
    ABSTRACTIVE_CONTEXT_PATH = "abstractive_citation_contexts_SciSummNet/ABSTRACTIVE_CITATION_CONTEXTS_FROM_CITATIONS_ONLY"
    OUTPUT_PATH = "CLUSTERING_SciSummNet_Abstractive_Citation_Contexts/CLUSTERS_FROM_CITATIONS_ONLY"
  os.makedirs(OUTPUT_PATH, exist_ok=True)

  for count, cited_id in enumerate(os.listdir(ABSTRACTIVE_CONTEXT_PATH)):
    if count % 100 == 0:
      print("Iteration: {}".format(count))
    # The `with` block closes the file; no explicit close() is needed
    with open(os.path.join(ABSTRACTIVE_CONTEXT_PATH, cited_id), 'r') as fp:
      abstractive_citation_contexts_lst = fp.read().splitlines()

    # The cited id (file name without '.txt') becomes the stem of the output JSON file
    cited_id_wo_txt = cited_id.replace('.txt', '')

    try:
      _generate_clusters_acc(abstractive_citation_contexts_lst, OUTPUT_PATH, cited_id_wo_txt, w_abstract=False)
    except Exception as err:
      # Best effort: keep going, but report what was skipped and why.
      # print() is used because warnings are globally silenced in this notebook.
      print("Skipping {}: {}: {}".format(cited_id, type(err).__name__, err))
      continue

In [23]:
# Entry point: run the full clustering pipeline when this cell is executed
# (also works if the code is exported and run as a script).
if __name__ == "__main__":
  main()

Iteration: 0
Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
Iteration: 600
Iteration: 700
Iteration: 800
Iteration: 900
Iteration: 1000
