In [1]:
#import all of the needed libraries
import re
import numpy as np
import pandas as pd
from pprint import pprint
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
#import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

#important libraries for NLP
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
from pytorch_transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
from pytorch_transformers import AdamW
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')

from bert_serving.client import BertClient
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam
from tqdm import tqdm, trange
from nltk.corpus import stopwords
from gensim.models import Word2Vec
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
import gensim
from gensim import corpora

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\603766\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path 

import os

import torch
import torch.optim as optim

import random 

# fastai
from fastai import *
from fastai.text import *
from fastai.callbacks import *

# transformers
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig

from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig


In [3]:
import fastai
import transformers
print('fastai version :', fastai.__version__)
print('transformers version :', transformers.__version__)

fastai version : 1.0.60
transformers version : 2.3.0


In [38]:
train = df['abstract']

In [12]:
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig)
}


In [14]:
seed = 42
use_fp16 = False
bs = 16

model_type = 'roberta'
pretrained_model_name = 'roberta-base'

# model_type = 'bert'
# pretrained_model_name='bert-base-uncased'

# model_type = 'distilbert'
# pretrained_model_name = 'distilbert-base-uncased'

#model_type = 'xlm'
#pretrained_model_name = 'xlm-clm-enfr-1024'

# model_type = 'xlnet'
# pretrained_model_name = 'xlnet-base-cased'

In [15]:
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]

In [16]:
model_class.pretrained_model_archive_map.keys()

dict_keys(['roberta-base', 'roberta-large', 'roberta-large-mnli', 'distilroberta-base', 'roberta-base-openai-detector', 'roberta-large-openai-detector'])

In [17]:
def seed_all(seed_value):
    random.seed(seed_value) # Python
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False


In [18]:
seed_all(seed)

In [19]:
class TransformersBaseTokenizer(BaseTokenizer):
    """Wrapper around PreTrainedTokenizer to be compatible with fast.ai"""
    def __init__(self, pretrained_tokenizer: PreTrainedTokenizer, model_type = 'bert', **kwargs):
        self._pretrained_tokenizer = pretrained_tokenizer
        self.max_seq_len = pretrained_tokenizer.max_len
        self.model_type = model_type

    def __call__(self, *args, **kwargs): 
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length and add the spesial tokens"""
        CLS = self._pretrained_tokenizer.cls_token
        SEP = self._pretrained_tokenizer.sep_token
        if self.model_type in ['roberta']:
            tokens = self._pretrained_tokenizer.tokenize(t, add_prefix_space=True)[:self.max_seq_len - 2]
            tokens = [CLS] + tokens + [SEP]
        else:
            tokens = self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2]
            if self.model_type in ['xlnet']:
                tokens = tokens + [SEP] +  [CLS]
            else:
                tokens = [CLS] + tokens + [SEP]
        return tokens


In [20]:
transformer_tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)
transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer = transformer_tokenizer, model_type = model_type)
fastai_tokenizer = Tokenizer(tok_func = transformer_base_tokenizer, pre_rules=[], post_rules=[])

In [21]:
class TransformersVocab(Vocab):
    def __init__(self, tokenizer: PreTrainedTokenizer):
        super(TransformersVocab, self).__init__(itos = [])
        self.tokenizer = tokenizer
    
    def numericalize(self, t:Collection[str]) -> List[int]:
        "Convert a list of tokens `t` to their ids."
        return self.tokenizer.convert_tokens_to_ids(t)
        #return self.tokenizer.encode(t)

    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Convert a list of `nums` to their tokens."
        nums = np.array(nums).tolist()
        return sep.join(self.tokenizer.convert_ids_to_tokens(nums)) if sep is not None else self.tokenizer.convert_ids_to_tokens(nums)
    
    def __getstate__(self):
        return {'itos':self.itos, 'tokenizer':self.tokenizer}

    def __setstate__(self, state:dict):
        self.itos = state['itos']
        self.tokenizer = state['tokenizer']
        self.stoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.itos)})

In [22]:
transformer_vocab =  TransformersVocab(tokenizer = transformer_tokenizer)
numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)

tokenize_processor = TokenizeProcessor(tokenizer=fastai_tokenizer, include_bos=False, include_eos=False)

transformer_processor = [tokenize_processor, numericalize_processor]

In [23]:
pad_first = bool(model_type in ['xlnet'])
pad_idx = transformer_tokenizer.pad_token_id

In [24]:
tokens = transformer_tokenizer.tokenize('Salut c est moi, Hello it s me')
print(tokens)
ids = transformer_tokenizer.convert_tokens_to_ids(tokens)
print(ids)
transformer_tokenizer.convert_ids_to_tokens(ids)

['Sal', 'ut', 'Ġc', 'Ġest', 'Ġmo', 'i', ',', 'ĠHello', 'Ġit', 'Ġs', 'Ġme']
[18111, 1182, 740, 3304, 7458, 118, 6, 20920, 24, 579, 162]


['Sal', 'ut', 'Ġc', 'Ġest', 'Ġmo', 'i', ',', 'ĠHello', 'Ġit', 'Ġs', 'Ġme']

In [40]:
print('[CLS] token :', transformer_tokenizer.cls_token)
print('[SEP] token :', transformer_tokenizer.sep_token)
print('[PAD] token :', transformer_tokenizer.pad_token)
#databunch.show_batch()

[CLS] token : <s>
[SEP] token : </s>
[PAD] token : <pad>


In [None]:
print('[CLS] id :', transformer_tokenizer.cls_token_id)
print('[SEP] id :', transformer_tokenizer.sep_token_id)
print('[PAD] id :', pad_idx)
test_one_batch = databunch.one_batch()[0]
print('Batch shape : ',test_one_batch.shape)
print(test_one_batch)

In [26]:
class CustomTransformerModel(nn.Module):
    def __init__(self, transformer_model: PreTrainedModel):
        super(CustomTransformerModel,self).__init__()
        self.transformer = transformer_model
        
    def forward(self, input_ids, attention_mask=None):
        
        # attention_mask
        # Mask to avoid performing attention on padding token indices.
        # Mask values selected in ``[0, 1]``:
        # ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        attention_mask = (input_ids!=pad_idx).type(input_ids.type()) 
        
        logits = self.transformer(input_ids,
                                  attention_mask = attention_mask)[0]   
        return logits

In [41]:
config = config_class.from_pretrained(pretrained_model_name)
config.num_labels = 5
config.use_bfloat16 = use_fp16
print(config)

{
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 5,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "use_bfloat16": false,
  "vocab_size": 50265
}



In [42]:
transformer_model = model_class.from_pretrained(pretrained_model_name, config = config)
# transformer_model = model_class.from_pretrained(pretrained_model_name, num_labels = 5)

custom_transformer_model = CustomTransformerModel(transformer_model = transformer_model)

In [50]:
from fastai.callbacks import *
from transformers import AdamW
from functools import partial

CustomAdamW = partial(AdamW, correct_bias=False)

learner = Learner(databunch, 
                  custom_transformer_model, 
                  opt_func = CustomAdamW, 
                  metrics=[accuracy, error_rate])

# Show graph of learner stats and metrics after each epoch.
learner.callbacks.append(ShowGraph(learner))

# Put learn in FP16 precision mode. --> Seems to not working
if use_fp16: learner = learner.to_fp16()

NameError: name 'databunch' is not defined

In [None]:
# For roberta-base
list_layers = [learner.model.transformer.roberta.embeddings,
              learner.model.transformer.roberta.encoder.layer[0],
              learner.model.transformer.roberta.encoder.layer[1],
              learner.model.transformer.roberta.encoder.layer[2],
              learner.model.transformer.roberta.encoder.layer[3],
              learner.model.transformer.roberta.encoder.layer[4],
              learner.model.transformer.roberta.encoder.layer[5],
              learner.model.transformer.roberta.encoder.layer[6],
              learner.model.transformer.roberta.encoder.layer[7],
              learner.model.transformer.roberta.encoder.layer[8],
              learner.model.transformer.roberta.encoder.layer[9],
              learner.model.transformer.roberta.encoder.layer[10],
              learner.model.transformer.roberta.encoder.layer[11],
              learner.model.transformer.roberta.pooler]

In [None]:
# For xlnet-base-cased
# list_layers = [learner.model.transformer.transformer.word_embedding,
#               learner.model.transformer.transformer.layer[0],
#               learner.model.transformer.transformer.layer[1],
#               learner.model.transformer.transformer.layer[2],
#               learner.model.transformer.transformer.layer[3],
#               learner.model.transformer.transformer.layer[4],
#               learner.model.transformer.transformer.layer[5],
#               learner.model.transformer.transformer.layer[6],
#               learner.model.transformer.transformer.layer[7],
#               learner.model.transformer.transformer.layer[8],
#               learner.model.transformer.transformer.layer[9],
#               learner.model.transformer.transformer.layer[10],
#               learner.model.transformer.transformer.layer[11],
#               learner.model.transformer.sequence_summary]

In [45]:
# For roberta-base
list_layers = [learner.model.transformer.roberta.embeddings,
              learner.model.transformer.roberta.encoder.layer[0],
              learner.model.transformer.roberta.encoder.layer[1],
              learner.model.transformer.roberta.encoder.layer[2],
              learner.model.transformer.roberta.encoder.layer[3],
              learner.model.transformer.roberta.encoder.layer[4],
              learner.model.transformer.roberta.encoder.layer[5],
              learner.model.transformer.roberta.encoder.layer[6],
              learner.model.transformer.roberta.encoder.layer[7],
              learner.model.transformer.roberta.encoder.layer[8],
              learner.model.transformer.roberta.encoder.layer[9],
              learner.model.transformer.roberta.encoder.layer[10],
              learner.model.transformer.roberta.encoder.layer[11],
              learner.model.transformer.roberta.pooler]

NameError: name 'learner' is not defined

In [46]:
learner.split(list_layers)
num_groups = len(learner.layer_groups)
print('Learner split in',num_groups,'groups')
print(learner.layer_groups)

NameError: name 'learner' is not defined

In [None]:
learner.split(list_layers)
num_groups = len(learner.layer_groups)
print('Learner split in',num_groups,'groups')
print(learner.layer_groups)

In [54]:
from summarizer import Summarizer,TransformerSummarizer

ModuleNotFoundError: No module named 'summarizer'

In [1]:
from summarizer import Summarizer,TransformerSummarizer

ModuleNotFoundError: No module named 'summarizer'

In [24]:
#usage of NER
from transformers import XLNetTokenizer, XLNetForTokenClassification
import torch

tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')

input_ids = torch.tensor(tokenizer.encode("hail to")).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=labels)

scores = outputs[0]

In [25]:
outputs

(tensor(1.3049, grad_fn=<NllLossBackward>), tensor([[[5.1481, 3.9959],
          [5.2125, 4.3895],
          [5.5018, 4.4993],
          [5.1607, 4.1914]]], grad_fn=<AddBackward0>))

In [5]:
df=pd.read_csv('../data/metadata.csv')

In [6]:
df.shape

(45774, 17)

In [7]:
df.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file,url
0,vho70jcx,f056da9c64fbf00a4645ae326e8a4339d015d155,biorxiv,SIANN: Strain Identification by Alignment to N...,10.1101/001727,,,biorxiv,Next-generation sequencing is increasingly bei...,2014-01-10,Samuel Minot; Stephen D Turner; Krista L Ternu...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/001727
1,i9tbix2v,daf32e013d325a6feb80e83d15aabc64a48fae33,biorxiv,Spatial epidemiology of networked metapopulati...,10.1101/003889,,,biorxiv,An emerging disease is one infectious epidemic...,2014-06-04,Lin WANG; Xiang Li,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/003889
2,62gfisc6,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,biorxiv,Sequencing of the human IG light chain loci fr...,10.1101/006866,,,biorxiv,Germline variation at immunoglobulin gene (IG)...,2014-07-03,Corey T Watson; Karyn Meltz Steinberg; Tina A ...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/006866
3,058r9486,4da8a87e614373d56070ed272487451266dce919,biorxiv,Bayesian mixture analysis for metagenomic comm...,10.1101/007476,,,biorxiv,Deep sequencing of clinical samples is now an ...,2014-07-25,Sofia Morfopoulou; Vincent Plagnol,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/007476
4,wich35l7,eccef80cfbe078235df22398f195d5db462d8000,biorxiv,Mapping a viral phylogeny onto outbreak trees ...,10.1101/010389,,,biorxiv,Developing methods to reconstruct transmission...,2014-11-11,Stephen P Velsko; Jonathan E Allen,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/010389


In [8]:
#data cleaning dedup on abstract
#how orignial the abstract is
#based on the references and resources the author created the 
#how similar the paper is to the connecting points
#originality score

In [9]:
abstracts=df['abstract']

In [10]:
#Clean the documents
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop1 = set(stopwords.words('english'))
stop2 = set(stopwords.words('spanish'))
stop3 = set(stopwords.words('french'))
stop=[stop1,stop2,stop3]
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized
str_abstract=[str(abstract) for abstract in df['abstract']]
abstract_array =[clean(abstract).split() for abstract in str_abstract]


In [11]:
#join the cleaned abstracts
abstract_array_joined =[" ".join(clean(abstract).split()) for abstract in str_abstract]


In [12]:
abstract_array_shortened=abstract_array_joined[1:10]
abstract_array_array=abstract_array[1:10]

In [16]:
from transformers import XLNetTokenizer, XLNetModel
import torch

tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetModel.from_pretrained('xlnet-large-cased')



In [19]:
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=False)).unsqueeze(0)  # Batch size 1

outputs = model(input_ids)
last_hidden_states = outputs[0]

In [21]:
model.get_input_embeddings()

Embedding(32000, 1024)

In [22]:
from transformers import XLNetTokenizer, XLNetLMHeadModel
import torch

tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')



In [81]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification
import torch

tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')



In [99]:
input_ids = torch.tensor(tokenizer.encode("Hello.", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]

In [100]:
input_ids

tensor([[   17, 11368,     9,     4,     3]])

In [101]:
outputs

(tensor(1.3576, grad_fn=<NllLossBackward>),
 tensor([[ 0.3827, -0.6775]], grad_fn=<AddmmBackward>))

In [116]:
from transformers import XLNetTokenizer, XLNetForMultipleChoice
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')


In [4]:
from transformers import XLNetTokenizer, XLNetForMultipleChoice
import torch

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')



In [5]:
choices = ["Hello, my dog is cute", "Hello, my cat is amazing","Hello, my bat is amazing"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
labels = torch.tensor(2).unsqueeze(0)  # Batch size 1

outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]

In [6]:
outputs

(tensor(1.1106, grad_fn=<NllLossBackward>),
 tensor([[0.6924, 0.6095, 0.6339]], grad_fn=<ViewBackward>))

In [227]:
choices = ["Hello, my dog is cute", "Hello, my cat is amazing","p"]
maxlen = len(max(choices, key=len))
choices_padded=[line.rjust(maxlen, '0') for line in choices]
toks=[tokenizer.encode(s) for s in choices]

In [224]:
toks

[[17, 11368, 19, 94, 2288, 27, 10920, 4, 3],
 [17, 11368, 19, 94, 4777, 27, 3704, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [943, 4, 3]]

In [233]:
def pad(l, content, width):
    l.extend([content] * (width - len(l)))
    return l
p=[pad(i,0,10) for i in toks]
t=torch.tensor(p).unsqueeze(0)

In [234]:
outputs = model(t, labels=labels)

In [235]:
outputs

(tensor(1.1941, grad_fn=<NllLossBackward>),
 tensor([[-0.7771, -0.8818, -0.7075]], grad_fn=<ViewBackward>))

In [187]:

tok_padded=[]
max_len=20
for i in tokens:
    if len(i)<max_len:
        i.append(np.zeros(max_len-len(i)))
    tok_padded.append(i)

In [188]:
tok_padded

[[17,
  11368,
  19,
  94,
  2288,
  27,
  10920,
  4,
  3,
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])],
 [17,
  11368,
  19,
  94,
  4777,
  27,
  3704,
  4,
  3,
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])],
 [35,
  564,
  94,
  17,
  299,
  722,
  4,
  3,
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])],
 [35,
  564,
  94,
  17,
  299,
  722,
  4,
  3,
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]]

In [176]:
tokens

array([[   17, 11368,    19,    94,  2288,    27, 10920,     4,     3,     0],
       [   17, 11368,    19,    94,  4777,    27,  3704,     4,     3,     0],
       [   35,   564,    94,    17,   299,   722,     4,     3,     0,     0],
       [   35,   564,    94,    17,   299,   722,     4,     3,     0,     0]])

In [163]:
classification_scores

tensor([[0.5286, 0.5334, 0.4525]], grad_fn=<ViewBackward>)

In [105]:
from transformers import XLNetTokenizer, XLNetForQuestionAnswering
import torch

tokenizer =  XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')



In [161]:
input_ids = torch.tensor(tokenizer.encode("Hello", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]

TypeError: forward() got an unexpected keyword argument 'start_positions'

In [110]:
outputs

(tensor(1.6202, grad_fn=<DivBackward0>),)

In [None]:
# Pad, and use classification scores to score the sentences for each abstract
#use these scored sentence to group them using the DB connect method
# you can test this solution in groups and for the abstracts as a whole

In [80]:
input_ids

tensor([[[   17, 11368,    19,    94,  2288,    27, 10920,     4,     3],
         [   17, 11368,    19,    94,  4777,    27,  3704,     4,     3],
         [   17, 11368,    19,    94,  6842,    27, 10920,     4,     3]]])

In [68]:
classification_scores

tensor([[-0.1728,  0.0180,  0.1194]], grad_fn=<ViewBackward>)

In [25]:
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=False)).unsqueeze(0)  # Batch size 1

outputs = model(input_ids)
last_hidden_states = outputs[0]

In [26]:
outputs

(tensor([[[-58.3121, -74.4120, -74.9397,  ..., -74.4455, -72.3030, -69.8531],
          [-58.4822, -74.5033, -75.1071,  ..., -74.4611, -72.2686, -69.8249]]],
        grad_fn=<AddBackward0>),)

In [18]:
input_ids

tensor([[   17, 11368,    19,    94,  2288,    27,   172,     6]])

In [12]:
p=outputs[0].detach().numpy()

In [10]:
outputs

(tensor(7.0585, grad_fn=<NllLossBackward>),
 tensor([[[-33.0345, -42.5843, -42.9639,  ..., -38.2921, -41.6924, -38.1061]]],
        grad_fn=<AddBackward0>))

In [28]:
input_ids

tensor([[   17, 11368]])

In [132]:
#usage of NER
from transformers import XLNetTokenizer, XLNetForTokenClassification
import torch

tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')



In [None]:
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=labels)

scores = outputs[0]

In [17]:
import pickle as pkl


In [None]:
file=open("xlnettokenizer.pkl","wb")
pkl.dump(tokenizer,file)
file.close()

file2=open("xlnetmodel.pkl","wb")
pkl.dump(model,file2)
file2.close()


In [18]:
file = open('xlnettokenizer.pkl', 'rb')

token = pickle.load(file)

file.close()

file2 = open('xlnetmodel.pkl', 'rb')

modelf = pickle.load(file2)

file2.close()

In [19]:
output=[]
score=[]
for abstract in abstract_array_shortened:
    input_ids = torch.tensor(tokenizer.encode(abstract, add_special_tokens=False)).unsqueeze(0)  # Batch size 1
    outputs = modelf(input_ids)
    last_hidden_states = outputs[0]
    output.append(outputs)
    scores = outputs[0]
    score.append(scores)

In [20]:
arrayed_data=[]
for o in output:
    arrayed_data.append(o[0].detach().numpy())

In [21]:
tuple_data=[]
multi_array=[]
sub_array=[]
for i in arrayed_data:
    for j in i:
        for k in j:
            for l in k:
                sub_array.append(l)
            multi_array.append(sub_array)

In [98]:
id2word = corpora.Dictionary(abstract_array_array)


In [104]:
print(id2word)

Dictionary(699 unique tokens: ['a', 'adapting', 'ah1n1', 'ah7n9', 'already']...)


In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForTokenClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
outputs = model(input_ids)
scores = outputs[0]

In [115]:
sentences = abstract_array
new_sent=[]
for i in sentences:
    new_sent.append(str(i))
sentences = ["[SEP]"+sentence +"[CLS]" for sentence in new_sent]

In [116]:
#research XLNet a little more, make sure implemntation is correct
tokenizer_2 = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)
tokenized_texts_2 = [tokenizer_2.tokenize(sent) for sent in sentences]
print(tokenized_texts_2[0])

KeyboardInterrupt: 

In [None]:
input_ids_2 = [tokenizer_2.convert_tokens_to_ids(x) for x in tokenized_texts_2]

In [None]:
MAX_LEN = 150

In [None]:
input_ids_2 = pad_sequences(input_ids_2, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
print(input_ids_2[2])

In [None]:
xp = []
xp.append(ids)
input_ids = np.asarray(xp)
xlnet_model = xlnet.XLNetModel(
    xlnet_config=xlnet_config,
    run_config=run_config,
    input_ids=input_ids_2,
    seg_ids=None,
    input_mask=None)
embed1=tf.train.load_variable('../data/xlnet_cased_L-24_H-1024_A-16/xlnet_model.ckpt','model/transformer/word_embedding/lookup_table:0')

In [None]:
# Dimensionality reduction PCA UMAP TSNE 

In [None]:
#simple kmeans clustering 
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10, random_state=0).fit(multi_array)


In [15]:
len(kmeans.labels_)

45774

In [16]:
# hierarchal clustering 
from sklearn.cluster import AgglomerativeClustering
agg=AgglomerativeClustering(n_clusters=10).fit(input_ids_2)

In [17]:
agg.labels_

array([1, 0, 3, ..., 4, 7, 1], dtype=int64)

In [18]:
#Clustering algorithm 3
from sklearn.cluster import DBSCAN
db=DBSCAN(eps=3).fit(input_ids_2)


In [None]:
#Gaussian Mixture Modeling 

In [19]:
db.labels_

array([-1, -1, -1, ..., -1,  0, -1], dtype=int64)

In [20]:
#create a new dataframe
d = {'id':df['cord_uid'] , 'cluster': agg.labels_,'abstract':abstract_array_joined}
clusters=pd.DataFrame(data=d)

In [21]:
clusters.head()

Unnamed: 0,id,cluster,abstract
0,vho70jcx,1,nextgeneration sequencing is increasingly bein...
1,i9tbix2v,0,an emerging disease is one infectious epidemic...
2,62gfisc6,3,germline variation at immunoglobulin gene ig l...
3,058r9486,0,deep sequencing of clinical sample is now an e...
4,wich35l7,3,developing method to reconstruct transmission ...


In [22]:
gf=clusters.groupby('cluster').count()

In [23]:
gf
#try a double dbscan, dbscan once, then dbscan the first cluster

Unnamed: 0_level_0,id,abstract
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10660,10660
1,3181,3181
2,1189,1189
3,8353,8353
4,5915,5915
5,2970,2970
6,943,943
7,8199,8199
8,1716,1716
9,2648,2648


In [24]:
cluster1=clusters[clusters['cluster']==-1].dropna()
cluster2=clusters[clusters['cluster']==0].dropna()
cluster3=clusters[clusters['cluster']==1].dropna()
cluster4=clusters[clusters['cluster']==2].dropna()
cluster5=clusters[clusters['cluster']==3].dropna()

In [25]:
#seperate the clusters into their own dataframes
cluster1=clusters[clusters['cluster']==0].dropna()
cluster2=clusters[clusters['cluster']==1].dropna()
cluster3=clusters[clusters['cluster']==2].dropna()
cluster4=clusters[clusters['cluster']==3].dropna()
cluster5=clusters[clusters['cluster']==4].dropna()
cluster6=clusters[clusters['cluster']==5].dropna()
cluster7=clusters[clusters['cluster']==6].dropna()
cluster8=clusters[clusters['cluster']==7].dropna()
cluster9=clusters[clusters['cluster']==8].dropna()
cluster10=clusters[clusters['cluster']==9].dropna()

In [34]:
#find the frequency of topics 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(cluster7['abstract'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(cluster7['abstract'])
tf_feature_names = tf_vectorizer.get_feature_names()

In [35]:
#topic model
#always more types of topic modeling Latent Discriminate Analysis 
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation( max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

In [36]:
#display topics
#look up methods for word patterns and frequncy analysis 
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

no_top_words = 10
print_top_words(nmf, tfidf_feature_names, no_top_words)
print_top_words(lda, tf_feature_names, no_top_words)

Topic #0: respiratory child sample infection wa assay virus detected pathogen detection
Topic #1: rna sequence genome viral gene virus replication region dna coronavirus
Topic #2: health disease outbreak infectious public infection risk case ha data
Topic #3: cell mouse infection expression immune receptor viral response cytokine virus
Topic #4: protein membrane domain fusion peptide activity lipid structure cellular binding
Topic #5: antibody monoclonal neutralizing epitope serum igg antigen assay produced used
Topic #6: virus influenza human h5n1 avian h1n1 pathogenic acid pandemic detection
Topic #7: sars sarscov 2003 severe syndrome cov acute respiratory woman kong
Topic #8: vaccine immune vaccination response safety development antigen hajj efficacy strategy
Topic #9: patient group pneumonia day wa covid19 treatment case symptom hospital

Topic #0: sars le la et cov en unit agent 2003 epidemic
Topic #1: vaccine disease immune response study vaccination milk review abstract ha
Topi