In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import random
import csv
import json 
from tqdm import tqdm

In [9]:
torch.backends.cudnn.deterministic = True
torch.manual_seed(123)
torch.cuda.manual_seed_all(123)
np.random.seed(123)
torch.cuda.manual_seed_all(123)

In [4]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [10]:
df = pd.read_csv('QueryType.csv')

In [11]:
Embed = {}

for i in tqdm(range(len(df))):
# for i in range(1):
    df['Query'][i] = "[CLS] " + df['Query'][i] + " [SEP]"
    tokenized_text = tokenizer.tokenize(df['Query'][i])
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)

    indices = np.arange(1,len(tokenized_text)-1)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[0]
        
    token_embeddings = torch.squeeze(hidden_states, dim=0)
    
    embedding = np.zeros(token_embeddings[0].numpy().shape, dtype = np.float32)
    if len(indices) > 0:
        for j in indices:
            embedding += token_embeddings[j].numpy()
        embedding = embedding/len(indices)
    Embed[df['ID'][i]] = embedding.tolist()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
100%|████████████████████████████████████████████████████████████████████████████| 48482/48482 [46:35<00:00, 17.34it/s]


In [13]:
Embed2 = {}
for key in Embed.keys():
    Embed2[int(key)]=Embed[key]

In [14]:
with open("querytype_dataset_sentence_embeddings.json", "w") as outfile:  
    json.dump(Embed2, outfile, indent = 4) 

In [15]:
query_types = set(df['Query Type'])

In [17]:
Embed = {}

for key in query_types:
    temp = "[CLS] " + key + " [SEP]"
    tokenized_text = tokenizer.tokenize(temp)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)

    indices = np.arange(1,len(tokenized_text)-1)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[0]
        
    token_embeddings = torch.squeeze(hidden_states, dim=0)
    
    embedding = np.zeros(token_embeddings[0].numpy().shape, dtype = np.float32)
    if len(indices) > 0:
        for j in indices:
            embedding += token_embeddings[j].numpy()
        embedding = embedding/len(indices)
    Embed[key] = embedding.tolist()

In [20]:
with open("querytype_keys_sentence_embeddings.json", "w") as outfile:  
    json.dump(Embed, outfile, indent = 4) 

In [22]:
Embed = {}
df = pd.read_csv('Sector.csv')

for i in tqdm(range(len(df))):
# for i in range(1):
    df['Query'][i] = "[CLS] " + df['Query'][i] + " [SEP]"
    tokenized_text = tokenizer.tokenize(df['Query'][i])
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)

    indices = np.arange(1,len(tokenized_text)-1)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[0]
        
    token_embeddings = torch.squeeze(hidden_states, dim=0)
    
    embedding = np.zeros(token_embeddings[0].numpy().shape, dtype = np.float32)
    if len(indices) > 0:
        for j in indices:
            embedding += token_embeddings[j].numpy()
        embedding = embedding/len(indices)
    Embed[int(df['ID'][i])] = embedding.tolist()
    
with open("sector_dataset_sentence_embeddings.json", "w") as outfile:  
    json.dump(Embed, outfile, indent = 4) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
100%|████████████████████████████████████████████████████████████████████████████| 38701/38701 [36:30<00:00, 17.67it/s]


In [23]:
Embed = {}
sectors = set(df['Sector'])

for key in sectors:
    temp = "[CLS] " + key + " [SEP]"
    tokenized_text = tokenizer.tokenize(temp)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)

    indices = np.arange(1,len(tokenized_text)-1)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[0]
        
    token_embeddings = torch.squeeze(hidden_states, dim=0)
    
    embedding = np.zeros(token_embeddings[0].numpy().shape, dtype = np.float32)
    if len(indices) > 0:
        for j in indices:
            embedding += token_embeddings[j].numpy()
        embedding = embedding/len(indices)
    Embed[key] = embedding.tolist()
    
with open("sector_keys_sentence_embeddings.json", "w") as outfile:  
    json.dump(Embed, outfile, indent = 4) 

In [24]:
Embed = {}
df = pd.read_csv('Trimmed_data.csv')

# for i in tqdm(range(len(df))):
# # for i in range(1):
#     df['Query'][i] = "[CLS] " + df['Query'][i] + " [SEP]"
#     tokenized_text = tokenizer.tokenize(df['Query'][i])
#     indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
#     segments_ids = [1] * len(tokenized_text)

#     indices = np.arange(1,len(tokenized_text)-1)
#     tokens_tensor = torch.tensor([indexed_tokens])
#     segments_tensors = torch.tensor([segments_ids])
    
#     with torch.no_grad():
#         outputs = model(tokens_tensor, segments_tensors)
#         hidden_states = outputs[0]
        
#     token_embeddings = torch.squeeze(hidden_states, dim=0)
    
#     embedding = np.zeros(token_embeddings[0].numpy().shape, dtype = np.float32)
#     if len(indices) > 0:
#         for j in indices:
#             embedding += token_embeddings[j].numpy()
#         embedding = embedding/len(indices)
#     Embed[int(df['ID'][i])] = embedding.tolist()
    
# with open("sector_dataset_sentence_embeddings.json", "w") as outfile:  
#     json.dump(Embed, outfile, indent = 4) 

In [29]:
Embed = {}
df = pd.read_csv('Trimmed_data.csv')

for i in tqdm(range(400000)):
# for i in range(1):
    df['Query'][i] = "[CLS] " + df['Query'][i] + " [SEP]"
    tokenized_text = tokenizer.tokenize(df['Query'][i])
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)

    indices = np.arange(1,len(tokenized_text)-1)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[0]
        
    token_embeddings = torch.squeeze(hidden_states, dim=0)
    
#     embedding = np.zeros(token_embeddings[0].numpy().shape, dtype = np.float32)
    embedding = token_embeddings[0].numpy()
    Embed[int(df['ID'][i])] = embedding.tolist()
    
with open("sector_dataset_CLS_embeddings.json", "w") as outfile:  
    json.dump(Embed, outfile, indent = 4) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
100%|███████████████████████████████████████████████████████████████████████| 400000/400000 [13:36:24<00:00,  8.17it/s]


In [36]:
df = pd.read_csv('Trimmed_data.csv')
df = df.loc[:399999]
df.to_csv('Dataset1.csv',index=False)