In [1]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-1.17.0-py3-none-any.whl (306 kB)
     |████████████████████████████████| 306 kB 377 kB/s            
Collecting pyarrow!=4.0.0,>=3.0.0
  Downloading pyarrow-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25.6 MB)
     |████████████████████████████████| 25.6 MB 75 kB/s             
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
     |████████████████████████████████| 132 kB 836 kB/s            
Collecting multiprocess
  Downloading multiprocess-0.70.12.2-py39-none-any.whl (128 kB)
     |████████████████████████████████| 128 kB 1.2 MB/s            
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp39-cp39-manylinux2010_x86_64.whl (243 kB)
     |████████████████████████████████| 243 kB 1.5 MB/s            
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.2 MB)
     |███████████████

In [None]:
# from datasets import load_dataset
# from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModel

## Loading The Dataset

In [None]:
import pandas as pd
cols = ["sentence1", "sentence2","labels","similarity"]
data = pd.read_csv("actual_train.csv", names = cols)
data.to_csv("actual_train.csv")

In [None]:
data["sentence2"][1]

'play song or music. Whatever you say'

In [None]:
from datasets import load_dataset
dataset = load_dataset("csv", data_files="actual_train.csv")

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'sentence1', 'sentence2', 'labels', 'similarity'],
        num_rows: 156
    })
})

## Tokenization

In [None]:
checkpoint = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def tokenize_function(example):
    return tokenizer(example['sentence1'],example["sentence2"],padding = True, truncation=True, return_tensors='pt')

In [None]:
tokenized_datasets = dataset.map(tokenize_function)

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-2eb4d8d6bd350683/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-c8773499dca0c249.arrow


In [None]:
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2","Unnamed: 0","labels","similarity"])

In [None]:
# tokenized_datasets = tokenized_datasets.rename_column("similarity", "labels")

In [None]:
tokenized_datasets.set_format("torch")

In [None]:
tokenized_datasets["train"].column_names

['input_ids', 'token_type_ids', 'attention_mask']

## Training

In [None]:
# Let's define our data loaders
from torch.utils.data import DataLoader
train_data_loader =  DataLoader(tokenized_datasets["train"], shuffle = True, batch_size = 8, collate_fn = data_collator)
# eval_data_loader = DataLoader(tokenized_datasets["validation"], batch_size = 8, collate_fn = data_collator)

In [None]:
type(train_data_loader)

torch.utils.data.dataloader.DataLoader

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [None]:
# Let's check our code
for batch in train_data_loader:
    break
{k: v.shape for k, v in batch.items()}

In [None]:
# Loading the model
from transformers import AutoModelForSequenceClassification

model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr = 5e-5)

In [None]:
# learning rate scheduler 
# Here we need to know the number of training steps we will take, which is the number of epochs we want to run multiplied by the number of training batches (which is the length of our training dataloader).
from transformers import get_scheduler
num_epochs = 10
num_training_steps = num_epochs * len(train_data_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps,
)

print(num_training_steps)

60


In [None]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(250037, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [None]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [None]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
  for batch in train_data_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

In [None]:
# from datasets import load_metric

# metric = load_metric("glue", "mrpc")
# model.eval()

# for batch in eval_data_loader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     with torch.no_grad():
#         outputs = model(**batch)

#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])

# metric.compute()

## Test Area

In [1]:
from transformers import AutoTokenizer,AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

2022-05-10 20:15:05.868877: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-10 20:15:05.868910: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

In [3]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [5]:
# cols = ["sentences","labels",'encodings']
# data = pd.read_csv("Command.csv")
# data.to_pickle("commands_V2.pkl")

In [4]:
import pandas as pd
cols = ["sentences","labels",'encodings']
data = pd.read_pickle("commands_V2.pkl")
# data.to_csv("actual_train.csv")

In [5]:
# data = data.drop("Unnamed: 0", axis = 1)
# data.head()
# data.to_pickle("commands_V2.pkl")

In [6]:
def get_encoding(sentence):
  encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
  # Compute token embeddings
  with torch.no_grad():
      model_output = model(**encoded_input)
  # Return encoding
  encoding_output =  mean_pooling(model_output, encoded_input['attention_mask'])
  encoding_output = encoding_output.reshape(1, -1)
  return encoding_output

In [12]:
data['labels'].unique

<bound method Series.unique of 0       music
1       music
2       music
3       music
4       music
        ...  
101    reboot
102      date
103      date
104      date
105     intro
Name: labels, Length: 106, dtype: object>

**Add new sentence here**

In [7]:
def add_new_sen(sen,data):
    enc = get_encoding(sen)
    label = input("Enter the label: ")
#     df2 = {'sentences': sen, 'labels': label, 'encodings': enc}
    df2 = pd.DataFrame({'sentences': [sen], 'labels': [label], 'encodings': [enc]})
    data = pd.concat([data,df2], ignore_index = True, axis = 0)
    return data

In [14]:
while(1):
    sen = input("Enter a sentence: ")
    if sen == "done":
        break
    data = add_new_sen(sen,data)
    data.to_pickle("commands_V2.pkl")
    

Enter a sentence: Launch Firefox browser
Enter the label: firefox
Enter a sentence: bring out my firefox
Enter the label: firefox
Enter a sentence: I need firefox
Enter the label: firefox
Enter a sentence: Launch a browser
Enter the label: browser
Enter a sentence: I need a browser
Enter the label: browser
Enter a sentence: done


In [27]:
data.to_csv("Commmand.csv")

In [None]:
# import pandas as pd
# cols = ["sentences","labels",'encodings']
# data = pd.read_pickle("commands_V2.pkl")
# # data.to_csv("actual_train.csv")

In [8]:
def similarity_score(sentence):
  encoded_output = get_encoding(sentence)
  scores = []
  min_score = 0.68
  for i in range(len(data)):
    score = cosine_similarity(data["encodings"][i], encoded_output)
    scores.append(score)
    command_idx = np.argmax(scores)
    max_score = np.max(scores)
    
#     if max_score < min_score:
#         label = "web search"
#     else:
    label = data["labels"][command_idx]
        
  return command_idx,label,max_score

In [15]:
similarity_score("Launch firefox")

(143, 'firefox', 0.8275683)

Ohh yeh.....😱😱😱😱<br>
The Primordial dude you are Great.....

In [54]:
data.to_pickle("commands.pkl")

In [None]:
data["sentences"][8]

## Test Dummies

In [None]:
# sentences = [data["sentence1"][27], data["sentence1"][30]]
# sentences

['open skype', 'open google meet']

In [53]:
import pandas as pd
cols = ["sentences","labels", 'temp','encodings']
data = pd.read_csv("commands(Product).csv", names=cols)
data = data.drop("temp", axis = 1)
data.head()

Unnamed: 0,sentences,labels,encodings
0,open music,music,D
1,paly music for me,music,
2,can you play anything? my friends are getting ...,music,
3,choose a song from library and play it,music,
4,"i am bored,can you play a song for me",music,


In [21]:
data["sentences"][16]

'launch settings'

In [22]:
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

NameError: name 'sentences' is not defined

In [None]:

# Compute token embeddings
# with torch.no_grad():
#     model_output = model(**encoded_input)

# # Perform pooling. In this case, max pooling.
# sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])


In [None]:
# print("Sentence embeddings:")
# print(sentence_embeddings[0])

In [None]:
# v1 = sentence_embeddings[0].reshape(1, -1)
# v2 = sentence_embeddings[1].reshape(1, -1)
# cosine_similarity(v1, v2)

In [16]:
# encoding = get_encoding(data["sentences"][1])
lst = []
for i in range(len(data)):
  encoding = get_encoding(data["sentences"][i])
  print(f"done{i}")
  lst.append(encoding)


done0
done1
done2
done3
done4
done5
done6
done7
done8
done9
done10
done11
done12
done13
done14
done15
done16
done17
done18
done19
done20
done21
done22
done23
done24
done25
done26
done27
done28
done29
done30
done31
done32
done33
done34
done35
done36
done37
done38
done39
done40
done41
done42
done43
done44
done45
done46
done47
done48
done49
done50
done51
done52
done53
done54
done55
done56
done57
done58
done59
done60
done61
done62
done63
done64
done65
done66
done67
done68
done69
done70
done71
done72
done73
done74
done75
done76
done77
done78
done79
done80
done81
done82
done83
done84
done85
done86
done87
done88
done89
done90
done91
done92
done93
done94
done95
done96
done97
done98
done99
done100
done101
done102
done103
done104
done105
done106
done107
done108
done109
done110
done111
done112
done113
done114
done115
done116
done117
done118
done119
done120
done121
done122
done123
done124
done125
done126
done127
done128
done129
done130
done131
done132
done133
done134
done135
done136
done137
done13

In [23]:
type(data["encodings"][1])

torch.Tensor

In [24]:
data["encodings"] = lst

In [25]:
data.to_pickle("commands_V2.pkl")

In [42]:
# data.to_csv("commands.csv")