In [None]:
! pip -q install transformers

In [None]:
import os

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing Libraries

In [2]:
import pickle
import random
import re
import torch
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm, trange
from pathlib import Path
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup
)

In [4]:
!pip -q install tensorboardX

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/101.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.5/101.7 kB[0m [31m941.6 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m51.2/101.7 kB[0m [31m849.0 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
from tensorboardX import SummaryWriter

In [6]:
model_size = "small"

## Defining Arguments

In [7]:

class Args():
    def __init__(self):
        self.output_dir = f'/content/drive/MyDrive/ChatBotAITestingTool/output/output-{model_size}'
        self.model_type = 'gpt3'
        self.model_name_or_path = f'microsoft/DialoGPT-{model_size}'
        self.config_name = f'microsoft/DialoGPT-{model_size}'
        self.tokenizer_name = f'microsoft/DialoGPT-{model_size}'
        self.cache_dir = '/content/drive/MyDrive/ChatBotProject/cached'
        self.block_size = 64
        self.per_gpu_train_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.01
        self.adam_epsilon = 1e-4
        self.max_grad_norm = 1.0
        self.num_train_epochs = 30
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_total_limit = None
        self.seed = 42
        self.local_rank = -1
        self.padding_side='left'
args = Args()

## Data Loading

In [None]:
df= pd.read_csv('/content/drive/Shareddrives/DATA-298A - MSDA Project I/Datasets/Text-Based/AI test tool/qa.csv')

In [None]:
df=df[['Questions','Answers']]

In [None]:
df.dropna(axis=0, inplace=True)

## Contexts

In [None]:
Questions= df.Questions.values
Answers= df.Questions.values

qa=[]
for i in range(len(Questions)):
  qa.append(Questions[i])
  qa.append(Answers[i])

In [None]:
contexts = []

n = 7

for i in range(n, len(qa)):

  row = []
  prev = i - 1 - n
  for j in range(i, prev, -1):
    row.append(qa[j])
  contexts.append(row)

columns = ['Answer'] + ['context '+str(i+1) for i in range(n)]

df_new = pd.DataFrame.from_records(contexts, columns=columns)

df_new.head()

Unnamed: 0,Answer,context 1,context 2,context 3,context 4,context 5,context 6,context 7
0,Why I am not able to login to AI testing tool?,Why I am not able to login to AI testing tool?,Why I am getting an error while login?,Why I am getting an error while login?,How to sign up for AI testing tool,How to sign up for AI testing tool,How to signup in the AI testing tool?,How to signup in the AI testing tool?
1,How to create a new project?,Why I am not able to login to AI testing tool?,Why I am not able to login to AI testing tool?,Why I am getting an error while login?,Why I am getting an error while login?,How to sign up for AI testing tool,How to sign up for AI testing tool,How to signup in the AI testing tool?
2,How to create a new project?,How to create a new project?,Why I am not able to login to AI testing tool?,Why I am not able to login to AI testing tool?,Why I am getting an error while login?,Why I am getting an error while login?,How to sign up for AI testing tool,How to sign up for AI testing tool
3,How to look at previous created function?,How to create a new project?,How to create a new project?,Why I am not able to login to AI testing tool?,Why I am not able to login to AI testing tool?,Why I am getting an error while login?,Why I am getting an error while login?,How to sign up for AI testing tool
4,How to look at previous created function?,How to look at previous created function?,How to create a new project?,How to create a new project?,Why I am not able to login to AI testing tool?,Why I am not able to login to AI testing tool?,Why I am getting an error while login?,Why I am getting an error while login?


In [None]:
# Load and cache examples
def construct_conv(row, tokenizer, eos = True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
    conv = flatten(conv)
    return conv

def load_and_cache_examples(args, tokenizer, df_trn):
    return ConversationDataset(tokenizer, args, df_trn)

class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):
        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)
        directory = args.cache_dir
        cached_features_file = os.path.join(directory, args.model_type + "_cached_lm_" + str(block_size))
        self.examples = []
        for _, row in df.iterrows():
            conv = construct_conv(row, tokenizer)
            self.examples.append(conv)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

def load_and_cache_examples(args, tokenizer, df_trn):
    return ConversationDataset(tokenizer, args, df_trn)

In [None]:
# Model training
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter('/content/drive/MyDrive/run2')
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate,drop_last=True)
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    model = model.module if hasattr(model, "module") else model
    model.resize_token_embeddings(len(tokenizer))
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
    global_step, epochs_trained = 0, 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed= 42
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = (batch, batch)
            if inputs.shape[1] > 1024:
                continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss
    tb_writer.close()
    return global_step, tr_loss / global_step, 1 - (tr_loss / global_step)

In [None]:
def main(df_trn):
    args = Args()
    device = torch.device("cuda")
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed = 42
    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, from_tf=False, config=config, cache_dir=args.cache_dir)
    model.to(args.device)
    train_dataset = load_and_cache_examples(args, tokenizer, df_trn)
    global_step, overall_loss, overall_accuracy = train(args, train_dataset, model, tokenizer)
    os.makedirs('/content/drive/MyDrive/ChatBotAITestingTool/output2/output-small', exist_ok=True)
    model_to_save = (model.module if hasattr(model, "module") else model)
    model_to_save.save_pretrained('/content/drive/MyDrive/ChatBotAITestingTool/output2/output-small')
    tokenizer.save_pretrained(args.output_dir)
    torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
    return overall_loss, overall_accuracy

## Model Training

In [None]:
loss, accuracy = main(df_new)

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

In [None]:
print(loss)

0.738796446791717


In [None]:
print(accuracy)

0.26120355320828303
