<a href="https://colab.research.google.com/github/ApplePie534/ApplePie534/blob/main/Question_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Install the third-party packages used by the cells below (datasets, transformers,
# pytorch-lightning, streamlit, nltk, ...). No versions are pinned here.
!pip install --quiet  datasets pyarrow tqdm transformers tokenizers sentencepiece pytorch-lightning torchtext streamlit nltk

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3108, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2901, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 441, in run
    conflicts = self._determine_conflicts(to_install)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 

In [14]:
%%writefile load_dataset.py
from datasets import load_dataset
from pprint import pprint

# Fetch both SQuAD splits. The names defined in this module are imported
# by the training script and by the evaluation cell.
train_dataset = load_dataset('squad', split='train')
full_valid_dataset = load_dataset('squad', split='validation')

# Restrict validation to its first 1000 rows.
valid_dataset = full_valid_dataset.select(range(1000))

# Report the size of each split, one line per split.
print(
    f"Total Train Samples: {len(train_dataset)}",
    f"Total Validation Samples (full): {len(full_valid_dataset)}",
    f"Total Validation Samples (subset): {len(valid_dataset)}",
    sep="\n",
)

# Keep the first validation record around as a worked example and show it.
sample_validation_dataset = next(iter(valid_dataset))
pprint(sample_validation_dataset)

Overwriting load_dataset.py


In [15]:
%%writefile train_model.py

#imports
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy
import os
from transformers import AdamW,T5ForConditionalGeneration,T5Tokenizer,get_linear_schedule_with_warmup
import pytorch_lightning as pl
from load_dataset import sample_validation_dataset,train_dataset,valid_dataset

# Use the GPU whenever PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = "cpu"
print(device)

# Allow pandas to display up to 100 rows and 100 columns.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

def create_pandas_dataset(data,
                          answer_threshold=7,
                          verbose = False):

  ''' Create a Pandas DataFrame from a Hugging Face dataset.

  Each record is expected to expose ``record['context']``, ``record['question']``
  and ``record['answers']['text']`` (a list whose first entry is used as the answer).

  Params:
        data: Iterable of records in the layout described above.
        answer_threshold: Only keep Question/Answer pairs whose answer has fewer
            than this many whitespace-separated words.
        verbose: When True, also return how many records were dropped and kept.

  Returns:
        A DataFrame with columns ``context``, ``answer``, ``question`` and a
        0..n-1 index. With ``verbose=True`` a tuple
        ``(result_df, count_long, count_short)`` is returned instead.
  '''
  count_long = 0
  # Collect rows in a plain list and build the frame once at the end; growing a
  # DataFrame with ``.loc[i] = ...`` reallocates on every insert.
  kept_rows = []
  for val in tqdm(data):
      answer = val['answers']['text'][0]
      if len(answer.split()) >= answer_threshold:
          count_long += 1
          continue
      kept_rows.append((val['context'], answer, val['question']))

  result_df = pd.DataFrame(kept_rows, columns=['context', 'answer', 'question'])
  count_short = len(kept_rows)

  if verbose:
    return (result_df,
            count_long,
            count_short)
  return result_df

# Pull the individual fields out of the example validation record and show them.
context, question = sample_validation_dataset['context'], sample_validation_dataset['question']
answer = sample_validation_dataset['answers']['text'][0]
print('---------------'*9)
print('\nBreaking it Down\n')
print ("context:",context)
print ("question:",question)
print ("answer:",answer)

# Flatten the train split first, then the validation split, into DataFrames.
df_train = create_pandas_dataset(train_dataset)
df_validation = create_pandas_dataset(valid_dataset)
print(f"\n Total Train Samples:{df_train.shape} , Total Validation Samples:{df_validation.shape}")

# Persist both frames to parquet; the Dataset class below reads them back from disk.
df_train.to_parquet('train_squad.parquet')
df_validation.to_parquet('validation_squad.parquet')

# Same names as the import at the top of the file; kept here unchanged.
from transformers import AdamW,T5ForConditionalGeneration,T5Tokenizer,get_linear_schedule_with_warmup

# Base checkpoint to fine-tune: tokenizer first, then the model weights.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-large',model_max_length=512)
t5_model = T5ForConditionalGeneration.from_pretrained('t5-large')

class QuestionGenerationDataset(Dataset):
    """Torch dataset of tokenized (context + answer) -> question pairs.

    Rows are read from a parquet file with ``context``, ``answer`` and
    ``question`` columns and are tokenized eagerly when the dataset is built.

    Args:
        tokenizer: Tokenizer exposing ``batch_encode_plus`` and ``pad_token_id``.
        filepath: Path of the parquet file to read.
        max_len_inp: Length every encoded input is padded/truncated to.
        max_len_out: Length every encoded target is padded/truncated to.
        max_samples: Keep only the first ``max_samples`` rows of the file.
            Defaults to 2000 (the previous hard-coded cap); pass ``None`` to
            use every row.
    """

    def __init__(self, tokenizer, filepath, max_len_inp=512,max_len_out=96,
                 max_samples=2000):
        self.path = filepath

        self.passage_column = "context"
        self.answer = "answer"
        self.question = "question"

        frame = pd.read_parquet(self.path)
        if max_samples is not None:
            # Explicit, configurable cap instead of a silent hard-coded slice.
            frame = frame.iloc[:max_samples, :]
        self.data = frame

        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self._build()

    def __len__(self):
        """Number of tokenized examples held by the dataset."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the tensors for one example.

        The returned dict holds ``source_ids``/``source_mask`` for the encoder,
        ``target_ids``/``target_mask`` for the decoder, and ``labels``: a copy
        of ``target_ids`` with padding positions replaced by -100.
        """
        # squeeze(0) drops only the leading batch dimension: [1, dim] -> [dim]
        source_ids = self.inputs[index]["input_ids"].squeeze(0)
        target_ids = self.targets[index]["input_ids"].squeeze(0)

        src_mask = self.inputs[index]["attention_mask"].squeeze(0)
        target_mask = self.targets[index]["attention_mask"].squeeze(0)

        # Use the tokenizer's own padding id; fall back to 0 (the value that
        # was previously hard-coded) when the tokenizer does not define one.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0

        labels = target_ids.clone()  # independent copy; target_ids stays intact
        labels[labels == pad_id] = -100

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask,"labels":labels}

    def _build(self):
        """Tokenize every row of ``self.data`` into ``self.inputs`` / ``self.targets``."""
        for rownum,val in tqdm(self.data.iterrows(), total=len(self.data)): # Iterating over the dataframe
            passage,answer,target = val[self.passage_column],val[self.answer],val[self.question]

            input_ = f"context: {passage}  answer: {answer}" # Input text fed to the model
            target = f"question: {str(target)}" # Output format we require

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len_input,padding='max_length',
                truncation = True,return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len_output,padding='max_length',
                truncation = True,
                return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

# Parquet files written earlier in this script by df_train / df_validation.
train_path = 'train_squad.parquet' # change this accordingly
validation_path = 'validation_squad.parquet'
# NOTE: this rebinds `train_dataset` (previously the Hugging Face split imported
# from load_dataset) to the tokenized torch Dataset; T5Tuner's dataloaders read
# `train_dataset` / `validation_dataset` as module-level globals.
train_dataset = QuestionGenerationDataset(t5_tokenizer,train_path)
validation_dataset = QuestionGenerationDataset(t5_tokenizer,validation_path)

# Data Sample

# Decode one tokenized example back to text as a sanity check of the
# input/target formatting.
train_sample = train_dataset[50] # thanks to __getitem__
decoded_train_input = t5_tokenizer.decode(train_sample['source_ids'])
decoded_train_output = t5_tokenizer.decode(train_sample['target_ids'])

print(decoded_train_input)
print(decoded_train_output)

from torch.optim import AdamW
import argparse

class T5Tuner(pl.LightningModule):
    """LightningModule that fine-tunes a T5 model for question generation.

    The dataloaders read the module-level ``train_dataset`` and
    ``validation_dataset`` objects, so both must exist before ``fit`` is called.

    Args:
        t5model: The seq2seq model to fine-tune.
        t5tokenizer: Tokenizer that belongs to ``t5model`` (stored, not used here).
        batchsize: Batch size used by both dataloaders.
    """

    def __init__(self,t5model, t5tokenizer,batchsize=4):
        super().__init__()
        self.model = t5model
        self.tokenizer = t5tokenizer
        self.batch_size = batchsize

    def forward( self, input_ids, attention_mask=None,
                decoder_attention_mask=None,
                lm_labels=None):
        """Run the wrapped model; ``lm_labels`` is passed on as its ``labels``."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

    def _batch_loss(self, batch):
        """Forward one batch produced by QuestionGenerationDataset and return its loss."""
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )
        # First element of the model output is used as the loss.
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._batch_loss(batch)
        self.log('train_loss',loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._batch_loss(batch)
        self.log("val_loss",loss)
        return loss

    def train_dataloader(self):
        """Training batches, reshuffled every epoch.

        Without shuffling the rows arrive in dataset order, so every epoch
        would see identical, highly correlated batches.
        """
        return DataLoader(train_dataset, batch_size=self.batch_size,
                          shuffle=True,
                          num_workers=2)

    def val_dataloader(self):
        """Validation batches, in dataset order."""
        return DataLoader(validation_dataset,
                          batch_size=self.batch_size,
                          num_workers=2)

    def configure_optimizers(self):
        """AdamW over all parameters with a fixed learning rate of 3e-4."""
        optimizer = AdamW(self.parameters(), lr=3e-4, eps=1e-8)
        return optimizer

# Fine-tune the model with Lightning.
model = T5Tuner(t5_model,t5_tokenizer)

trainer = pl.Trainer(max_epochs = 3,accelerator=device)

trainer.fit(model)

# saving the model
os.makedirs("t5_tokenizer",exist_ok=True)
os.makedirs("t5_trained_model",exist_ok=True)
model.model.save_pretrained('t5_trained_model')
t5_tokenizer.save_pretrained('t5_tokenizer')

# Reload the saved artifacts on the CPU for a quick smoke test of inference.
trained_model_path = 't5_trained_model'
trained_tokenizer = 't5_tokenizer'
device = 'cpu'

model = T5ForConditionalGeneration.from_pretrained(trained_model_path)
tokenizer = T5Tokenizer.from_pretrained(trained_tokenizer)

print("Training completed and model saved.")

# Demo (context, answer) pairs. Previously the first pair was overwritten by
# the second before it was ever encoded, so only one example was generated for.
demo_examples = [
    ("President Donald Trump said and predicted that some states would reopen this month.",
     "Donald Trump"),
    ("Since its topping out in 2013, One World Trade Center in New York City has been the tallest skyscraper in the United States.",
     "World Trade Center"),
]

model.eval()
for context, answer in demo_examples:
    text = "context: "+context + " " + "answer: " + answer
    print(text)

    # BatchEncoding.to(device) already moves every tensor it holds.
    encoding = tokenizer.encode_plus(text,max_length =512,padding='max_length',
                                     truncation = True,
                                     return_tensors="pt").to(device)
    print (encoding.keys())
    input_ids,attention_mask  = encoding["input_ids"], encoding["attention_mask"]

    beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=72, # How long the generated questions should be
        early_stopping=True,
        num_beams=5,
        num_return_sequences=2
    )

    for beam_output in beam_outputs:
        sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
        print(sent)



Overwriting train_model.py


In [16]:
! python train_model.py

Total Train Samples: 87599
Total Validation Samples (full): 10570
Total Validation Samples (subset): 1000
{'answers': {'answer_start': [177, 177, 177],
             'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']},
 'context': 'Super Bowl 50 was an American football game to determine the '
            'champion of the National Football League (NFL) for the 2015 '
            'season. The American Football Conference (AFC) champion Denver '
            'Broncos defeated the National Football Conference (NFC) champion '
            'Carolina Panthers 24–10 to earn their third Super Bowl title. The '
            "game was played on February 7, 2016, at Levi's Stadium in the San "
            'Francisco Bay Area at Santa Clara, California. As this was the '
            '50th Super Bowl, the league emphasized the "golden anniversary" '
            'with various gold-themed initiatives, as well as temporarily '
            'suspending the tradition of naming each Super Bowl ga

In [17]:
from tqdm import tqdm
import streamlit as st
import torch
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer
from load_dataset import sample_validation_dataset,train_dataset,valid_dataset

@st.cache_resource
def load_model_and_tokenizer():
    """Load the fine-tuned model and its tokenizer from the local save directories.

    Returns:
        ``(model, tokenizer, device)`` where ``device`` is ``'cuda'`` if PyTorch
        reports a GPU and ``'cpu'`` otherwise; the model is moved to that device.
    """
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    model = T5ForConditionalGeneration.from_pretrained('t5_trained_model')
    model = model.to(device)
    tokenizer = T5Tokenizer.from_pretrained('t5_tokenizer')
    return model, tokenizer, device

def get_question(context, answer, model, tokenizer, device):
    """Generate one question whose answer is ``answer`` given ``context``.

    Args:
        context: Passage the question should be about.
        answer: Answer span the generated question should lead to.
        model: Model exposing ``generate(input_ids=..., attention_mask=..., ...)``.
        tokenizer: Tokenizer exposing ``encode_plus`` and ``decode``.
        device: Device the encoded tensors are moved to.

    Returns:
        The top beam decoded to text, with a leading ``"question:"`` marker
        removed and surrounding whitespace stripped.
    """
    text = f"context: {context} answer: {answer}"
    encoding = tokenizer.encode_plus(text, max_length=512, padding='max_length',
                                     truncation=True, return_tensors="pt").to(device)

    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

    outs = model.generate(input_ids=input_ids,
                          attention_mask=attention_mask,
                          max_length=72,
                          num_beams=5,
                          early_stopping=True)

    question = tokenizer.decode(outs[0], skip_special_tokens=True).strip()

    # Strip only the leading marker. A blanket str.replace would also delete
    # the text "question:" wherever it appears inside the question itself.
    prefix = "question:"
    if question.startswith(prefix):
        question = question[len(prefix):].strip()
    return question

def calculate_bleu_scores(generated_questions, reference_questions):
    """Average sentence-level BLEU-1..4 over paired questions.

    Questions are tokenized on whitespace and every generated question is
    scored against its single reference using NLTK's ``method4`` smoothing.

    Args:
        generated_questions: Sequence of generated question strings.
        reference_questions: Sequence of reference question strings, aligned
            with ``generated_questions`` (pairs are formed with ``zip``).

    Returns:
        Dict with keys ``'BLEU-1'`` .. ``'BLEU-4'`` holding the mean score for
        each n-gram order. When there are no pairs every score is ``0.0``.
    """
    # Uniform weights over the first n n-gram orders. BLEU-3 uses exact thirds
    # so that the weights sum to 1 (0.33 * 3 does not).
    weight_schemes = {
        'BLEU-1': (1, 0, 0, 0),
        'BLEU-2': (0.5, 0.5, 0, 0),
        'BLEU-3': (1 / 3, 1 / 3, 1 / 3, 0),
        'BLEU-4': (0.25, 0.25, 0.25, 0.25),
    }

    smoothie = SmoothingFunction().method4

    totals = {name: 0.0 for name in weight_schemes}
    pair_count = 0

    for gen, ref in zip(generated_questions, reference_questions):
        reference = [ref.split()]
        candidate = gen.split()

        for name, weights in weight_schemes.items():
            totals[name] += sentence_bleu(reference, candidate, weights=weights,
                                          smoothing_function=smoothie)
        pair_count += 1

    if pair_count == 0:
        # Nothing to average; avoid dividing by zero.
        return {name: 0.0 for name in weight_schemes}

    return {name: total / pair_count for name, total in totals.items()}

# Bring up the fine-tuned checkpoint once for the whole evaluation run.
model, tokenizer, device = load_model_and_tokenizer()

# Parallel lists: the model's question and the dataset's question per sample.
generated_questions, reference_questions = [], []

for sample in tqdm(valid_dataset):
    context, reference_question = sample['context'], sample['question']
    answer = sample['answers']['text'][0]  # first listed answer only

    generated_question = get_question(context, answer, model, tokenizer, device)

    reference_questions.append(reference_question)
    generated_questions.append(generated_question)

# Score the generated questions against the references.
bleu_scores = calculate_bleu_scores(generated_questions, reference_questions)

print("BLEU Scores:")
for metric, score in bleu_scores.items():
    print(f"{metric}: {score:.4f}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 1000/1000 [03:48<00:00,  4.38it/s]


BLEU Scores:
BLEU-1: 0.2854
BLEU-2: 0.1900
BLEU-3: 0.1354
BLEU-4: 0.0986


In [18]:
%%writefile app.py
# app.py
# Self-contained Streamlit front end for the fine-tuned question generator.
# This file is executed by `streamlit run` in its own process, so everything it
# uses (imports and helper functions) has to be defined here.
import streamlit as st
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer


@st.cache_resource
def load_model_and_tokenizer():
    """Load the fine-tuned model and tokenizer from their local save directories.

    Returns ``(model, tokenizer, device)``; the model is moved to ``device``,
    which is ``'cuda'`` when PyTorch reports a GPU and ``'cpu'`` otherwise.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = T5ForConditionalGeneration.from_pretrained('t5_trained_model').to(device)
    tokenizer = T5Tokenizer.from_pretrained('t5_tokenizer')
    return model, tokenizer, device


def get_question(context, answer, model, tokenizer, device):
    """Generate one question whose answer is ``answer`` given ``context``."""
    text = f"context: {context} answer: {answer}"
    encoding = tokenizer.encode_plus(text, max_length=512, padding='max_length',
                                     truncation=True, return_tensors="pt").to(device)

    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

    outs = model.generate(input_ids=input_ids,
                          attention_mask=attention_mask,
                          max_length=72,
                          num_beams=5,
                          early_stopping=True)

    question = tokenizer.decode(outs[0], skip_special_tokens=True).strip()
    # Drop only the leading "question:" marker of the target format.
    prefix = "question:"
    if question.startswith(prefix):
        question = question[len(prefix):].strip()
    return question


# Streamlit UI
st.title("Question Generator")

model, tokenizer, device = load_model_and_tokenizer()

context = st.text_area("Enter the context:", "Donald Trump is an American media personality and businessman who served as the 45th president of the United States.")
answer = st.text_input("Enter the answer:", "Donald Trump")

if st.button("Generate Question"):
    with st.spinner("Generating question..."):
        question = get_question(context, answer, model, tokenizer, device)
    st.success("Question generated!")
    st.write("Generated Question:", question)

Overwriting app.py


In [19]:
# Print this machine's public IPv4 address as reported by icanhazip.com.
# NOTE(review): presumably needed to access the localtunnel URL started below — confirm.
! wget -q -O - ipv4.icanhazip.com

35.233.150.194


In [None]:
# Start the Streamlit app in the background (`&`) and open a localtunnel to port 8501.
! streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.233.150.194:8501[0m
[0m
[K[?25hnpx: installed 22 in 2.882s
your url is: https://slow-oranges-watch.loca.lt
