In [4]:
import json
import requests
import pandas as pd
import torch
import torch.optim as optim
import tensorflow
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from flask import Flask, request, jsonify
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config

# Download CBT
cbt_url = "https://s3.amazonaws.com/text-datasets/CBTest/data/cbtest_CN_train.txt"
cbt_data = requests.get(cbt_url).text.splitlines()

# Download SQuAD
squad_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
squad_data = json.loads(requests.get(squad_url).text)["data"]

# Process CBT
def process_cbt_data(cbt_data):
    passages = []
    questions = []
    answers = []

    passage = []
    for line in cbt_data:
        if line.startswith("_BOOK_TITLE_"):
            passage = []
        elif line.startswith("21 "):
            parts = line[3:].split("\t")
            questions.append(parts[0])
            answers.append(parts[2])
            passages.append(" ".join(passage))
        else:
            passage.append(line)
            if len(passage) > 20:
                passage.pop(0)

    return pd.DataFrame({"passage": passages, "question": questions, "answer": answers})


cbt_df = process_cbt_data(cbt_data)

# Process SQuAD
def process_squad_data(squad_data):
    passages = []
    questions = []
    answers = []

    for article in squad_data:
        for paragraph in article["paragraphs"]:
            passage = paragraph["context"]
            for qa in paragraph["qas"]:
                if not qa["is_impossible"]:
                    questions.append(qa["question"])
                    answers.append(qa["answers"][0]["text"])
                    passages.append(passage)

    return pd.DataFrame({"passage": passages, "question": questions, "answer": answers})

squad_df = process_squad_data(squad_data)

# Combine and split the datasets
combined_df = pd.concat([cbt_df, squad_df], ignore_index=True)
train_df, test_df = train_test_split(combined_df, test_size=0.2, random_state=42)

# Save the preprocessed data
train_df.to_csv("train_data.csv", index=False)
test_df.to_csv("test_data.csv", index=False)

# Phase 3: Model Selection
# model = T5ForConditionalGeneration.from_pretrained("t5-small")
# tokenizer = T5Tokenizer.from_pretrained("t5-small")
# Download the T5 config
config = T5Config.from_pretrained("t5-small")

# Load the TensorFlow model
tf_model = T5ForConditionalGeneration.from_pretrained("t5-small", from_tf=True, config=config)

# Save the PyTorch model
tf_model.save_pretrained("t5-small-pytorch")

# Load the PyTorch model
model = T5ForConditionalGeneration.from_pretrained("t5-small-pytorch")
tokenizer = T5Tokenizer.from_pretrained("t5-small",model_max_length=512)

# Phase 4: Model Training
def tokenize_data(df):
    inputs = tokenizer(df["question"].tolist(), return_tensors="pt", padding=True, truncation=True)
    targets = tokenizer(df["answer"].tolist(), return_tensors="pt", padding=True, truncation=True)
    return inputs, targets

train_inputs, train_targets = tokenize_data(train_df)
train_data = DataLoader(list(zip(train_inputs["input_ids"], train_targets["input_ids"])), batch_size=8, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=5e-5)

if torch.cuda.is_available():
    model = model.cuda()


model.train()
for epoch in range(3):
    for batch in train_data:
        optimizer.zero_grad()
        inputs, targets = batch
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            targets = targets.cuda()
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


# #This Runs on the cpu
# model.train()
# for epoch in range(3):
#     for batch in train_data:
#         optimizer.zero_grad()
#         inputs, targets = batch
#         outputs = model(input_ids=inputs, labels=targets)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

# Phase 5: Model Evaluation
test_inputs, test_targets = tokenize_data(test_df)
test_data = DataLoader(list(zip(test_inputs["input_ids"], test_targets["input_ids"])), batch_size=8)

#on Gpu
model.eval()
total_loss = 0
for batch in test_data:
    inputs, targets = batch
    if torch.cuda.is_available():
        inputs = inputs.cuda()
        targets = targets.cuda()
    with torch.no_grad():
        outputs = model(input_ids=inputs, labels=targets)
    total_loss += outputs.loss.item()

average_test_loss = total_loss / len(test_data)

#on cpu
# model.eval()
# total_loss = 0
# for batch in test_data:
#     inputs, targets = batch
#     with torch.no_grad():
#         outputs = model(input_ids=inputs, labels=targets)
#     total_loss += outputs.loss.item()

# average_test_loss = total_loss / len(test_data)




2023-04-08 13:11:13.710079: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 65798144 exceeds 10% of free system memory.
All TF 2.0 model weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.


In [6]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/')
def home():
    return "Hello, World!"

@app.route('/generate_questions', methods=['POST'])
def generate_questions():
    input_text = request.json['input_text']
    tokenized_input = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output_tokens = model.generate(**tokenized_input)
    generated_questions = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
    return jsonify(generated_questions)

if __name__ == '__main__':
    app.run()
#cpu Alone

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
[2023-04-08 14:22:32,232] ERROR in app: Exception on /generate_questions [POST]
Traceback (most recent call last):
  File "/home/adesoji/.local/lib/python3.8/site-packages/flask/app.py", line 2528, in wsgi_app
    response = self.full_dispatch_request()
  File "/home/adesoji/.local/lib/python3.8/site-packages/flask/app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/home/adesoji/.local/lib/python3.8/site-packages/flask/app.py", line 1823, in full_dispatch_request
    rv = self.dispatch_request()
  File "/home/adesoji/.local/lib/python3.8/site-packages/flask/app.py", line 1799, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "/tmp/ipykernel_739/2190046079.py", line 14, in generate_questions
    output_tokens = model.generate(**tokenized_input)
  File "/home/adesoji/.local/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 11

In [7]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/generate_questions', methods=['POST'])
def generate_questions():
    input_text = request.json['input_text']
    tokenized_input = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

    # Move input tensors to GPU
    tokenized_input = {key: tensor.to("cuda") for key, tensor in tokenized_input.items()}

    with torch.no_grad():
        output_tokens = model.generate(**tokenized_input)
    generated_questions = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
    return jsonify(generated_questions)


if __name__ == '__main__':
    app.run()



 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [08/Apr/2023 14:25:48] "GET / HTTP/1.1" 404 -
127.0.0.1 - - [08/Apr/2023 14:25:51] "GET / HTTP/1.1" 404 -
127.0.0.1 - - [08/Apr/2023 14:25:53] "GET / HTTP/1.1" 404 -
127.0.0.1 - - [08/Apr/2023 14:25:54] "GET / HTTP/1.1" 404 -
127.0.0.1 - - [08/Apr/2023 14:26:38] "GET /generate_questions HTTP/1.1" 405 -
127.0.0.1 - - [08/Apr/2023 14:27:57] "POST /generate_questions HTTP/1.1" 200 -
127.0.0.1 - - [08/Apr/2023 14:28:05] "POST /generate_questions HTTP/1.1" 200 -
127.0.0.1 - - [08/Apr/2023 14:28:28] "POST /generate_questions HTTP/1.1" 200 -
127.0.0.1 - - [08/Apr/2023 14:30:04] "POST /generate_questions HTTP/1.1" 200 -


In [None]:
curl -X POST -H "Content-Type: application/json" -d '{"input_text": "Children playing in a park on a sunny day"}' http://127.0.0.1:5000/generate_questions

In [8]:
import json
import requests
import pandas as pd
import torch
import torch.optim as optim
import tensorflow
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from flask import Flask, request, jsonify
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config

# # Load the local datasets
# cbt_df = pd.read_csv("CBTest_CN_train.csv")
# squad_df = pd.read_csv("SQuAD_train-v2.0.csv")

# # Assuming the CSV files have columns: 'passage', 'question', 'answer'
# # If not, you may need to rename the columns or adjust the following code accordingly

# # Combine and split the datasets
# combined_df = pd.concat([cbt_df, squad_df], ignore_index=True)
# train_df, test_df = train_test_split(combined_df, test_size=0.2, random_state=42)

# # Save the preprocessed data
# train_df.to_csv("train_data.csv", index=False)
# test_df.to_csv("test_data.csv", index=False)
train_df =pd.read_csv('/home/adesoji/targetdir/test_data.csv')
test_df = pd.read_csv('/home/adesoji/targetdir/test_data.csv')


# Phase 3: Model Selection
# model = T5ForConditionalGeneration.from_pretrained("t5-small")
# tokenizer = T5Tokenizer.from_pretrained("t5-small")
# Download the T5 config
config = T5Config.from_pretrained("t5-small")

# Load the TensorFlow model
tf_model = T5ForConditionalGeneration.from_pretrained("t5-small", from_tf=True, config=config)

# Save the PyTorch model
tf_model.save_pretrained("t5-small-pytorch")

# Load the PyTorch model
model = T5ForConditionalGeneration.from_pretrained("t5-small-pytorch")
tokenizer = T5Tokenizer.from_pretrained("t5-small",model_max_length=512)

# Phase 4: Model Training
def tokenize_data(df):
    question_prefix = "generate a question: "
    inputs = tokenizer([question_prefix + question for question in df["question"].tolist()], return_tensors="pt", padding=True, truncation=True)
    targets = tokenizer(df["answer"].tolist(), return_tensors="pt", padding=True, truncation=True)
    return inputs, targets


train_inputs, train_targets = tokenize_data(train_df)
train_data = DataLoader(list(zip(train_inputs["input_ids"], train_targets["input_ids"])), batch_size=8, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=5e-5)

if torch.cuda.is_available():
    model = model.cuda()


model.train()
for epoch in range(3):
    for batch in train_data:
        optimizer.zero_grad()
        inputs, targets = batch
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            targets = targets.cuda()
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


# #This Runs on the cpu
# model.train()
# for epoch in range(3):
#     for batch in train_data:
#         optimizer.zero_grad()
#         inputs, targets = batch
#         outputs = model(input_ids=inputs, labels=targets)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

# Phase 5: Model Evaluation
test_inputs, test_targets = tokenize_data(test_df)
test_data = DataLoader(list(zip(test_inputs["input_ids"], test_targets["input_ids"])), batch_size=8)

#on Gpu
model.eval()
total_loss = 0
for batch in test_data:
    inputs, targets = batch
    if torch.cuda.is_available():
        inputs = inputs.cuda()
        targets = targets.cuda()
    with torch.no_grad():
        outputs = model(input_ids=inputs, labels=targets)
    total_loss += outputs.loss.item()

average_test_loss = total_loss / len(test_data)

All TF 2.0 model weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.


In [9]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/generate_questions', methods=['POST'])
def generate_questions():
    input_text = request.json['input_text']
    tokenized_input = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

    # Move input tensors to GPU
    tokenized_input = {key: tensor.to("cuda") for key, tensor in tokenized_input.items()}

    with torch.no_grad():
        output_tokens = model.generate(**tokenized_input)
    generated_questions = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
    return jsonify(generated_questions)


if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [08/Apr/2023 15:05:20] "POST /generate_questions HTTP/1.1" 200 -
127.0.0.1 - - [08/Apr/2023 15:05:33] "POST /generate_questions HTTP/1.1" 200 -


In [1]:
import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from flask import Flask, request, jsonify

# Load the local datasets
train_df =pd.read_csv('QuestandAnswers/test_data.csv')
test_df = pd.read_csv('QuestandAnswers/test_data.csv')

# Model Selection and model loading
config = T5Config.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small-pytorch", config=config)
tokenizer = T5Tokenizer.from_pretrained("t5-small", model_max_length=512)

# Model Training
def tokenize_data(df):
    inputs = tokenizer(df["question"].tolist(), return_tensors="pt", padding=True, truncation=True)
    targets = tokenizer(df["answer"].tolist(), return_tensors="pt", padding=True, truncation=True)
    return inputs, targets

train_inputs, train_targets = tokenize_data(train_df)
train_data = DataLoader(list(zip(train_inputs["input_ids"], train_targets["input_ids"])), batch_size=8, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=5e-5)

if torch.cuda.is_available():
    model = model.cuda()

model.train()
for epoch in range(12):
    total_train_loss = 0
    for batch in train_data:
        optimizer.zero_grad()
        inputs, targets = batch
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            targets = targets.cuda()
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    average_train_loss = total_train_loss / len(train_data)
    print(f"Epoch {epoch+1}, Train Loss: {average_train_loss:.4f}")


# Model Evaluation
test_inputs, test_targets = tokenize_data(test_df)
test_data = DataLoader(list(zip(test_inputs["input_ids"], test_targets["input_ids"])), batch_size=8)

model.eval()
total_loss = 0
correct = 0
total = 0
for batch in test_data:
    inputs, targets = batch
    if torch.cuda.is_available():
        inputs = inputs.cuda()
        targets = targets.cuda()
    with torch.no_grad():
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        total_loss += loss.item()
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        correct += (predictions == targets).sum().item()
        total += targets.size(0)

average_test_loss = total_loss / len(test_data)
accuracy = correct / total
print(f"Validation Loss: {average_test_loss:.4f}, Accuracy: {accuracy:.4f}")

# Model Deployment
app = Flask(__name__)

@app.route('/generate_answer', methods=['POST'])
def generate_answer():
    input_data = request.json
    input_text = input_data.get('passage', '') + ' ' + input_data.get('question', '')
    tokenized_input = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    if torch.cuda.is_available():
        tokenized_input = {k: v.cuda() for k, v in tokenized_input.items()}
    with torch.no_grad():
        output_tokens = model.generate(**tokenized_input)
    generated_answer = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]
    return jsonify({"answer": generated_answer})

if __name__ == '__main__':
    app.run()



FileNotFoundError: [Errno 2] No such file or directory: '/home/adesoji/nltk_data/RAndom-Nlp-CV-projects/Q&A/test_data.csv'

In [None]:
curl -X POST -H "Content-Type: application/json" -d '{"passage": "Children were playing in a park on a sunny day.", "question": "Who is biden ?"}' http://127.0.0.1:5000/generate_answer

In [4]:
# # Model Deployment on cpu
# app = Flask(__name__)

# @app.route('/generate_answer', methods=['POST'])
# def generate_answer():
#     input_data = request.json
#     input_text = input_data.get('passage', '') + ' ' + input_data.get('question', '')
#     tokenized_input = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
#     with torch.no_grad():
#         output_tokens = model.generate(**tokenized_input)
#     generated_answer = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]
#     return jsonify({"answer": generated_answer})

# if __name__ == '__main__':
#     app.run()
#Model Deployment
app = Flask(__name__)

@app.route('/generate_answer', methods=['POST'])
def generate_answer():
    input_data = request.json
    input_text = input_data.get('passage', '') + ' ' + input_data.get('question', '')
    tokenized_input = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output_tokens = model.generate(**tokenized_input)
    generated_answer = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]
    return jsonify({"answer": generated_answer})

if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
[2023-04-08 18:42:31,181] ERROR in app: Exception on /generate_answer [POST]
Traceback (most recent call last):
  File "/home/adesoji/.local/lib/python3.8/site-packages/flask/app.py", line 2528, in wsgi_app
    response = self.full_dispatch_request()
  File "/home/adesoji/.local/lib/python3.8/site-packages/flask/app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/home/adesoji/.local/lib/python3.8/site-packages/flask/app.py", line 1823, in full_dispatch_request
    rv = self.dispatch_request()
  File "/home/adesoji/.local/lib/python3.8/site-packages/flask/app.py", line 1799, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "/tmp/ipykernel_36114/4266315263.py", line 25, in generate_answer
    output_tokens = model.generate(**tokenized_input)
  File "/home/adesoji/.local/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, i