# Imports

In [12]:
import nltk
nltk.download('punkt')
import pandas as pd

from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wrb20\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Helper Functions

# Initialize Models

In [13]:
# Checkpoint names live in one place so the tokenizer and the generator model
# are always loaded from the same checkpoint.
SEMANTIC_CHECKPOINT = 'all-MiniLM-L6-v2'
GENERATOR_CHECKPOINT = "voidful/context-only-question-generator"

# Sentence-embedding model used by semantic_comparison.
semantic_model = SentenceTransformer(SEMANTIC_CHECKPOINT)

# Seq2seq question generator and its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained(GENERATOR_CHECKPOINT)
generator_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATOR_CHECKPOINT)

In [14]:
def readable_print(text):
    """Print `text` with every period-plus-space turned into a period-plus-newline."""
    # Splitting on '. ' and re-joining with '.\n' swaps the space that follows
    # each period for a line break, leaving everything else untouched.
    sentences = text.split('. ')
    print('.\n'.join(sentences))


def generate_questions(model, tokenizer, context: str, num_questions: int = 4) -> list[str]:
    """
    Generate questions from a given context using the model.

    Args:
        model: Object exposing ``generate(input_ids, ...)`` that returns an
            iterable of output sequences.
        tokenizer: Callable that encodes ``context`` (its result must expose
            ``input_ids``) and that provides ``decode``.
        context: Passage to generate questions about.
        num_questions: Value passed to ``model.generate`` as
            ``num_return_sequences``.

    Returns:
        The decoded questions with exact duplicates removed, in the order the
        model returned them (so the list may be shorter than ``num_questions``).
    """
    # truncation=True asks the tokenizer to clip over-long passages instead of
    # handing the model more tokens than it accepts.
    input_ids = tokenizer(context, return_tensors="pt", truncation=True).input_ids
    outputs = model.generate(
        input_ids,
        max_new_tokens=50,
        num_return_sequences=num_questions,
    )

    decoded = (tokenizer.decode(output, skip_special_tokens=True) for output in outputs)

    # dict.fromkeys de-duplicates while keeping first-seen order; going through
    # a set would return the questions in an order that can change between runs.
    return list(dict.fromkeys(decoded))


def semantic_comparison(model, generated_questions: list[str], dataset_questions: list[str]) -> float:
    """
    Compare the generated questions with the dataset questions using the model based on semantic similarity.

    Each generated question is matched to its most similar dataset question
    (cosine similarity of the embeddings); the mean of those best-match
    similarities is returned.

    Args:
        model: Object exposing ``encode(list_of_str)`` that returns embeddings.
        generated_questions: Questions produced by the generator.
        dataset_questions: Reference questions to compare against.

    Returns:
        The mean best-match cosine similarity as a Python float, or ``0.0``
        when either list is empty.
    """
    # With nothing on one side there is no pair to score; return early rather
    # than reducing over an empty dimension below.
    if not generated_questions or not dataset_questions:
        return 0.0

    generated_embeddings = model.encode(generated_questions)
    dataset_embeddings = model.encode(dataset_questions)

    # Pairwise similarity matrix: one row per generated question,
    # one column per dataset question.
    semantic_similarities = util.pytorch_cos_sim(generated_embeddings, dataset_embeddings)

    # Best match per generated question (max over columns), then the average.
    return semantic_similarities.max(dim=1).values.mean().item()

    
def bleu_comparison(generated_questions: list[str], dataset_questions: list[str]) -> float:
    """
    Compare the generated questions with the dataset questions using BLEU score.

    Every generated question is scored with ``sentence_bleu`` against all
    dataset questions as references (lower-cased and word-tokenized), and the
    per-question scores are averaged.

    Args:
        generated_questions: Hypothesis questions produced by the generator.
        dataset_questions: Reference questions from the dataset.

    Returns:
        The average BLEU score over the generated questions, or ``0.0`` when
        either list is empty.
    """
    # No hypotheses would divide by zero below, and no references leaves
    # nothing to score against.
    if not generated_questions or not dataset_questions:
        return 0.0

    # The references are the same for every hypothesis, so tokenize them once.
    ref_questions = [word_tokenize(ref_q.lower()) for ref_q in dataset_questions]

    bleu_scores = []
    for gq in generated_questions:
        generated_question = word_tokenize(gq.lower())
        bleu_score = sentence_bleu(ref_questions, generated_question)

        # Scores below the threshold are recorded as exactly 0.
        bleu_scores.append(bleu_score if bleu_score >= .00001 else 0)

    return sum(bleu_scores) / len(bleu_scores)


# Get Questions

In [15]:
# Read the XQuAD English file and keep the entries of its "data" column
# as a plain Python list (one element per article).
articles = pd.read_json("../data/xquad.en.json")["data"].tolist()
articles

[{'paragraphs': [{'context': "The Panthers defense gave up just 308 points, ranking sixth in the league, while also leading the NFL in interceptions with 24 and boasting four Pro Bowl selections. Pro Bowl defensive tackle Kawann Short led the team in sacks with 11, while also forcing three fumbles and recovering two. Fellow lineman Mario Addison added 6½ sacks. The Panthers line also featured veteran defensive end Jared Allen, a 5-time pro bowler who was the NFL's active career sack leader with 136, along with defensive end Kony Ealy, who had 5 sacks in just 9 starts. Behind them, two of the Panthers three starting linebackers were also selected to play in the Pro Bowl: Thomas Davis and Luke Kuechly. Davis compiled 5½ sacks, four forced fumbles, and four interceptions, while Kuechly led the team in tackles (118) forced two fumbles, and intercepted four passes of his own. Carolina's secondary featured Pro Bowl safety Kurt Coleman, who led the team with a career high seven interceptions,

In [16]:
# Create a dict that maps each paragraph's context to the list of questions
# asked about that context.
cq_pairs = {
    paragraph["context"]: [qa["question"] for qa in paragraph["qas"]]
    for article in articles
    for paragraph in article["paragraphs"]
}

# Show the questions stored for the first context.
cq_pairs[next(iter(cq_pairs))]

['How many points did the Panthers defense surrender?',
 'How many career sacks did Jared Allen have?',
 'How many tackles did Luke Kuechly register?',
 'How many balls did Josh Norman intercept?',
 'Who registered the most sacks on the team this season?',
 'How many interceptions are the Panthers defense credited with in 2015?',
 'Who led the Panthers in sacks?',
 'How many Panthers defense players were selected for the Pro Bowl?',
 'How many forced fumbles did Thomas Davis have?',
 'Which player had the most interceptions for the season?',
 "How many 2015 season interceptions did the Panthers' defense get?",
 'Who had five sacks in nine games as a Carolina Panthers starter?',
 "Who was the Panthers' tackle leader for 2015?",
 'How many interceptions did Josh Norman score touchdowns with in 2015?']

In [17]:
# Number of distinct contexts collected in cq_pairs.
len(cq_pairs)

240

# Test Models for Question Generation

In [18]:
# Use the first context in cq_pairs as a sample passage and print it
# one sentence per line.
context = next(iter(cq_pairs))
readable_print(context)

The Panthers defense gave up just 308 points, ranking sixth in the league, while also leading the NFL in interceptions with 24 and boasting four Pro Bowl selections.
Pro Bowl defensive tackle Kawann Short led the team in sacks with 11, while also forcing three fumbles and recovering two.
Fellow lineman Mario Addison added 6½ sacks.
The Panthers line also featured veteran defensive end Jared Allen, a 5-time pro bowler who was the NFL's active career sack leader with 136, along with defensive end Kony Ealy, who had 5 sacks in just 9 starts.
Behind them, two of the Panthers three starting linebackers were also selected to play in the Pro Bowl: Thomas Davis and Luke Kuechly.
Davis compiled 5½ sacks, four forced fumbles, and four interceptions, while Kuechly led the team in tackles (118) forced two fumbles, and intercepted four passes of his own.
Carolina's secondary featured Pro Bowl safety Kurt Coleman, who led the team with a career high seven interceptions, while also racking up 88 tack

In [19]:
# Generate questions for the sample context (default num_questions) and display them.
questions = generate_questions(generator_model, tokenizer, context)
questions

['How many more sacks did Josh Norman have than Thomas Davis and Luke Kuechly combined?',
 'How many sacks did Josh Norman, Thomas Davis, and Luke Kuechly have combined?',
 'How many more sacks did Josh Norman have compared to Thomas Davis and Luke Kuechly?',
 'How many more sacks did Josh Norman have than Thomas Davis?']

In [20]:
# Average BLEU of the generated questions against the dataset questions for the sample context.
bleu_comparison(questions, cq_pairs[context])

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.0

In [21]:
# Semantic-similarity score for the sample context; the value is stored but not displayed here.
similarities = semantic_comparison(semantic_model, questions, cq_pairs[context])

In [25]:
# Score every context: generate questions for it, then compare them with the
# dataset's questions using both metrics.
SCORE_COLUMNS = ["context", "dataset_q", "generated_q", "semantic_score", "bleu_score"]
score_records = []

# A dedicated loop variable is used so the sample `context` chosen in an
# earlier cell is not overwritten by this loop.
for context_text, dataset_q in cq_pairs.items():
    generated_q = generate_questions(generator_model, tokenizer, context_text)
    score_records.append({
        "context": context_text,
        "dataset_q": dataset_q,
        "generated_q": generated_q,
        "semantic_score": semantic_comparison(semantic_model, generated_q, dataset_q),
        "bleu_score": bleu_comparison(generated_q, dataset_q),
    })

# Build the frame once from the collected rows instead of concatenating one
# single-row frame per context; this also yields a clean 0..n-1 index.
score_df = pd.DataFrame(score_records, columns=SCORE_COLUMNS)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [28]:
# Renumber the rows 0..n-1. The result is assigned rather than mutated in
# place, so re-running this cell always yields the same frame.
score_df = score_df.reset_index(drop=True)
score_df

Unnamed: 0,context,dataset_q,generated_q,semantic_score,bleu_score
0,"The Panthers defense gave up just 308 points, ...",[How many points did the Panthers defense surr...,[How many more sacks did Josh Norman have than...,tensor(0.7648),0.000000
1,The Broncos defeated the Pittsburgh Steelers i...,[Who lost to the Broncos in the divisional rou...,"[How many points did the Broncos win by?, How ...",tensor(0.8442),0.540681
2,Peyton Manning became the first quarterback ev...,[How old was Peyton Manning when he played in ...,"[How old is Manning?, How old is Peyton Mannin...",tensor(0.9298),0.447514
3,Six-time Grammy winner and Academy Award nomin...,"[How many Grammys has Lady Gaga won?, What did...",[Which actress provided American Sign Language...,tensor(0.5931),0.000000
4,"With 4:51 left in regulation, Carolina got the...",[On what yard line did Carolina begin with 4:5...,[How many points did Denver lead by at halftim...,tensor(0.6866),0.081167
...,...,...,...,...,...
235,Aristotle provided a philosophical discussion ...,[Who provided a philosophical discussion of fo...,[How many elements did Aristotle think the ter...,tensor(0.9364),0.554980
236,The development of fundamental theories for fo...,[Who formed the universal theory of gravitatio...,[What is the least popular approach to answeri...,tensor(0.3990),0.000000
237,"Since then, and so far, general relativity has...","[What theory best explains gravity?, What spac...",[Which direction is the ballistic path of a ba...,tensor(0.5262),0.000000
238,Through combining the definition of electric c...,[What is the time rate of change of electric ...,[What is the combination of electric current a...,tensor(0.5869),0.000000


In [33]:
# Summary statistics for the numeric score columns.
score_df.describe()

Unnamed: 0,semantic_score,bleu_score
count,240.0,240.0
mean,0.661518,0.087652
std,0.164102,0.140521
min,0.172758,0.0
25%,0.551521,0.0
50%,0.684097,0.0
75%,0.784199,0.144642
max,0.976172,0.596097


In [39]:
# Select the row with the highest 'semantic_score' and print its context.
# NOTE(review): the variable is named best_bleu but the row is chosen by
# 'semantic_score', not 'bleu_score' — confirm which metric was intended.
best_bleu = score_df.loc[score_df['semantic_score'].idxmax()]
readable_print(best_bleu['context'])

The first Methodist clergy were ordained by John Wesley, a priest of the Church of England, because of the crisis caused by the American Revolution which isolated the Methodists in the States from the Church of England and its sacraments.
Today, the clergy includes men and women who are ordained by bishops as elders and deacons and are appointed to various ministries.
Elders in the United Methodist Church itenerate and are subject to the authority and appointment of their bishops.
They generally serve as pastors in local congregations.
Deacons are in service ministry and may serve as musicians, liturgists, educators, business administrators, and a number of other areas.
Elders and deacons are required to obtain a master's degree (generally an M.Div.), or another equivalent degree, before commissioning and then ultimately ordination.
Elders in full connection are each a member of their Annual Conference Order of Elders.
Likewise each deacon in full connection is a member of their Annual

In [40]:
# Generated questions for the selected row.
best_bleu['generated_q']

['Who were the first Methodist clergy?',
 'Who was the first Methodist clergy ordained?',
 'Who ordained the first Methodist clergy?',
 'Who was the first Methodist clergy ordained by?']

In [41]:
# Dataset questions for the selected row.
best_bleu['dataset_q']

['Who ordained the first Methodist clergy?',
 'Clergy usually serve as what in local congregations?',
 'Elders in full connection are each a member of what?',
 'Each deacon in full connection is a member of what?']