#Load SHP Dataset

In [None]:
# Standard library
import random

# Third-party
import numpy as np
import pandas as pd

# Colab helpers
from google.colab import drive, files

# Attach Google Drive so later cells can read and write CSV files under it.
drive.mount('/content/drive')

In [None]:
# Install the Hugging Face `datasets` package into the notebook runtime (shell escape).
!pip install datasets
from datasets import load_dataset
# Load the "stanfordnlp/shp" dataset; later cells read its 'train' split.
dataset = load_dataset("stanfordnlp/shp")

In [None]:
# Checking domains
# Read only the 'domain' column; iterating the split row by row would decode
# every field of every example just to pick out one value.
domains = list(dataset['train']['domain'])
# Getting unique values
unique_domains = set(domains)
for domain in unique_domains:
  print(domain)

In [None]:
# Display the column names available in the train split.
dataset['train'].column_names

In [None]:
# Convert dataset to df
data_df = pd.DataFrame(dataset['train'])

# Build df from 50 randomly sampled rows per domain, keeping only the columns
# the later cells use.
kept_columns = ['domain', 'history', 'human_ref_A']
rows_per_domain = 50

# Collect one sample per domain and concatenate once. Growing a frame with
# pd.concat inside the loop re-copies all earlier rows on every pass, and
# seeding it with an empty frame relies on concat behaviour pandas has deprecated.
domain_samples = [
    data_df.loc[data_df['domain'] == domain, kept_columns].sample(n=rows_per_domain, replace=False)
    for domain in data_df['domain'].unique()
]
df = pd.concat(domain_samples, ignore_index=True)

In [None]:
# Preview the first rows of the sampled frame.
df.head()

In [None]:
# Row count divided by 18; the divisor is hard-coded (presumably the number of domains — confirm).
df.shape[0]/18

In [None]:
# Persist the sampled rows to Google Drive; the scoring section reads this same path.
file_path = '/content/drive/My Drive/Dynamic_gen_input.csv'
df.to_csv(file_path, index=False)

#Get Reward Scores

In [None]:
# Reload the sampled rows from the CSV on Google Drive and preview them.
file_path = '/content/drive/My Drive/Dynamic_gen_input.csv'
df = pd.read_csv(file_path)
df.head()

In [None]:
import torch

# Stop here when the runtime exposes no CUDA device.
assert torch.cuda.is_available()

# Handle for the CUDA device, kept in `device`.
device = torch.device("cuda")

# Report which GPU was found and how many are visible.
n_gpu = torch.cuda.device_count()
device_name = torch.cuda.get_device_name()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hub identifier shared by the reward model and its tokenizer.
reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2"

# Load the sequence-classification model first, then the matching tokenizer.
rank_model = AutoModelForSequenceClassification.from_pretrained(reward_name)
tokenizer = AutoTokenizer.from_pretrained(reward_name)

In [None]:
# Score every (question, answer) pair with the reward model and store the raw
# logit in df['score'].
for index, row in df.iterrows():
    print("Scoring sentence: ", index)
    # The reply text lives in 'human_ref_A'; the frame has no 'answer' column,
    # so reading row['answer'] raised KeyError.
    question, answer = str(row['history']), str(row['human_ref_A'])
    inputs = tokenizer(question, answer, return_tensors='pt')
    # Inference only: skip autograd bookkeeping while running the model.
    with torch.no_grad():
        score = rank_model(**inputs).logits[0].cpu().detach()
    score = score.item()
    df.at[index, 'score'] = score

In [None]:
def compute_reward(question, answer):
    """Return the reward model's score for `answer` as a reply to `question`.

    Tokenizes the pair with the module-level `tokenizer`, runs `rank_model`
    without gradient tracking and returns the first logit as a Python float.
    """
    inputs = tokenizer(str(question), str(answer), return_tensors='pt')
    with torch.no_grad():
        return rank_model(**inputs).logits[0].cpu().item()


# Single-pair sanity check of the reward model.
Question = "Where does the sun rise?"
answer = "The Sun rises in the east."
output = compute_reward(Question, answer)
output

In [None]:
# Z-score standardization of the raw reward scores: subtract the column mean,
# then divide by the column standard deviation.
# Alternate: min-max normalization
score_mean = df['score'].mean()
score_std = df['score'].std()
df['z_normalized_reward'] = (df['score'] - score_mean) / score_std

In [None]:
# Inspect the standardized column: mean, then maximum, then minimum.
df['z_normalized_reward'].mean()

In [None]:
df['z_normalized_reward'].max()

In [None]:
df['z_normalized_reward'].min()

In [None]:
# Write the frame, now including 'score' and 'z_normalized_reward', back to file_path.
df.to_csv(file_path, index=False)

#Dynamic Question Generation

In [None]:
# Standard library
import random

# Third-party
import numpy as np
import pandas as pd

# Colab helpers
from google.colab import drive, files

# Attach Google Drive so the CSV paths under it are reachable.
drive.mount('/content/drive')

In [None]:
# file_path: CSV of scored examples read below; output_path: CSV path for the generated questions.
file_path = '/content/drive/My Drive/Dynamic_gen_input.csv'
output_path = "/content/drive/My Drive/Dynamic_gen_output.csv"
# Empty frame that collects the parsed (Category, Question) pairs.
output_df = pd.DataFrame(columns=['Category', 'Question'])
df = pd.read_csv(file_path)
df.head()

In [None]:
# Gemini input frame: 8 rows drawn at random (without replacement) from each
# 'domain' group of df, stacked into one frame with a fresh index.
per_domain_samples = [
    domain_rows.sample(n=8, replace=False)
    for _, domain_rows in df.groupby('domain')
]
Gemini_input_df = pd.concat(per_domain_samples, ignore_index=True)

In [None]:
# Build one string from the sampled examples (8 rows per domain, i.e. the 144
# examples the prompt announces when there are 18 domains) - goes as input to
# the Gemini model.
Gemini_input_df['z_normalized_reward'] = Gemini_input_df['z_normalized_reward'].astype(str)
# Format each row and join once instead of growing the string with `+=`.
# f-strings also accept non-string cells (e.g. NaN read back from an empty CSV
# field), which made the `+` concatenation raise TypeError.
Gemini_input_string = "".join(
    f"Domain: {row['domain']}"
    f". Question: {row['history']}"
    f" Answer: {row['human_ref_A']}"
    f" Helpfulness Score for the Answer: {row['z_normalized_reward']}. "
    for _, row in Gemini_input_df.iterrows()
)
Gemini_input_string

In [None]:
#Imports
import pathlib
import re
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Render `text` as a Markdown blockquote for notebook display.

  Each '•' bullet character becomes '  *', then every line (blank ones
  included) is prefixed with '> ' and the result is wrapped in IPython's
  Markdown object.
  """
  def _every_line(_line):
    return True

  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=_every_line)
  return Markdown(quoted)

#Import to store API on Colab
from google.colab import userdata

In [None]:
# Read the API key from Colab's 'secrets' store and configure the genai client with it.
# NOTE(review): the secret is looked up as 'GOOGL_API_KEY' (no 'E') — confirm this
# matches the name under which the key is saved.
GOOGLE_API_KEY = userdata.get('GOOGL_API_KEY')
genai.configure(api_key = GOOGLE_API_KEY)

In [None]:
# Create a client for the 'gemini-1.5-pro-latest' model.
model = genai.GenerativeModel('gemini-1.5-pro-latest')

In [None]:
# Smoke test: send a trivial question and render the reply as Markdown.
response = model.generate_content("Where does the sun rise?")
to_markdown(response.text) #Prints output.

In [None]:
# Instruction template sent to Gemini; the formatted examples are appended after it.
# It asks for a numbered list of 20 items, each "(category) question", which is
# the layout the parsing cell's regular expressions expect.
# NOTE(review): the sentence "Each item should first the category name in
# paranthesis" is missing a verb and misspells "parentheses" — left unchanged
# here because editing the prompt changes what the model receives.
prompt = """You are a writing expert. I will give you a set of 144 questions and answers along with the domain of the question, and the helpfulness score for the answer with respect to the question.
A higher score means that the answer is helpful with respect to the question.
Go through all the  examples and identify the domains in which the answers have received a poor score.
Also, among the questions which have received a poor score, check for common patterns. For example, all these questions might be fact-related, or might use some proper noun in them.
Similarly, check for common patterns in the domains which have received a high score.
Your task is to come up with a set of 20 questions, primarily focused on domains that have received a low score. Make sure that you use common patterns from the high performing questions while framing these questions.
Print exactly a numbered list of 20 items. Each item should first the category name in paranthesis followed by the question you have generated.
Here are the questions along with the corresponding domains, answers, and helpfulness scores.
"""


In [None]:
# Send the instruction template followed by the formatted examples.
# The combined text gets its own variable so `prompt` stays the bare template:
# overwriting `prompt` made every re-run of this cell append the examples again.
full_prompt = prompt + Gemini_input_string
response = model.generate_content(full_prompt)
to_markdown(response.text) #Prints output.

In [None]:
# Disabled experiment kept as a bare string literal (it is not executed):
# it would call the model 20 times in a row with the same prompt.
"""
for i in range(20):
  response = model.generate_content(prompt)
  print(i)
"""

In [None]:
# Raw text of the latest Gemini response, kept for parsing.
Gem_out = response.text
Gem_out

In [None]:
# Drop any preamble: keep the text from the first "1." onwards.
Gem_out_cleaned = re.sub(r'^.*?1\.', '1.', Gem_out, flags=re.DOTALL)

# Cut the text at every "<number>. (" marker; each piece then reads
# "category) question".
questions = re.split(r'\d+\.\s*\(', Gem_out_cleaned)

# The piece before the first marker is empty when the list starts right away.
if questions[0] == '':
    questions.pop(0)

# Turn each "category) question" piece into a row and add all rows to
# output_df with a single concat; pieces that do not match are skipped.
parsed_pieces = (re.match(r'(.*?)\)\s*(.*)', piece) for piece in questions)
new_rows = [
    {'Category': found.group(1).strip(), 'Question': found.group(2).strip()}
    for found in parsed_pieces
    if found
]
if new_rows:
    output_df = pd.concat([output_df, pd.DataFrame(new_rows)], ignore_index=True)




In [None]:
# Show the last column of the first parsed row.
# One positional lookup replaces `iloc[0][-1]`, which indexed a label-indexed
# Series by position — a form pandas has deprecated.
output_df.iloc[0, -1]

# Loop

In [None]:
# Standard library
import random

# Third-party
import numpy as np
import pandas as pd

# Colab helpers
from google.colab import drive, files

# Attach Google Drive so the CSV paths under it are reachable.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Resume from disk: load the questions stored at output_path and the scored
# examples stored at file_path, then preview the examples.
file_path = '/content/drive/My Drive/Dynamic_gen_input.csv'
output_path = "/content/drive/My Drive/Dynamic_gen_output.csv"
output_df = pd.read_csv(output_path)
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,domain,history,human_ref_A,reward score,z_normalized_reward
0,askacademia_train,Working during maternity leave in academia: Wh...,Unfortunately academia is relentless. PIs see ...,0.982035,0.740292
1,askacademia_train,The 'Other' College Scandal: Grade Inflation H...,"Alternative suggestion, get rid of grades and ...",0.955293,0.521819
2,askacademia_train,Access denied to the last version of a paper w...,How are you supposed to address their concerns...,0.771383,-0.98061
3,askacademia_train,How do you deal with presentation anxiety. Hel...,Gather some folks and present in front of them...,0.913921,0.183842
4,askacademia_train,"""....managers should always make love to their...","I mean, yeah. That is why we hire the hot ones",0.65191,-1.956629


In [None]:
#Imports
import pathlib
import re
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Render `text` as a Markdown blockquote for notebook display.

  Each '•' bullet character becomes '  *', then every line (blank ones
  included) is prefixed with '> ' and the result is wrapped in IPython's
  Markdown object.
  """
  def _every_line(_line):
    return True

  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=_every_line)
  return Markdown(quoted)

#Import to store API on Colab
from google.colab import userdata


In [None]:
# Read the API key from Colab's 'secrets' store, configure the genai client and
# create a client for the 'gemini-1.5-pro-latest' model.
# NOTE(review): the secret is looked up as 'GOOGL_API_KEY' (no 'E') — confirm this
# matches the name under which the key is saved.
GOOGLE_API_KEY = userdata.get('GOOGL_API_KEY')
genai.configure(api_key = GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-1.5-pro-latest')

In [None]:
# Smoke test: send a trivial question and render the reply as Markdown.
response = model.generate_content("Where does the sun rise?")
to_markdown(response.text) #Prints output.

> The sun always rises in the **East**. 


In [None]:
# Instruction template sent to Gemini; the generation loop appends the formatted
# examples after it. It asks for a numbered list of 20 items, each
# "(category) question", which is the layout the loop's regular expressions expect.
# NOTE(review): the sentence "Each item should first the category name in
# paranthesis" is missing a verb and misspells "parentheses" — left unchanged
# here because editing the prompt changes what the model receives.
prompt = """You are a writing expert. I will give you a set of 144 questions and answers along with the domain of the question, and the helpfulness score for the answer with respect to the question.
A higher score means that the answer is helpful with respect to the question.
Go through all the  examples and identify the domains in which the answers have received a poor score.
Also, among the questions which have received a poor score, check for common patterns. For example, all these questions might be fact-related, or might use some proper noun in them.
Similarly, check for common patterns in the domains which have received a high score.
Your task is to come up with a set of 20 questions, primarily focused on domains that have received a low score. Make sure that you use common patterns from the high performing questions while framing these questions.
Print exactly a numbered list of 20 items. Each item should first the category name in paranthesis followed by the question you have generated.
Here are the questions along with the corresponding domains, answers, and helpfulness scores.
"""
# Iteration counter printed by the generation loop.
i = 1

In [None]:
# Keep asking Gemini for new questions until output_df holds at least 2020 rows.
# Each pass samples fresh examples, sends template + examples, and parses the
# numbered "(category) question" list out of the reply.

# Compile the parsing patterns once instead of on every pass.
first_item_re = re.compile(r'^.*?1\.', flags=re.DOTALL)
item_split_re = re.compile(r'\d+\.\s*\(')
category_question_re = re.compile(r'(.*?)\)\s*(.*)')

while len(output_df) < 2020:
    # New dataframe - 8 random rows from each domain group.
    Gemini_input_df = pd.concat(
        [group.sample(n=8, replace=False) for _, group in df.groupby('domain')],
        ignore_index=True,
    )
    Gemini_input_df['z_normalized_reward'] = Gemini_input_df['z_normalized_reward'].astype(str)
    # Format each row and join once; f-strings also accept non-string cells.
    Gemini_input_string = "".join(
        f"Domain: {row['domain']}"
        f". Question: {row['history']}"
        f" Answer: {row['human_ref_A']}"
        f" Helpfulness Score for the Answer: {row['z_normalized_reward']}. "
        for _, row in Gemini_input_df.iterrows()
    )

    # Build the request from the untouched template. Assigning the result back
    # to `prompt` made every pass carry the examples of all earlier passes, so
    # the request kept growing and no longer held the announced 144 examples.
    full_prompt = prompt + Gemini_input_string

    response = model.generate_content(full_prompt)
    # NOTE(review): presumably `response.text` can raise when the reply has no
    # usable text (e.g. a blocked response) — confirm and decide how to handle it.
    Gem_out = response.text
    Gem_out = Gem_out.replace("\n", "")
    # Remove everything before the first question
    Gem_out_cleaned = first_item_re.sub('1.', Gem_out)

    # Splitting the string into a list of questions
    questions = item_split_re.split(Gem_out_cleaned)

    # Removing the first empty element if present
    if questions[0] == '':
        questions.pop(0)

    # Parse each piece into category and question text, then add all rows of
    # this pass with a single concat.
    # NOTE(review): 'duestion' looks like a typo for 'question'; it is kept
    # byte-identical because the header of the CSV behind output_df is not
    # visible here — confirm which column names that file uses.
    new_rows = []
    for question in questions:
        match = category_question_re.match(question)
        if match:
            new_rows.append({'domain': match.group(1).strip(),
                             'duestion': match.group(2).strip()})
    if new_rows:
        output_df = pd.concat([output_df, pd.DataFrame(new_rows)], ignore_index=True)
    print(i)
    i = i+1

In [None]:
# output_df holds the rows loaded from output_path plus the newly generated
# ones, so rewrite the file. Appending the whole frame (mode='a') duplicated
# every previously saved row on each run. The header is written so the first
# line read_csv consumed as column names is put back.
output_df.to_csv(output_path, index=False)