In [1]:
import json
import pandas as pd
import re

In [2]:
# Opening main Qulac JSON file
with open('qulac.json') as json_file:
    data = json.load(json_file)

    print("Type:", type(data))

Type: <class 'dict'>


In [176]:
# Open Qulac history
with open('qulac_hist012_dict.json') as json_file:
    history = json.load(json_file)

In [105]:
def process_qulac_for_wiki(data_df):
    """
    Process Qulac main dataset into a dictionary grouped by topic ('user queries') and lists of associated facets ('user intent').
    
    {topic:[
        facet,
        facet,
        ...
        ],
    topic:[
        facet,
        facet,
        ...
        ],
    ...
    }
    """
    structured_data = {}

    for _, row in data_df.iterrows():
        topic = row["topic"]
        facet_desc = row["facet_desc"]

        # Preprocess facet desc
        facet_desc = facet_desc.lower()
        facet_desc = facet_desc.strip()

        # Initialise topic if doesn't exist
        if topic not in structured_data:
            structured_data[topic] = []

        # Add facet if doesn't exist
        if facet_desc not in structured_data[topic]:
            structured_data[topic].append(facet_desc)

    return structured_data

In [106]:
# Process qulac main dataset for assisting to retrieve wiki summaries
qulac_for_wiki = process_qulac_for_wiki(data_df)

In [107]:
# Save dictionary to JSON
with open("qulac_for_wiki.json", "w", encoding="utf-8") as f:
    json.dump(qulac_for_wiki, f, indent=2)

In [108]:
# View contents of dictionary
print(json.dumps(qulac_for_wiki, indent=2))

{
  "obama family tree": [
    "find the time magazine photo essay \\\"barack obama\\'s family tree\\\".",
    "where did barack obama\\'s parents and grandparents come from?",
    "find biographical information on barack obama\\'s mother."
  ],
  "cheap internet": [
    "what are some low-cost broadband internet providers?",
    "do any internet providers still sell dial-up?",
    "who can provide inexpensive digital cable television bundled with internet service?",
    "find me some providers of free wireless internet access.",
    "i want to find cheap dsl providers.",
    "is there a way to get internet access without phone service?"
  ],
  "ritz carlton lake las vegas": [
    "find information about the ritz carlton resort at lake las vegas.",
    "find a site where i can determine room price and availability.",
    "find directions to the ritz carlton lake las vegas.",
    "find reviews of the ritz carlton lake las vegas."
  ],
  "fickle creek farm": [
    "find general informati

In [125]:
def collect_sbert_training_data(history_dict):
    """
    Collects context-question pairs from Qulac history for SBERT fine-tuning.
    - Uses the pre-built history_list to build context.
    - Removes duplicate (context, question) pairs.

    Returns: DataFrame with 'context' and 'question' columns.
    """
    data = []
    for record in history_dict.values():
        query = record["query"]
        history_list = record["history_list"]
        latest_question = record["question"]

        # Convert history_list into formatted context string
        context = query
        for entry in history_list:
            question = entry["question"]
            answer = entry["answer"]

            # Update context for the next round by adding Q/A
            context += f" [Q] {question} [A] {answer}"
        data.append((context, latest_question))
        
    # Convert set to DataFrame
    sbert_training_data = pd.DataFrame(data, columns=["context", "question"])

    # Drop duplicates
    sbert_training_data.drop_duplicates(ignore_index=True, inplace=True)
    
    return sbert_training_data

In [127]:
training_data = collect_sbert_training_data(history)

In [131]:
# Save training data to CSV
training_data.to_csv("sbert_training_data.csv", index=False)

In [None]:
len(training_data)

In [186]:
training_data[922305:922308]

Unnamed: 0,context,question


In [164]:
data = []
    
for record_id, record in history.items():
    history_id = record["history_id"]
    query = record["query"]
    question = record["question"]
    answer = record["answer"]
    history_list = record["history_list"]
    
    # Convert history_list to a formatted string
    formatted_history = " ".join([f"[Q] {h['question']} [A] {h['answer']}" for h in history_list])

    data.append((history_id, query, question, answer, formatted_history))

# Convert list of records into DataFrame
hist_df = pd.DataFrame(data, columns=["history_id", "query", "question", "answer", "history_list"])
    

In [170]:
hist_df[2000:2050]

Unnamed: 0,history_id,query,question,answer,history_list
2000,35-5,hoboken,are you looking for information about the hobo...,no i am interested in real estate listings in ...,
2001,35-6,hoboken,are you looking for information about the hobo...,no i want to find a streetlevel map of hoboken,
2002,35-1,hoboken,do you want a hotel in hoboken,no a different kind of business establishment ...,
2003,35-2,hoboken,do you want a hotel in hoboken,no i am looking for general information about ...,
2004,35-3,hoboken,do you want a hotel in hoboken,i want to know more about hobokens history,
2005,35-4,hoboken,do you want a hotel in hoboken,no i would like information on bars and nightc...,
2006,35-5,hoboken,do you want a hotel in hoboken,no estate listings,
2007,35-6,hoboken,do you want a hotel in hoboken,no i am looking for a walking map of hoboken n...,
2008,35-1,hoboken,do you want directions to hoboken,no just give me a listing of hoboken restaurants,
2009,35-2,hoboken,do you want directions to hoboken,no im looking for information about the websit...,


In [182]:
hist_df[hist_df['history_list'].notna()]

Unnamed: 0,history_id,query,question,answer,history_list
0,1-1,obama family tree,are you interested in seeing barack obamas family,yes am interested in obamas family,
1,1-2,obama family tree,are you interested in seeing barack obamas family,no i want to know where obamas parents and gra...,
2,1-3,obama family tree,are you interested in seeing barack obamas family,yes please find me information on barack obama...,
3,1-1,obama family tree,are you looking for biological information on ...,no i need the specific time magazine article t...,
4,1-2,obama family tree,are you looking for biological information on ...,yes specifically information about his parents...,
...,...,...,...,...,...
922305,200-4-15-17,ontario california airport,would you like to hear nes about ontario calif...,no just the address,[Q] do you need directions to ontario californ...
922306,200-1-16-17,ontario california airport,would you like to hear nes about ontario calif...,no i am seeking flight information for the air...,[Q] would you like to book a flight at ontario...
922307,200-2-16-17,ontario california airport,would you like to hear nes about ontario calif...,no i am interested in hotels located near the ...,[Q] would you like to book a flight at ontario...
922308,200-3-16-17,ontario california airport,would you like to hear nes about ontario calif...,no,[Q] would you like to book a flight at ontario...


In [None]:
print(history.keys())

In [12]:
# Inspect Qulac
data_df = pd.DataFrame.from_dict(data)
# print(f"data:\n{data_df.head()}")
print(f"data cols: {data_df.columns}")
# print(f"data:\n{data_df.iloc[50:70]}\n")
# print(f"data:\n{data_df[data_df['question'] == data_df.iloc[50]['question']]}")
print(f"data:\n{data_df[data_df['topic_type'] == 'ambiguous'].iloc[276]}")

data cols: Index(['topic_id', 'facet_id', 'topic_facet_id', 'topic_facet_question_id',
       'topic', 'topic_type', 'facet_type', 'topic_desc', 'facet_desc',
       'question', 'answer'],
      dtype='object')
data:
topic_id                                                                 142
facet_id                                                                   1
topic_facet_id                                                         142-1
topic_facet_question_id                                              142-1-5
topic                                                     illinois state tax
topic_type                                                         ambiguous
facet_type                                                               inf
topic_desc                 information about the sales tax in Illinois: w...
facet_desc                 information about the sales tax in Illinois: w...
question                   do you want to know what the illinois state ta...
answer       

In [10]:
b = data_df[100:102]

NameError: name 'data_df' is not defined

In [60]:
cheap = data_df[data_df['topic'] == 'cheap internet']
# cheap['facet_desc'].unique()
# cheap_facet = cheap[cheap['facet_desc'] == 'What are some low-cost broadband internet providers?']
# cheap_facet['answer'].unique()
# len(cheap_facet)

In [72]:
cheap[cheap['question'] == "are you wondering who used cheap internet"]

Unnamed: 0,topic_id,facet_id,topic_facet_id,topic_facet_question_id,topic,topic_type,facet_type,topic_desc,facet_desc,question,answer
40,10,1,10-1,10-1-10,cheap internet,faceted,inf,I\'m looking for cheap (i.e. low-cost) interne...,What are some low-cost broadband internet prov...,are you wondering who used cheap internet,no i want to know the lowcost broadband isps
55,10,2,10-2,10-2-10,cheap internet,faceted,inf,I\'m looking for cheap (i.e. low-cost) interne...,Do any internet providers still sell dial-up?,are you wondering who used cheap internet,no can i still get dialup access
70,10,3,10-3,10-3-10,cheap internet,faceted,inf,I\'m looking for cheap (i.e. low-cost) interne...,Who can provide inexpensive digital cable tele...,are you wondering who used cheap internet,no i am interested in which companies offer ch...
85,10,5,10-5,10-5-10,cheap internet,faceted,inf,I\'m looking for cheap (i.e. low-cost) interne...,Find me some providers of free wireless intern...,are you wondering who used cheap internet,no i want providers of free wireless
100,10,6,10-6,10-6-10,cheap internet,faceted,inf,I\'m looking for cheap (i.e. low-cost) interne...,I want to find cheap DSL providers.,are you wondering who used cheap internet,no i was wondering who the cheapest dsl provid...
115,10,7,10-7,10-7-10,cheap internet,faceted,inf,I\'m looking for cheap (i.e. low-cost) interne...,Is there a way to get internet access without ...,are you wondering who used cheap internet,no i want internet package excluding phone ser...


In [89]:
def process_qulac_main(data_df):
    """
    Process Qulac main dataset into a dictionary grouped by topic and facet.
    
    {topic:{
        "facets":{
            facet:{
                "questions":{
                    question:[answer list]
                }
            }
        }
    }}
    """
    structured_data = {}

    for _, row in data_df.iterrows():
        topic = row["topic"]
        facet_desc = row["facet_desc"]
        question = row["question"]
        answer = row["answer"]

        if question == "" or answer == "":
            continue

        # Initialise topic if doesn't exist
        if topic not in structured_data:
            structured_data[topic] = {"facets": {}}

        # Initialise facet if doesn't exist
        if facet_desc not in structured_data[topic]["facets"]:
            structured_data[topic]["facets"][facet_desc] = {"questions": {}}

        # Initialise question if doesn't exist
        if question not in structured_data[topic]["facets"][facet_desc]["questions"]:
            structured_data[topic]["facets"][facet_desc]["questions"][question] = []

        # Append answer
        structured_data[topic]["facets"][facet_desc]["questions"][question].append(answer)

    return structured_data

In [90]:
# Process qulac main dataset into structured dictionary
qulac_structured_dict = process_qulac_main(data_df)

In [91]:
# View contents of structured dictionary
print(json.dumps(qulac_structured_dict, indent=6))

{
      "obama family tree": {
            "facets": {
                  "Find the TIME magazine photo essay \\\"Barack Obama\\'s Family Tree\\\".": {
                        "questions": {
                              "are you interested in seeing barack obamas family": [
                                    "yes am interested in obamas family"
                              ],
                              "would you like to know barack obamas geneology": [
                                    "yes i want to know who made up his family"
                              ],
                              "would you like to know about obamas ancestors": [
                                    "yes this is what am looking for"
                              ],
                              "would you like to know who is currently alive from president obamas family tree": [
                                    "no find the barack obamas family tree time magazine photo essay"
                       

In [None]:
a = data_df[39:42]
b = data_df[100:102]
b
# con = pd.concat([a,b])

# print(con)

In [None]:
# Inspect history
# <topic/query_id>-<facet_id>-*<positional_question_variant>-<instance(seems to always be 1)>
for i, (key, value) in enumerate(history.items()):
    if not re.search("^18-\d-1-", key):
        continue
    print(f"record id: {key}")
    for k, v in value.items():
        print(f"{k}: {v}")

    print()

In [119]:
i = 0
for record in history.values():
    if i > 5:
        break
    print(record)
    i += 1

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



{'history_id': '200-2-3-8', 'query': 'ontario california airport', 'question': 'do you need directions to ontario california airport', 'answer': 'no just tell me which hotels are the closest to the airport', 'history_list': [{'question': 'do you mean an airport in canada', 'answer': 'yes the ontario canada airport'}, {'question': 'what ammendities are you looking for in your airport', 'answer': 'i want to know what hotels are near'}]}
{'history_id': '200-3-3-8', 'query': 'ontario california airport', 'question': 'do you need directions to ontario california airport', 'answer': 'no just what they offer', 'history_list': [{'question': 'do you mean an airport in canada', 'answer': 'no the one in california'}, {'question': 'what ammendities are you looking for in your airport', 'answer': 'i want informaion on the ontario ca airport ammendities'}]}
{'history_id': '200-4-3-8', 'query': 'ontario california airport', 'question': 'do you need directions to ontario california airport', 'answer':