In [1]:
import random
import pandas as pd
import types
import re
from dateutil.parser import parse

# RDF class of a query's answer node -> value expected in the "Answer" column
# of the question-template table.
Dict_query_type_mapping = {
    "tsnchange:County": "COUNTY",
    "time:Interval": "DATE",
    "geo:Geometry": "GEOMETRY",
    "tsnchange:Change": "CHANGE",
    "sem:Event": "EVENT",
    "tsnchange:State": "STATE",
}

# RDF class of a seed instance -> name of the boolean template column for that
# instance kind; the same token is the placeholder substituted in template text.
Dict_instances_type_mapping = {
    "tsnchange:County": "COUNTY_NAME",
    "time:Interval": "DATE_NAME",
    "tsnchange:State": "STATE_NAME",
    "tsnchange:Change": "CHANGE_TYPE",
}

In [2]:

# Load the raw query records (one row per generated query) from JSON.
data = pd.read_json("./queries_with_results2.json")
# Raise the column display width so long cell values are not truncated when printed.
pd.options.display.max_colwidth = 500
# Preview the first rows to check the loaded schema.
print(data.head())

  class_changes filters  leaf_nodes  \
0            []      []       [?x0]   
1            []      []       [?x2]   
2            []      []  [?x0, ?x2]   
3            []      []       [?x0]   
4            []      []  [?x2, ?x3]   

                                                                                                        node_types  \
0                            {'?x0': 'tsnchange:County', '?x1': 'tsnchange:CountyVersion', '?x2': 'time:Interval'}   
1                             {'?x0': 'tsnchange:County', '?x1': 'tsnchange:CountyVersion', '?x2': 'geo:Geometry'}   
2     {'?x0': 'tsnchange:County', '?x1': 'tsnchange:CountyVersion', '?x2': 'time:Interval', '?x3': 'geo:Geometry'}   
3                         {'?x0': 'tsnchange:County', '?x1': 'tsnchange:CountyVersion', '?x2': 'tsnchange:Change'}   
4  {'?x0': 'tsnchange:County', '?x1': 'tsnchange:CountyVersion', '?x2': 'geo:Geometry', '?x3': 'tsnchange:Change'}   

                                                        

In [3]:

def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    :return: bool, True if ``dateutil.parser.parse`` accepts the string,
        False if it rejects it as unparseable or out of range
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError):
        # ValueError covers unparseable input; OverflowError is raised by the
        # parser for numeric tokens too large to fit a date field. Both mean
        # "not a usable date" rather than a programming error.
        return False
    return True

In [4]:
class queries_data:
    """Flatten raw query records into the features used for template matching.

    ``extract_data`` turns each row of the raw query frame into five columns:
    ``Q_id``, ``Instances``, ``Query_type``, ``Predicates`` and ``Filters``.
    The result is also kept on ``self.processed_df``.
    """

    def __init__(self):
        self.processed_df = self._new_frame()

    @staticmethod
    def _new_frame():
        """Return an empty frame with the processed column layout."""
        return pd.DataFrame(data=None,
                            columns=['Q_id', 'Instances', 'Query_type', 'Predicates', "Filters"],
                            )

    def extract_query_type(self, leaf_nodes, node_types):
        """Return the type of the first leaf node.

        :param leaf_nodes: sequence of node variable names (e.g. ``'?x0'``)
        :param node_types: mapping from node variable name to its type string
        :raises IndexError: if ``leaf_nodes`` is empty
        :raises KeyError: if the first leaf node has no entry in ``node_types``
        """
        return node_types[leaf_nodes[0]]

    def uri_2_name(self, uri):
        """Turn the last path segment of a URI into a title-cased name with spaces."""
        return uri.split("/")[-1].title().replace("_", " ")

    def extract_instances(self, row):
        """Return the seed instance of a query as a one-element list of dicts.

        Each dict has a ``type`` (the row's ``seed_node_type``) and a ``value``
        (the URI of the first entry in ``uri_changes``). County and state URIs
        are converted to readable names; other types keep the raw URI.

        :raises IndexError: if ``row["uri_changes"]`` is empty
        """
        # A list is returned because a query may eventually carry several
        # instances; the current data provides exactly one per query.
        inst_type = row["seed_node_type"]
        value = row["uri_changes"][0]["uri"]
        if inst_type == "tsnchange:County" or inst_type == "tsnchange:State":
            value = self.uri_2_name(value)
        return [{"type": inst_type, "value": value}]

    def extract_predicates(self, row):
        """Return the middle element of every edge in ``row["query_graph"]["edges"]``."""
        return [edge[1] for edge in row["query_graph"]["edges"]]

    def extract_filter(self, row):
        """Summarise each raw filter as a dict of type, relation and components.

        :param row: iterable of filter dicts, each with ``filter_type`` and
            ``sparql_line`` (temporal filters also need ``temporal_relation``)
        :return: list of dicts with keys ``type``, ``f_relation`` and
            ``Filter_components``. A temporal filter whose two operands are
            both variables keeps type ``"temporal"``; otherwise it becomes
            ``"temporal_date"``. Filter types other than temporal/spatial are
            kept with ``f_relation`` ``None`` and no components.
        """
        filters = []
        for f in row:
            f_type = f["filter_type"]
            if f_type == "temporal":
                # Drop parentheses, then keep the three tokens after the keyword.
                components = f["sparql_line"].replace('(', '').replace(')', '').split(' ')[1:4]
                if components[0][0] == '?' and components[2][0] == '?':
                    d_type = f_type
                else:
                    d_type = "temporal_date"
                relation = f["temporal_relation"]
            elif f_type == "spatial":
                # Parentheses become separators here so the function name
                # splits off as its own token.
                components = f["sparql_line"].replace('(', ' ').replace(')', ' ').split(' ')[1:4]
                d_type = f_type
                relation = components[0]
            else:
                # Unknown filter kind: record it explicitly instead of reusing
                # the previous filter's components or failing on an unbound name.
                components = []
                d_type = f_type
                relation = None

            filters.append({"type": d_type, 'f_relation': relation, 'Filter_components': components})
        return filters

    def extract_data(self, original_dataframe):
        """Build, print and return the processed frame for ``original_dataframe``.

        :param original_dataframe: raw query frame with at least the columns
            ``rule``, ``leaf_nodes``, ``node_types``, ``seed_node_type``,
            ``uri_changes``, ``query_graph`` and ``filters``
        :return: the processed frame (also stored on ``self.processed_df``)
        """
        print(original_dataframe.columns)
        # Start from a fresh frame so repeated calls do not align against the
        # index left over from a previous call.
        self.processed_df = self._new_frame()
        self.processed_df['Q_id'] = original_dataframe["rule"].apply(lambda x: x["id"])
        # Columns are addressed by label: integer keys on a label-indexed row
        # depend on column order and are deprecated in pandas.
        self.processed_df['Query_type'] = original_dataframe.apply(
            lambda x: self.extract_query_type(x["leaf_nodes"], x["node_types"]), axis=1)
        self.processed_df['Instances'] = original_dataframe.apply(
            lambda row: self.extract_instances(row), axis=1)
        self.processed_df['Predicates'] = original_dataframe.apply(
            lambda row: self.extract_predicates(row), axis=1)
        self.processed_df['Filters'] = original_dataframe.apply(
            lambda row: self.extract_filter(row["filters"]), axis=1)
        print(self.processed_df.head())
        return self.processed_df

In [5]:


# Build the per-query feature frame (Q_id, Instances, Query_type, Predicates, Filters).
q_d=queries_data()

# extract_data also prints the raw column index and the head of the processed frame.
queries_df=q_d.extract_data(data)

# List the distinct answer-node types present in the processed queries.
print(queries_df.Query_type.unique())

Index(['class_changes', 'filters', 'leaf_nodes', 'node_types', 'placeholders',
       'query_graph', 'results', 'rule', 'rule_graph', 'rule_graph_edges',
       'seed_node', 'seed_node_type', 'sparql_queries', 'time_diff',
       'uri_changes', 'within_nodes'],
      dtype='object')
  Q_id  \
0   Q1   
1   Q2   
2   Q3   
3   Q4   
4   Q7   

                                                                                                          Instances  \
0              [{'type': 'time:Interval', 'value': 'http://time-space-event.com/resource/county/interval/id/7060'}]   
1                                                               [{'type': 'tsnchange:County', 'value': 'Tx Nca 5'}]   
2  [{'type': 'geo:Geometry', 'value': 'http://time-space-event.com/resource/Alabama/County/MORGAN_3/Geometry/646'}]   
3      [{'type': 'tsnchange:Change', 'value': 'http://time-space-event.com/resource/South_Carolina/change_YORK_3'}]   
4                                                           

In [6]:
# Load the question-template table; each row is one template plus the columns
# that the template picker filters on.
question_data = pd.read_csv("./Question_templatesV5.csv")
# Preview the first rows to check the loaded schema.
print(question_data.head())

  Q_type  Answer  \
0     Q1  COUNTY   
1     Q1  COUNTY   
2     Q1  COUNTY   
3     Q1  COUNTY   
4     Q1  COUNTY   

                                               Questions Template  \
0                                        Return all the counties.   
1           Give the names of the counties that have been formed.   
2           Name every county which has been founded until today.   
3  List all the counties which have been created through history.   
4                            Which counties do not exist anymore?   

   COUNTY_NAME  DATE_NAME  COUNTY_RELATION  CHANGE_TYPE  COUNTY_NAME_2  \
0        False      False            False        False          False   
1        False      False            False        False          False   
2        False      False            False        False          False   
3        False      False            False        False          False   
4        False      False            False        False          False   

   STATE_NAME  TEMP

In [7]:

class question_template_picker:
    """Select the question templates that are compatible with each processed query.

    The template table is expected to have the columns ``Q_type``, ``Answer``,
    ``Questions Template``, ``PREDICATE``, ``TEMP_RELATION``, ``GEO_RELATION``
    and one boolean column per value of ``Dict_instances_type_mapping``.
    """

    def __init__(self, question_templates):
        self.question_df = pd.DataFrame(data=None,
                                        columns=['Questions'],
                                        )
        self.question_templates = question_templates

    def predicate_filtering(self, questions_temp, row):
        """Keep only AFTER/BEFORE templates when the query uses the matching predicate.

        Candidates are returned unchanged when the query has neither the
        ``countyVersionAfter`` nor the ``countyVersionBefore`` predicate.
        """
        if "tsnchange:countyVersionAfter" in row["Predicates"]:
            questions_temp = questions_temp[questions_temp["PREDICATE"] == "AFTER"]
        elif "tsnchange:countyVersionBefore" in row["Predicates"]:
            questions_temp = questions_temp[questions_temp["PREDICATE"] == "BEFORE"]
        return questions_temp

    def filtering_question_templates(self, inst_type, row):
        """Return the templates matching one instance type of a query row.

        Templates are narrowed by query id, answer type, the boolean column of
        ``inst_type``, the temporal or spatial relation flag (when the query
        has such a filter) and finally by predicate direction.

        :param inst_type: key of ``Dict_instances_type_mapping``
        :param row: processed query row (``Q_id``, ``Query_type``, ``Filters``,
            ``Predicates``)
        :return: filtered slice of the template table (possibly empty)
        """
        questions_temp = self.question_templates[self.question_templates["Q_type"] == row["Q_id"]]
        # An unmapped query type yields None, which matches no "Answer" value,
        # so the row simply gets no templates instead of raising KeyError.
        answer = Dict_query_type_mapping.get(row["Query_type"])
        questions_temp = questions_temp[questions_temp["Answer"] == answer]
        questions_temp = questions_temp[questions_temp[Dict_instances_type_mapping[inst_type]]]

        f_list = row['Filters']
        f_types = [d["type"] for d in f_list] if f_list else []

        if "temporal" in f_types:
            questions_temp = questions_temp[questions_temp["TEMP_RELATION"]]
        elif "spatial" in f_types:
            questions_temp = questions_temp[questions_temp["GEO_RELATION"]]

        return self.predicate_filtering(questions_temp, row)

    def template_picker(self, row):
        """Return the list of template strings usable for one processed query row.

        A template qualifies when it matches every instance of the row and
        does not require any instance type the row lacks. An empty list is
        returned when the row has no instances or has an instance whose type
        is not in ``Dict_instances_type_mapping``.
        """
        unused_types = set(Dict_instances_type_mapping.keys())
        questions_temp = None
        for inst in row["Instances"]:
            inst_type = inst["type"]
            if inst_type not in Dict_instances_type_mapping:
                return []
            unused_types.discard(inst_type)
            matching = self.filtering_question_templates(inst_type, row)
            if questions_temp is None:
                questions_temp = matching
            else:
                # Keep only templates that satisfy every instance seen so far,
                # rather than letting the last instance overwrite the others.
                questions_temp = questions_temp[questions_temp.index.isin(matching.index)]

        if questions_temp is None:
            # No instances: nothing to match templates against.
            return []

        # Drop templates that need a placeholder the query cannot fill.
        for t in unused_types:
            questions_temp = questions_temp[~questions_temp[Dict_instances_type_mapping[t]]]
        return list(questions_temp["Questions Template"].values)

    def question_template_production(self, querie_df):
        """Add a ``Question_templates`` column to ``querie_df`` and return it.

        The frame passed in is modified in place; the returned object is the
        same frame.
        """
        querie_df["Question_templates"] = querie_df.apply(lambda row: self.template_picker(row), axis=1)
        return querie_df


In [8]:
# Attach the list of candidate templates to every processed query.
q_t_p=question_template_picker(question_data)

df_question_templates=q_t_p.question_template_production(queries_df)
# Drop queries with no usable template. Only the template column is cast to
# str; stringifying the whole frame just to test one column is wasted work.
df_question_templates=df_question_templates[df_question_templates["Question_templates"].astype(str)!='[]']

In [9]:
# df_question_templates.to_csv("./dataset.csv")

In [10]:
class question_production:
    """Turn picked question templates into concrete questions.

    Works on a copy of the frame passed in, which must have the columns
    ``Question_templates`` (list of template strings) and ``Instances``
    (list of dicts with ``type`` and ``value``).
    """

    def __init__(self, df_picked_templates):
        self.df_questions = df_picked_templates.copy()

    def replace_variables(self, row):
        """Pick one template at random and fill in the row's instance values.

        :param row: mapping with ``Question_templates`` and ``Instances``
        :return: the filled-in question, or ``" "`` when the row has no
            templates. A row without instances returns the chosen template
            unchanged.
        :raises KeyError: if an instance type is not in
            ``Dict_instances_type_mapping``
        """
        if not row["Question_templates"]:
            return " "
        question = random.choice(row["Question_templates"])
        for d in row["Instances"]:
            # Substitute into the running result so every instance's
            # placeholder is replaced, not only the last one.
            question = question.replace(Dict_instances_type_mapping[d["type"]], d["value"])
        return question

    def produce_questions(self):
        """Add a ``Questions`` column with one generated question per row and return the frame."""
        self.df_questions["Questions"] = self.df_questions.apply(
            lambda row: self.replace_variables(row), axis=1)
        return self.df_questions

In [14]:
from happytransformer import HappyTextToText, TTSettings

# Text-to-text model used by grammar_correction below.
# NOTE(review): presumably this fetches/loads the pretrained checkpoint at
# construction time, which makes this cell slow — confirm.
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")

# Generation settings passed to every happy_tt.generate_text call.
args = TTSettings(num_beams=5, min_length=1,max_length=100,early_stopping=True)

def grammar_correction(question_list):
    """Run each question through the grammar-correction model.

    :param question_list: list of question strings; a single string is
        treated as a one-element list
    :return: list aligned with the input, holding the corrected text where the
        model changed the question and ``""`` where it left it unchanged
    """
    if isinstance(question_list, str):
        # A bare string would otherwise be iterated character by character,
        # triggering one model call per character.
        question_list = [question_list]

    corrected_question_list = []
    print(question_list)
    for text in question_list:
        corrected_text = happy_tt.generate_text("grammar: " + text, args=args).text
        # An empty string marks "no correction needed".
        corrected_question_list.append(corrected_text if text != corrected_text else "")
    return corrected_question_list


#result = happy_tt.generate_text("grammar: This sentences has has bads grammar.", args=args)


10/26/2022 14:05:43 - INFO - happytransformer.happy_transformer -   Using model: cuda


In [15]:
# Fill one randomly chosen template per query with the query's instance values.
q_p=question_production(df_question_templates)
question_df=q_p.produce_questions()

# Optional grammar pass, currently disabled.
# NOTE(review): as written this passes each question string (not a list) to grammar_correction.
#question_df["Correction"]=question_df["Questions"].apply(lambda x: grammar_correction(x))

# Persist the generated questions together with their query features.
question_df.to_csv("./dataset3.csv")

Which counties have been extinct http://time-space-event.com/resource/county/interval/id/7060?




KeyboardInterrupt: 