In [25]:
from lxml import objectify
from xml.etree import ElementTree as ET
import numpy as np
import random
import json
import pandas as pd
from gpt_geoparser.geoparser import GptGeoparser
from gpt_geoparser.gpt_handler import PromptBuilder
from gpt_geoparser.data import GeoVirusArticle, LGLArticle, Article, Toponym, News2024Article, WikTorArticle
from tqdm import tqdm
import matplotlib.pyplot as plt

# Building a fine-tuning dataset for the Llama-3 geoparser
We will construct a fine-tuning dataset from the LGL and GeoVirus datasets and use it to fine-tune a custom Llama-3 model. The fine-tuned model will then be evaluated on the News2024 dataset to assess its accuracy.

In [31]:
# Prompt template with three positional `{}` slots, placed under the
# "### Instruction:", "### Input:" and "### Response:" headings in that order.
# NOTE(review): "specfic" in the first sentence is a typo for "specific". It is
# left untouched here because this is runtime text; confirm whether the copy
# used for training/inference should be corrected at the same time.
geollama_prompt ="""Below is an instruction that describes a task, paired with an input that provides a specfic example which the task should be applied to. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}
"""

# Instruction for the end-to-end geoparsing task: extract every toponym and
# give its coordinates, following the JSON template embedded in the text.
# NOTE: later cells in this notebook rebind `geoparse_instruction` to different
# instructions, so code that reads this name sees whichever cell ran last.
geoparse_instruction = """Extract all toponyms from the provided text and estimate their geolocations. Include the name of every toponym in the text and its decimal latitude and longitude coordinates. Format the output in JSON, strictly adhering to the specified template. Be very concise and output only the JSON data inside a code block. Do not provide any explanation or reasoning.

JSON Template for output:

{"toponyms": [
        {
          "name": "<string : toponym name exactly as it appears in the text>",
          "latitude": <float : latitude in decimal degrees>,
          "longitude": <float : longitude in decimal degrees>
        },
        // More toponyms from the text can follow
      ]
}
"""

In [6]:
### LGL data
# open the lgl dataset using xml
# NOTE(review): this module-level name is not read by the functions below (they
# take their own `dataset` parameter) nor by any other cell shown here.
dataset = 'lgl'

def get_data(dataset):
    """Load ``datasets/<dataset>.xml`` and return its root as an lxml objectify tree.

    The file is parsed with the standard-library ElementTree, serialised back
    to an XML string, and that string is parsed again with ``lxml.objectify``.
    """
    root = ET.parse(f"datasets/{dataset}.xml").getroot()
    serialised = ET.tostring(root, method='xml').decode()
    return objectify.fromstring(serialised)

def build_ft_data(xml_obj, dataset, instruction=None):
    """Convert a parsed corpus into instruction/input/response fine-tuning records.

    Parameters
    ----------
    xml_obj
        Objectified XML root, as returned by ``get_data``.
    dataset : str
        Which corpus ``xml_obj`` holds: ``'lgl'``, ``'GeoVirus'`` or ``'WikToR'``.
        This selects both the article wrapper class and the child element that
        is iterated.
    instruction : str, optional
        Instruction text stored in every record. When omitted, the notebook-level
        ``geoparse_instruction`` is read at call time. Other cells rebind that
        name, so pass the instruction explicitly to avoid depending on the
        order in which cells were executed.

    Returns
    -------
    list of dict
        One record per article with keys ``"instruction"``, ``"input"`` (the
        article text as ``str``) and ``"response"``, a dict of the form
        ``{"toponyms": [{"name", "latitude", "longitude"}, ...]}``. Coordinates
        that cannot be converted to ``float`` are stored as ``None``.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported corpus names.
    """
    if dataset == 'lgl':
        article_cls = LGLArticle
    elif dataset == 'GeoVirus':
        article_cls = GeoVirusArticle
    elif dataset == 'WikToR':
        article_cls = WikTorArticle
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError on `articles`; fail early with a clear message.
        raise ValueError(
            f"Unsupported dataset {dataset!r}; expected 'lgl', 'GeoVirus' or 'WikToR'"
        )

    # WikToR documents are read from `page` children; the others from `article`.
    articles = xml_obj.page if dataset == 'WikToR' else xml_obj.article

    if instruction is None:
        instruction = geoparse_instruction

    ft_data = []
    for article_xml in articles:
        article = article_cls(article_xml)
        toponyms = []
        for toponym in article.toponyms:
            try:
                latitude = float(toponym.latitude)
                longitude = float(toponym.longitude)
            except (AttributeError, TypeError, ValueError):
                # Missing or unparsable coordinates: keep the toponym name but
                # record both coordinates as unknown.
                latitude = longitude = None
            toponyms.append({"name": str(toponym.phrase),
                             "latitude": latitude,
                             "longitude": longitude})
        ft_data.append({"instruction": instruction,
                        "input": str(article.text),
                        "response": {"toponyms": toponyms}})

    return ft_data
        
    

In [49]:
# Parse both training corpora, convert each to fine-tuning records, and write
# the combined list (LGL records first, then GeoVirus) to one JSON file.
# NOTE(review): build_ft_data reads the notebook-level `geoparse_instruction`
# when it is called, and two later cells rebind that name to other
# instructions. This cell's recorded execution count (49) is higher than those
# cells' (34 and 18), so confirm the saved file holds the intended instruction.
lgl_xml = get_data('lgl')
geovirus_xml = get_data('GeoVirus')

lgl_ft_data = build_ft_data(lgl_xml, 'lgl')
geovirus_ft_data = build_ft_data(geovirus_xml, 'GeoVirus')

ft_data = lgl_ft_data + geovirus_ft_data

with open('datasets/fine_tuning/llama3_ft_data.json', 'w') as f:
    json.dump(ft_data, f)

In [54]:
# NOTE(review): `d` is not read by any later cell shown here (the RAG loop
# rebinds `d` to a distance); this looks like a leftover exploratory load —
# confirm and remove if so.
d = get_data('TR-News')

# Building a fine-tuning dataset for a RAG-based Llama-3 geoparser
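We will construct a dataset using the LGL and GeoVirus datasets with which we will fine-tune a custom Llama-3 model. Each training example pairs one annotated toponym and its article text with the candidate locations cached for that toponym from an OpenStreetMap (Nominatim) search; the target is the candidate closest to the annotated coordinates, or the annotated coordinates themselves when no candidate is acceptable. The model will be tested on the News2024 dataset to assess its accuracy.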

In [7]:
# Load both corpora and wrap every `article` element in its dataset-specific
# article class: all LGL articles first, then all GeoVirus articles.
lgl_xml = get_data('lgl')
geovirus_xml = get_data('GeoVirus')

ft_articles = []
for article_cls, corpus_xml in ((LGLArticle, lgl_xml), (GeoVirusArticle, geovirus_xml)):
    for article_xml in corpus_xml.article:
        ft_articles.append(article_cls(article_xml))

In [33]:
# Prompt template with three positional `{}` slots, placed under the
# "### Instruction:", "### Input:" and "### Response:" headings in that order.
# The text is the same as `geollama_prompt` defined earlier in this notebook.
# NOTE(review): "specfic" in the first sentence is a typo for "specific". It is
# left untouched here because this is runtime text; confirm whether the copy
# used for training/inference should be corrected at the same time.
RAG_prompt = """Below is an instruction that describes a task, paired with an input that provides a specfic example which the task should be applied to. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}
"""


In [34]:
# Instruction for the RAG disambiguation task: given a text, one toponym and
# the candidate locations found for it, pick the candidate that fits best, or
# fall back to a free estimate.
# NOTE: this rebinds `geoparse_instruction`, replacing the geoparsing
# instruction defined earlier in the notebook.
# The fallback sentence now names the same key as the template below
# ("RAG_estimated"); it previously said {"RAG":false}, a key that appears
# nowhere else. "tmeplate" is also corrected to "template". Datasets written
# with the old wording must be regenerated to pick this up.
geoparse_instruction = """You will be given a piece of text, a toponym found within that text, and a JSON detailing the matched locations when that toponym is searched on OpenStreetMaps. 

Your task is to identify the matched location which is most likely to be the true location of the toponym, given the context of the text.

If the list of matches is empty, or you do not think any match accurately represents the toponym, you are permitted to assign your best estimate for a latitude and longitude. This should be highlighted in your response by setting {"RAG_estimated":false}.

Your output should strictly conform to the following template:

{"name" : <(str) name of toponym as it appears in the text>,
 "latitude": <(float) latitude as it appears in the matched locations>,
 "longitude": <(float) longitude as it appears in the matched locations>,
 "RAG_estimated": <(bool) true if a matched location was used>
}
"""

# Template for the "input" field of each RAG record; the three `{}` slots are
# filled in order with the article text, the toponym and the match list.
# NOTE(review): the closing tags are written with a backslash (`<\text>`)
# rather than a forward slash, and the raw-string prefix keeps that backslash
# literal. Left as-is because it is runtime text — confirm this is the
# delimiter expected at inference time.
input_prompt = r"""<text> {} <\text>

<toponym> {} <\toponym>

<matches> {} <\matches>
"""

In [35]:
# Load the cached search results. The RAG loop below indexes `cache` by toponym
# phrase and expects a list of match dicts with 'lat', 'lon', 'name',
# 'display_name' and 'addresstype' entries.
with open('nominatim_cache.json', 'r') as f:
    cache = json.load(f)

In [36]:
from geopy import distance
from shapely.geometry import Point

In [40]:
# Build one RAG fine-tuning record per annotated toponym.
#
# For each toponym with coordinates, the cached candidate closest to the
# annotated location is found. That candidate becomes the target when it lies
# within MAX_MATCH_DISTANCE_KM, or when it is an area-like result (country,
# state, ...), for which distance is not required to be small. Otherwise the
# annotated coordinates are the target and "RAG_estimated" is false.
#
# Changes from the previous version of this cell:
#   * the target and the match list are serialised with json.dumps, so the
#     records contain the JSON the instruction describes (true/false, double
#     quotes) rather than Python repr (True/False, single quotes);
#   * target coordinates are always plain floats;
#   * the builtin `input` is no longer shadowed;
#   * the unused `ft_prompt` string is no longer built for every toponym.
# Any consumer that parsed the old repr-style strings must switch to json.loads.
MAX_MATCH_DISTANCE_KM = 20
AREA_ADDRESSTYPES = ('country', 'state', 'county', 'region')

ft_data = []

for article in tqdm(ft_articles):

    for toponym in article.toponyms:
        if not toponym.latitude:
            continue
        true_lat = float(toponym.latitude)
        true_lon = float(toponym.longitude)
        true_point = (true_lat, true_lon)
        matches = cache[toponym.phrase]

        # Closest candidate to the annotated point; the first one wins on ties.
        best_match = None
        best_km = np.inf
        for match in matches:
            match_point = (float(match['lat']), float(match['lon']))
            match_km = distance.distance(match_point, true_point).km
            if match_km < best_km:
                best_match = match
                best_km = match_km

        # `best_match` is None when there are no candidates at all.
        use_match = best_match is not None and (
            best_km <= MAX_MATCH_DISTANCE_KM
            or best_match['addresstype'] in AREA_ADDRESSTYPES
        )

        if use_match:
            latitude = float(best_match['lat'])
            longitude = float(best_match['lon'])
        else:
            latitude, longitude = true_lat, true_lon

        response = {'name': str(toponym.phrase),
                    'latitude': latitude,
                    'longitude': longitude,
                    'RAG_estimated': use_match}

        match_info = [{'name': m['name'],
                       'lat': m['lat'],
                       'lon': m['lon'],
                       'address': m['display_name']} for m in matches]
        # ensure_ascii=False keeps non-ASCII place names readable instead of
        # turning them into \uXXXX escapes inside the prompt.
        prompt_input = input_prompt.format(article.text,
                                           toponym.phrase,
                                           json.dumps(match_info, ensure_ascii=False))

        ft_data.append({"instruction": geoparse_instruction,
                        "input": prompt_input,
                        "response": json.dumps(response, ensure_ascii=False)})
               
        
        

100%|██████████| 817/817 [00:07<00:00, 116.59it/s]


In [41]:
# deduplicate, keeping the first occurrence of each record.
# dict.fromkeys preserves insertion order, so the resulting list has the same
# order on every run; the previous set-based version iterated in hash order,
# which for strings can differ between interpreter sessions.
ft_data = [dict(items) for items in dict.fromkeys(tuple(record.items()) for record in ft_data)]

len(ft_data)

3983

In [42]:
# Write the RAG fine-tuning records to a JSON file in the working directory.
with open('llama3_RAG_geoparsing_ft.json', 'w') as f:
    json.dump(ft_data, f)

# Building a fine-tuning dataset for Llama-3 toponym extraction
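We will construct a dataset using the LGL and GeoVirus datasets with which we will fine-tune a custom Llama-3 model. Here the task is toponym extraction only: each training example is an article text, and the target is the list of distinct toponym strings annotated in it, without coordinates. The model will be tested on the News2024 dataset to assess its accuracy.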

In [18]:
# Instruction for the toponym-extraction task: list every distinct toponym
# string found in the text, without coordinates.
# NOTE: this rebinds `geoparse_instruction` again, replacing the instruction
# set by earlier cells.
# The misspelling "toponyhm" in the first paragraph is corrected to "toponym";
# datasets written with the old wording must be regenerated to pick this up.
geoparse_instruction = """You will be given a piece of text which contains some toponyms. Please extract each toponym from the text and place it in a python list.

Each toponym should only appear once in the list, even if they occur multiple times in the text. If multiple spellings of the same toponym appear in the text each spelling should be represented in the list.

Please use the following template to structure your response:

{"toponyms":["toponym_1", "toponym_2", "toponym_3",...]}
"""

In [19]:
# Build one toponym-extraction record per article.
#
# Changes from the previous version of this cell:
#   * duplicates are removed with dict.fromkeys, which keeps the order in which
#     `article.toponyms` yields them; list(set(...)) produced an order that can
#     differ between interpreter sessions, so the dataset was not reproducible;
#   * the target is serialised with json.dumps, matching the double-quoted
#     template in the instruction, instead of Python repr;
#   * the builtin `input` is no longer shadowed.
ft_data = []

for article in tqdm(ft_articles):

    toponyms = list(dict.fromkeys(str(t.phrase) for t in article.toponyms))
    response = {"toponyms": toponyms}
    ft_data.append({"instruction": geoparse_instruction,
                    "input": article.text,
                    # ensure_ascii=False keeps non-ASCII names as written
                    "response": json.dumps(response, ensure_ascii=False)})

100%|██████████| 817/817 [00:00<00:00, 45814.57it/s]


In [20]:
# Write the toponym-extraction records to a JSON file in the working directory.
with open('llama3_toponym_extraction_ft.json', 'w') as f:
    json.dump(ft_data, f)

In [17]:
# Preview a single record as a sanity check.
# NOTE(review): the recorded output for this cell starts with 'YYou will be
# given', which does not match the instruction text defined in this notebook —
# it appears to be stale; re-run to refresh it.
ft_data[7]

{'instruction': 'YYou will be given a piece of text which contains some toponyms. Please extract each toponyhm from the text and place it in a python list.\n\nEach toponym should only appear once in the list, even if they occur multiple times in the text. If multiple spellings of the same toponym appear in the text each spelling should be represented in the list.\n\nPlease use the following template to structure your response:\n\n{"toponyms":["toponym_1", "toponym_2", "toponym_3",...]}\n',
 'input': 'Drivers warned to watch out for flooded roads. - Highway 200 east of Mahnomen from Mahnomen County Road 3 to Mahnomen County Road 122 near Twin Lakes - Highway 10 eastbound and westbound east of Perham near Otter Tail County Road 80 - Highway 108 east of Pelican Rapids from Highway 59 to Star Lake - Highway 59 from Otter Tail/Grant county line to north junction Highway 55 - Highway 55 from Grant/Wilkin county line to County Road 43 - Highway 114 at Douglas County Road 4 at the south end of