In [None]:
#sources = "https://zen10.com.au/melbourne-suburb-list/"
#"https://data.melbourne.vic.gov.au/explore/dataset/street-names/information/"

In [2]:
import pandas as pd

file_path = r'C:\Users\logan\Desktop\Uni\Team proj\Mapping\street-names.csv'
df = pd.read_csv(file_path)

#Extract the street names column
street_names = df['name'].unique()

#Clean the street names to remove extra spaces or format issues
cleaned_street_names = [name.strip() for name in street_names if pd.notnull(name)]

#Save the street names to a text file
output_file_path = 'extracted_street_names.txt'
with open(output_file_path, 'w') as f:
    for street in cleaned_street_names:
        f.write(f"{street}\n")

output_file_path


'extracted_street_names.txt'

In [15]:
import random
import pandas as pd

#Load street names and suburb names
with open('extracted_street_names.txt', 'r') as f:
    street_names = [line.strip() for line in f.readlines()]

with open('melbourne_suburbs_list.txt', 'r') as f:
    suburb_names = [line.strip() for line in f.readlines()]

#templates for indicating current location
current_location_templates = [
    "I am currently at {street_number} {street}.",
    "I'm at {street_number} {street}, {suburb}.",
    "I am here at {street_number} {street}.",
    "Currently, I am on {street}.",
    "I am at {street_number} {street} in {suburb}.",
    "I'm on {street} in {suburb}.",
    "I am here in {suburb}.",
    "I’m at {street_number} {street}."
]

#templates for indicating destination
destination_templates = [
    "I need to get to {street_number} {street}.",
    "I need directions to {street} in {suburb}.",
    "Can you guide me to {street_number} {street}, {suburb}?",
    "How do I get to {street_number} {street}?",
    "I need to go to {suburb}.",
    "I'm heading towards {street_number} {street} in {suburb}.",
    "I am trying to reach {street}.",
    "I need to get here: {street_number} {street}.",
    "Where is {street_number} {street} located?"
]

#Function to generate random sentences with street numbers and annotations
def generate_annotated_data(street_names, suburb_names, current_templates, destination_templates, num_samples=1500):
    annotated_data = []
    for _ in range(num_samples):
        #Randomly choose to generate a current location or destination sentence
        if random.choice([True, False]):
            #Generate current location sentence
            template = random.choice(current_templates)
        else:
            #Generate destination sentence
            template = random.choice(destination_templates)

        #Determine if placeholders exist in template
        include_street_number = '{street_number}' in template
        include_street = '{street}' in template
        include_suburb = '{suburb}' in template

        #Generate random values for placeholders
        street_number = random.randint(1, 1000) if include_street_number else ''
        street = random.choice(street_names) if include_street else ''
        suburb = random.choice(suburb_names) if include_suburb else ''

        #Format the sentence
        sentence = template.format(street_number=street_number, street=street, suburb=suburb)
        
        #Create entity annotations
        entities = []
        if street:
            start = sentence.index(street)
            entities.append((start, start + len(street), "STREET_NAME"))
        if suburb:
            start = sentence.index(suburb)
            entities.append((start, start + len(suburb), "SUBURB"))
        if street_number:
            start = sentence.index(str(street_number))
            entities.append((start, start + len(str(street_number)), "STREET_NUMBER"))
        
        annotated_data.append((sentence, entities))
    
    return annotated_data

#Generate annotated data and convert to df and save
annotated_data = generate_annotated_data(street_names, suburb_names, current_location_templates, destination_templates, num_samples=1500)

annotated_df = pd.DataFrame(annotated_data, columns=["sentence", "entities"])

output_csv_path = 'annotated_sentences.csv'
annotated_df.to_csv(output_csv_path, index=False)

output_csv_path


'annotated_sentences.csv'

In [16]:
import spacy
from spacy.training import Example
import pandas as pd
import random

nlp = spacy.load("en_core_web_sm")

#Access the NER component of the pipeline
ner = nlp.get_pipe("ner")

#Add new labels to the NER component
for label in ["STREET_NAME", "SUBURB", "STREET_NUMBER"]:
    if label not in ner.labels:
        ner.add_label(label)

#Load training data from CSV
training_data = []
df = pd.read_csv('annotated_sentences.csv')
for _, row in df.iterrows():
    sentence = row['sentence']
    entities = eval(row['entities']) 
    annotations = {"entities": entities}
    training_data.append((sentence, annotations))

#Convert the training data to SpaCy examples
examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in training_data]

#Training
optimizer = nlp.resume_training()
for epoch in range(10):  
    random.shuffle(examples)
    losses = {}
    for batch in spacy.util.minibatch(examples, size=8):
        nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)
    print(f"Epoch {epoch + 1}, Losses: {losses}")

#Save the model
nlp.to_disk("fine_tuned_ner_model")

#Test the model
nlp_test = spacy.load("fine_tuned_ner_model")
doc = nlp_test("I need directions to 250 Flinders Street, Richmond.")
for ent in doc.ents:
    print(ent.text, ent.label_)




Epoch 1, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 1989.923194047918}
Epoch 2, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 86.93376884912736}
Epoch 3, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 30.376093826680638}
Epoch 4, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 30.54763240443067}
Epoch 5, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 12.854508668442213}
Epoch 6, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 14.610203881869209}
Epoch 7, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 9.536862214620959}
Epoch 8, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 3.9137060054542694}
Epoch 9, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 0.9962643049841222}
Epoch 10, Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 5.2389285653010855}
250 STREET_NUMBER
Flinders Street SUBURB
Richmond SUBURB


In [17]:
#Validating output
validation_data = [
    ("I am at 300 Collins Street in Melbourne.", {"entities": [(9, 12, "STREET_NUMBER"), (13, 27, "STREET_NAME"), (31, 40, "SUBURB")]}),
    ("Can you guide me to 450 Bourke Street?", {"entities": [(18, 21, "STREET_NUMBER"), (22, 35, "STREET_NAME")]}),
]

#Convert validation data to spacy examples
validation_examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in validation_data]

#Evaluate
nlp.evaluate(validation_examples)




{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'tag_acc': None,
 'sents_p': None,
 'sents_r': None,
 'sents_f': None,
 'dep_uas': None,
 'dep_las': None,
 'dep_las_per_type': None,
 'pos_acc': None,
 'morph_acc': None,
 'morph_micro_p': None,
 'morph_micro_r': None,
 'morph_micro_f': None,
 'morph_per_feat': None,
 'lemma_acc': None,
 'ents_p': None,
 'ents_r': None,
 'ents_f': None,
 'ents_per_type': None,
 'speed': 2239.502332878182}

In [18]:
# Evaluate the model and get metrics
results = nlp.evaluate(validation_examples)

print("Precision:", results["ents_p"])
print("Recall:", results["ents_r"])
print("F1 Score:", results["ents_f"])
print("Accuracy:", results["token_acc"])


Precision: None
Recall: None
F1 Score: None
Accuracy: 1.0
