In [19]:
import spacy
import json
from spacy.training.example import Example
import random


def train_model(nbr_iteration):

    data = None
    with open("training/spaCyFromToTraining.json", "r") as json_file:
        data = json.load(json_file)
        print(f"Data Loaded, {len(data)} objects detected")

    
    if not data:
        return None
    
    nlp = spacy.load("fr_core_news_sm")

    # Check if NER component exists, if not, add it
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")
    
    # Add your custom labels (e.g., "FROM" and "TO") to the NER component
    ner.add_label("FROM")
    ner.add_label("TO")

    print(ner)

    # Prepare training examples
    training_examples = []
    for example in data:
        entities = example.get("entities", [])
        print(entities)
        doc = nlp.make_doc(example["text"])

        # Create a spaCy Example
        example_dict = {"text": example["text"], "entities": entities}
        print(example_dict)
        example_spacy = Example.from_dict(doc, example_dict)

        training_examples.append(example_spacy)

    # Train the model
    for _ in range(nbr_iteration):
        random.shuffle(training_examples)
        for example in training_examples:
            nlp.update([example], drop=0.5)

    # Save the trained model
    output_dir = "./models/"
    nlp.to_disk(output_dir)
    print("Model trained and saved to:", output_dir)


train_model(50)

Data Loaded, 50 objects detected
<spacy.pipeline.ner.EntityRecognizer object at 0x0000018D56BA3990>
[{'start': 8, 'end': 16, 'label': 'FROM'}, {'start': 25, 'end': 44, 'label': 'TO'}]
{'text': 'gare de Kleindan gare de Delorme-sur-Lacroix', 'entities': [{'start': 8, 'end': 16, 'label': 'FROM'}, {'start': 25, 'end': 44, 'label': 'TO'}]}


ValueError: [E973] Unexpected type for NER data

In [32]:
import spacy
from spacy.training.example import Example

data = None
with open("training/spaCyFromToTraining.json", "r") as json_file:
    data = json.load(json_file)
    print(f"Data Loaded, {len(data)} objects detected")

nlp = spacy.load("fr_core_news_sm")

# Example training data
training_data = [
    (
        "Apple est une entreprise américaine.",
        {"entities": [(0, 5, "ORG")]}
    ),
    # Add more examples as needed
]

# Process and update the model with training data
for _ in range(100):
    for text, annotations in data:
        # Create a spaCy Example
        example = Example.from_dict(nlp.make_doc(text), annotations)
        # Update the model with the training example
        nlp.update([example], drop=0.5)

# Save the updated model
nlp.to_disk("models/test/")

Data Loaded, 50 objects detected


In [None]:
from spacy.training.example import Example

n_iter = 10
for _ in range(n_iter):
    for example in training_data:
        doc = nlp.make_doc(example["text"])
        example_spacy = Example.from_dict(doc, example)

        # Update the model with the training example
        nlp.update([example_spacy], drop=0.5)

In [43]:
import spacy

# Load the trained spaCy model
nlp = spacy.load("models/test")

# Example text for prediction
text_to_predict = "Je vais à Marseille depuis Nice."

# Process the text using the loaded model
doc = nlp(text_to_predict)

# Extract named entities from the processed document
entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

# Print the extracted entities
print("Extracted Entities:")
for entity in entities:
    print(entity)

Extracted Entities:
('Marseille', 10, 19, 'TO')
('Nice', 27, 31, 'TO')


In [26]:
from faker import Faker
import random
import json

fake = Faker("fr_FR")  # French locale

def generate_training_data(nbr_data: int):
    training_data = []

    for _ in range(nbr_data):
        from_city = fake.city()
        to_city = fake.city()

        # Ensure unique cities
        while to_city == from_city:
            to_city = fake.city()

        if random.choice([True, False]):
            from_city, to_city = to_city, from_city
        
        is_order_1 = random.choice([True, False, True])

        sentence_template_order_1 = random.choice([
            "Je veux voyager de {} à {}.",
            "il faut que j'aille à {}, je suis à {}",
            "je dois faire le trajet {} {}",
            "{} {}",
            "je suis à {} et j'aimerai allez à {}",
            "de {} comment se rendre à {}",
            "je suis perdu, je pense etre pas loin de {} et je dois allez à {}",
            "depuis hier je pense allez à {}, mais j'ai peur du trajet depuis {}",
            "gare de {} gare de {}"
            

        ])

        sentence_template_order_2 = random.choice([
            "j'aimme les Pommes de Amiens, mais je vais à {} de {}",
            "j'adore {}, comment y allez de {}",
            "je dois me de rendre à {} mais je suis à {}",
            "c'est super {}, comment y allez de {}",
            "je suis pas loin de Paris mais je dois allez à {} depuis {}"

        ])


        sentence = sentence_template_order_1.format(from_city, to_city) if is_order_1 else sentence_template_order_2.format(to_city, from_city)
        
        
        start_from = sentence.find(from_city)
        start_to = sentence.find(to_city)

        entities = [
                (start_from, start_from + len(from_city), "FROM"),
                (start_to, start_to + len(to_city), "TO")
        ]
        training_data.append((sentence, {"entities": entities}))

    return training_data


training_data = generate_training_data(50)

with open("training/spaCyFromToTraining.json", "w+") as file:
    json.dump(training_data, file)






In [18]:
import pandas as pd
import numpy


def extract_unique_station_and_cities():
    df = pd.read_csv('./src/liste-des-gares.csv', sep=';')
    

   
    station_list = numpy.array([])
    city_list = numpy.array([])

    for i in range(len(df)):
        row = df.iloc[i]

        # Traitement des données
        station = row[1].lower()
        city = row[7].lower()

        station_list = numpy.append(station_list, station)
        city_list = numpy.append(city_list, city)
    

    df_res = pd.DataFrame({
        "Station": station_list,
        "City": city_list
    })

    df_res.to_csv('./format_src/format_data_station_city.csv', sep=";")

    
    

extract_unique_station_and_cities()

  station = row[1].lower()
  city = row[7].lower()
