In [1]:
import spacy
import json
from spacy.training.example import Example
import random


def train_model(nbr_iteration):

    data = None
    with open("training/spaCyFromToTraining.json", "r") as json_file:
        data = json.load(json_file)
        print(f"Data Loaded, {len(data)} objects detected")

    
    if not data:
        return None
    
    nlp = spacy.load("fr_core_news_sm")

    # Check if NER component exists, if not, add it
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")
    
    # Add your custom labels (e.g., "FROM" and "TO") to the NER component
    ner.add_label("FROM")
    ner.add_label("TO")

    print(ner)

    # Prepare training examples
    training_examples = []
    for example in data:
        entities = example.get("entities", [])
        print(entities)
        doc = nlp.make_doc(example["text"])

        # Create a spaCy Example
        example_dict = {"text": example["text"], "entities": entities}
        print(example_dict)
        example_spacy = Example.from_dict(doc, example_dict)

        training_examples.append(example_spacy)

    # Train the model
    for _ in range(nbr_iteration):
        random.shuffle(training_examples)
        for example in training_examples:
            nlp.update([example], drop=0.5)

    # Save the trained model
    output_dir = "./models/"
    nlp.to_disk(output_dir)
    print("Model trained and saved to:", output_dir)


train_model(50)

Data Loaded, 50 objects detected
<spacy.pipeline.ner.EntityRecognizer object at 0x12f595070>


AttributeError: 'list' object has no attribute 'get'

In [2]:
import spacy
import time
from spacy.training.example import Example
import json

# Load training data
with open("training/spaCyFromToTraining.json", "r") as json_file:
   training_data = json.load(json_file)

# Load validation data
with open("validation/spaCyFromToValidation.json", "r") as json_file:
   validation_data = json.load(json_file)

# Load the model
nlp = spacy.load("fr_core_news_sm")

# Initialize the NER component
ner = nlp.get_pipe("ner")

# Add labels to the NER component
for _, annotations in training_data:
   for ent in annotations.get("entities"):
       ner.add_label(ent[2])

# Function to check if two entities match
def entity_match(pred_text, pred_start, pred_end, pred_label, true_start, true_end, true_label):
   return abs(pred_start - true_start) <= 2 and \
          abs(pred_end - true_end) <= 2 and \
          pred_label == true_label

# Train the model by epoch
for i in range(20): # Change the number of epochs according to your needs
   start_time = time.time()
   for text, annotations in training_data:
       # Create a spaCy Example
       example = Example.from_dict(nlp.make_doc(text), annotations)
       # Update the model with the training example
       nlp.update([example], drop=0.5)
   elapsed_time = time.time() - start_time

   # Evaluate the model on the validation set
   correct_predictions = 0
   total_predictions = 0
   for text, annotations in validation_data:
       doc = nlp(text)
       predicted_entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
       true_entities = annotations["entities"]
       for pred_text, pred_start, pred_end, pred_label in predicted_entities:
           total_predictions += 1
           if any(entity_match(pred_text, pred_start, pred_end, pred_label, true[0], true[1], true[2]) for true in true_entities):
               correct_predictions += 1
   accuracy = correct_predictions / total_predictions

   print(f"Epoch: {i}, Accuracy: {accuracy}, Time: {elapsed_time} seconds")

# Save the updated model
nlp.to_disk("models/test/")


Epoch: 0, Accuracy: 0.3888888888888889, Time: 0.8889641761779785 seconds
Epoch: 1, Accuracy: 0.4148936170212766, Time: 0.8719291687011719 seconds
Epoch: 2, Accuracy: 0.5824175824175825, Time: 0.8925278186798096 seconds
Epoch: 3, Accuracy: 0.643979057591623, Time: 0.9625580310821533 seconds
Epoch: 4, Accuracy: 0.6666666666666666, Time: 0.881091833114624 seconds
Epoch: 5, Accuracy: 0.7171717171717171, Time: 0.885673999786377 seconds
Epoch: 6, Accuracy: 0.7487437185929648, Time: 0.8825361728668213 seconds
Epoch: 7, Accuracy: 0.7688442211055276, Time: 0.8674800395965576 seconds
Epoch: 8, Accuracy: 0.7766497461928934, Time: 0.910210132598877 seconds
Epoch: 9, Accuracy: 0.7076923076923077, Time: 0.8641369342803955 seconds
Epoch: 10, Accuracy: 0.7738693467336684, Time: 0.9320383071899414 seconds
Epoch: 11, Accuracy: 0.7788944723618091, Time: 0.8986010551452637 seconds
Epoch: 12, Accuracy: 0.795, Time: 0.8798091411590576 seconds
Epoch: 13, Accuracy: 0.8080808080808081, Time: 0.8794200420379639

In [None]:
from spacy.training.example import Example

n_iter = 10
for _ in range(n_iter):
    for example in training_data:
        doc = nlp.make_doc(example["text"])
        example_spacy = Example.from_dict(doc, example)

        # Update the model with the training example
        nlp.update([example_spacy], drop=0.5)

In [4]:
import spacy

# Load the trained spaCy model
nlp = spacy.load("models/test")

# Example text for prediction
text_to_predict = "J'aime les pommes de amiens et je voudrais partir de paris pour aller à marseille"

# Process the text using the loaded model
doc = nlp(text_to_predict.lower())

# Extract named entities from the processed document
entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

# Print the extracted entities
print("Extracted Entities:")
for entity in entities:
    print(entity)

Extracted Entities:
('amiens', 21, 27, 'FROM')
('paris', 53, 58, 'FROM')
('marseille', 72, 81, 'TO')


In [63]:
from faker import Faker
import random
import json
import pandas as pd
fake = Faker("fr_FR")  # French locale
df = pd.read_csv("./format_src/format_data_station_city.csv", sep=";")
cities = df["City"].tolist()
def generate_training_data(nbr_data: int):
    training_data = []

    for _ in range(nbr_data):
        from_city = random.choice(cities)
        to_city = random.choice(cities)

        # Ensure unique cities
        while to_city == from_city:
            to_city = random.choice(cities)

        if random.choice([True, False]):
            from_city, to_city = to_city, from_city
        
        is_order_1 = random.choice([True, False, True])

        sentence_template_order_1 = random.choice([
            "Je veux voyager de {} à {}.",
            "il faut que j'aille à {}, je suis à {}",
            "je dois faire le trajet {} {}",
            "{} {}",
            "je suis à {} et j'aimerai allez à {}",
            "de {} comment se rendre à {}",
            "je suis perdu, je pense etre pas loin de {} et je dois allez à {}",
            "depuis hier je pense allez à {}, mais j'ai peur du trajet depuis {}",
            "gare de {} gare de {}"
            

        ])

        sentence_template_order_2 = random.choice([
            "j'aimme les Pommes de Amiens, mais je vais à {} de {}",
            "j'adore {}, comment y allez de {}",
            "je dois me de rendre à {} mais je suis à {}",
            "c'est super {}, comment y allez de {}",
            "je suis pas loin de Paris mais je dois allez à {} depuis {}"

        ])


        sentence = sentence_template_order_1.format(from_city, to_city) if is_order_1 else sentence_template_order_2.format(to_city, from_city)
        
        
        start_from = sentence.find(from_city)
        start_to = sentence.find(to_city)

        entities = [
                (start_from, start_from + len(from_city), "FROM"),
                (start_to, start_to + len(to_city), "TO")
        ]
        training_data.append((sentence, {"entities": entities}))

    return training_data


training_data = generate_training_data(100)

with open("training/spaCyFromToTraining.json", "w+") as file:
    json.dump(training_data, file)






In [1]:
def generate_validation_data(nbr_data: int):
    validation_data = []

    df = pd.read_csv("./format_src/format_data_station_city.csv", sep=";")
    cities = df["City"].tolist()
    for _ in range(nbr_data):
        from_city = random.choice(cities)
        to_city = random.choice(cities)

        # Ensure unique cities
        while to_city == from_city:
            to_city = random.choice(cities)

        if random.choice([True, False]):
            from_city, to_city = to_city, from_city
        
        is_order_1 = random.choice([True, False, True])

        sentence_template_order_1 = random.choice([
            "Je veux voyager de {} à {}.",
            "il faut que j'aille à {}, je suis à {}",
            "je dois faire le trajet {} {}",
            "{} {}",
            "je suis à {} et j'aimerai allez à {}",
            "de {} comment se rendre à {}",
            "je suis perdu, je pense etre pas loin de {} et je dois allez à {}",
            "depuis hier je pense allez à {}, mais j'ai peur du trajet depuis {}",
            "gare de {} gare de {}"
            
        ])

        sentence_template_order_2 = random.choice([
            "j'aimme les Pommes de Amiens, mais je vais à {} de {}",
            "j'adore {}, comment y allez de {}",
            "je dois me de rendre à {} mais je suis à {}",
            "c'est super {}, comment y allez de {}",
            "je suis pas loin de Paris mais je dois allez à {} depuis {}"

        ])

        sentence = sentence_template_order_1.format(from_city, to_city) if is_order_1 else sentence_template_order_2.format(to_city, from_city)
        
        start_from = sentence.find(from_city)
        start_to = sentence.find(to_city)

        # Check if the cities are found in the sentence
        if start_from == -1 or start_to == -1:
            continue

        # Check if the indices match the actual positions of the cities in the sentence
        if sentence[start_from:start_from+len(from_city)] != from_city or sentence[start_to:start_to+len(to_city)] != to_city:
            continue

        entities = [
                (start_from, start_from + len(from_city), "FROM"),
                (start_to, start_to + len(to_city), "TO")
        ]
        validation_data.append((sentence, {"entities": entities}))

    return validation_data


validation_data = generate_validation_data(100)

with open("validation/spaCyFromToValidation.json", "w+") as file:
    json.dump(validation_data, file)


NameError: name 'pd' is not defined

In [18]:
import pandas as pd
import numpy


def extract_unique_station_and_cities():
    df = pd.read_csv('./src/liste-des-gares.csv', sep=';')
    

   
    station_list = numpy.array([])
    city_list = numpy.array([])

    for i in range(len(df)):
        row = df.iloc[i]

        # Traitement des données
        station = row[1].lower()
        city = row[7].lower()

        station_list = numpy.append(station_list, station)
        city_list = numpy.append(city_list, city)
    

    df_res = pd.DataFrame({
        "Station": station_list,
        "City": city_list
    })

    df_res.to_csv('./format_src/format_data_station_city.csv', sep=";")

    
    

extract_unique_station_and_cities()

  station = row[1].lower()
  city = row[7].lower()
