In [1]:
import pandas as pd
import unicodedata
import re
from datetime import datetime
import spacy

In [2]:
df=pd.read_csv("/home/sakkarouis/Downloads/ner_description_dataset.csv")

In [3]:
df

Unnamed: 0,text
0,"Je suis Honor√© Allard, n√©(e) le 30 January 200..."
1,"Je suis Roland Teixeira, n√©(e) le 17 June 1995..."
2,"Je suis Fran√ßoise Gosselin, n√©(e) le 15 August..."
3,"Je suis Olivier Seguin, n√©(e) le 18 October 19..."
4,"Je suis Alfred Lopes-Dupont, n√©(e) le 22 March..."
...,...
295,"Je suis Anouk Roche, n√©(e) le 20 March 2001, √©..."
296,"Je suis No√©mi-Anne Buisson, n√©(e) le 8 Februar..."
297,"Je suis Matthieu Leblanc, n√©(e) le 19 February..."
298,"Je suis Hortense Robert, n√©(e) le 08/03/1995, ..."


## EDA & DATA CLEANING 

In [4]:
def normalize_text(text):
    # enlever les accents
    text = ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )
    # supprimer les espaces multiples
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [5]:
df['text_clear'] = df['text'].apply(normalize_text)

In [6]:
df.tail()

Unnamed: 0,text,text_clear
295,"Je suis Anouk Roche, n√©(e) le 20 March 2001, √©...","Je suis Anouk Roche, ne(e) le 20 March 2001, e..."
296,"Je suis No√©mi-Anne Buisson, n√©(e) le 8 Februar...","Je suis Noemi-Anne Buisson, ne(e) le 8 Februar..."
297,"Je suis Matthieu Leblanc, n√©(e) le 19 February...","Je suis Matthieu Leblanc, ne(e) le 19 February..."
298,"Je suis Hortense Robert, n√©(e) le 08/03/1995, ...","Je suis Hortense Robert, ne(e) le 08/03/1995, ..."
299,"Je suis Vincent Vidal, n√©(e) le 19/03/1996, √©t...","Je suis Vincent Vidal, ne(e) le 19/03/1996, et..."


In [28]:
def normalize_date(text):
    months_en_fr = {
        "january":"01", "jan":"01", "janvier":"01",
        "february":"02", "feb":"02","fevrier":"02",
        "march":"03", "mar":"03","mars":"03",
        "april":"04", "apr":"04","avril":"04",
        "may":"05","mai":"05",
        "june":"06", "jun":"06","juin":"06",
        "july":"07", "jul":"07","juillet":"07",
        "august":"08", "aug":"08","aout":"08",
        "september":"09", "sep":"09","septembre":"09",
        "october":"10", "oct":"10","octobre":"10",
        "november":"11", "nov":"11","novembre":"11",
        "december":"12", "dec":"12","decembre":"12"
    }
    
    def replacer(match):
        day, month, year = match.groups()
        # Si month est num√©rique (02/01/2001)
        if month.isdigit():
            month_num = f"{int(month):02d}"
        else:
            month_num = months_en_fr.get(month.lower(), None)
        return f"{int(day):02d}-{month_num}-{year}"
    
    # 1Ô∏è‚É£ jj/mm/aaaa
    pattern1 = re.compile(r"(\d{1,2})/(\d{1,2})/(\d{4})")
    text = pattern1.sub(replacer, text)
    
    # 2Ô∏è‚É£ jj Month yyyy
    pattern2 = re.compile(r"(\d{1,2})\s([a-zA-Z]+)\s(\d{4})")
    text = pattern2.sub(replacer, text)
    
    return text

In [29]:
df['text_clear'] = df['text_clear'].apply(normalize_date)

In [30]:
df['text_clear']

0      Je suis Honore Allard, ne(e) le 30-01-2005, et...
1      Je suis Roland Teixeira, ne(e) le 17-06-1995, ...
2      Je suis Francoise Gosselin, ne(e) le 15-08-199...
3      Je suis Olivier Seguin, ne(e) le 18-10-1995, e...
4      Je suis Alfred Lopes-Dupont, ne(e) le 22-03-19...
                             ...                        
295    Je suis Anouk Roche, ne(e) le 20-03-2001, etud...
296    Je suis Noemi-Anne Buisson, ne(e) le 08-02-200...
297    Je suis Matthieu Leblanc, ne(e) le 19-02-1995,...
298    Je suis Hortense Robert, ne(e) le 08-03-1995, ...
299    Je suis Vincent Vidal, ne(e) le 19-03-1996, et...
Name: text_clear, Length: 300, dtype: object

In [None]:
!pip install spacy
!python -m spacy download fr_core_news_lg

Collecting fr-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.8.0/fr_core_news_lg-3.8.0-py3-none-any.whl (571.8 MB)
[2K     [38;2;249;38;114m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[38;2;249;38;114m‚ï∏[0m[38;5;237m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m95.4/571.8 MB[0m [31m1.7 MB/s[0m eta [36m0:04:42[0m

## NER using (SPACY)

In [10]:
nlp = spacy.load("fr_core_news_lg")

In [11]:
#text = "Je suis William Hernandez, n√© le 06-10-1998, √©tudiant √† IHEC Sousse, r√©sidant √† Sousse."
#doc = nlp(text)

In [12]:
#doc.ents

In [13]:
#for ent in doc.ents:
 #   print(ent.text, ent.label_)

In [11]:
# Fonction pour extraire les entit√©s
def extract_entities(text):
    doc = nlp(text)
    entities = {"PER": [], "LOC": [], "ORG": []}
    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_].append(ent.text)
    return entities

# Appliquer sur tout le dataset
df['entities'] = df['text_clear'].apply(extract_entities)

# Voir un exemple
print(df[['text_clear', 'entities']].head(1))

                                          text_clear  \
0  Je suis Honore Allard, ne(e) le 30-01-2005, et...   

                                            entities  
0  {'PER': ['Honore Allard'], 'LOC': ['Bizerte'],...  


In [12]:
def extract_email(text):
    match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    return match.group(0) if match else "unknown"

def extract_phone(text):
    match = re.search(r"\+?\d[\d\s]{7,}", text)
    return match.group(0) if match else "unknown"

def extract_birthdate(text):
    pattern = r"(\d{2}-\d{2}-\d{4})"
def extract_location_regex(text):
    # Cherche apr√®s 'r√©sidant', 'habite'
    match = re.search(r"(?:r√©sidant(?:e)?|habite)\s+√†\s+([A-Z][\w\s\-']+)", text)
    if match:
        return match.group(1)
    return None
    
    match = re.search(pattern, text)
    return match.group(0) 
   


In [13]:
def extract_university_regex(text):
    # Cherche apr√®s '√©tudiant', '√©tudier', 'inscrit'
    match = re.search(r"(?:√©tudiant(?:e)?|√©tudier|inscrit(?:e)?)\s+(?:√†|en)\s+([A-Z][\w\s\-']+)", text)
    if match:
        return match.group(1)
    return None


In [14]:
def extract_location_regex(text):
    # Cherche apr√®s 'r√©sidant', 'habite'
    match = re.search(r"(?:r√©sidant(?:e)?|habite)\s+√†\s+([A-Z][\w\s\-']+)", text)
    if match:
        return match.group(1)
    return None


In [26]:
def build_profile(text):
    # spaCy NER
    doc = nlp(text)

    profile = {"name": "unknown", "location": "unknown", 
               "university": "unknown",
               "email": "unknown", "phone": "unknown",
               "birthdate": "unknown"}

    # Liste de mots invalides pour le nom
    invalid_person_words = ["etudiant", "etudiante", "professeur", 
                            "ing√©nieur", "chercheur", "doctorant", "stagiaire"]

    # NER pour Location et University
    for ent in doc.ents:
        if ent.label_ == "LOC":
            profile["location"] = ent.text
        if ent.label_ == "ORG":
            profile["university"] = ent.text

    # Fallback regex pour University et Location
    uni_fallback = extract_university_regex(text)
    if uni_fallback:
        profile["university"] = uni_fallback
    
    loc_fallback = extract_location_regex(text)
    if loc_fallback:
        profile["location"] = loc_fallback

    # üîπ Nom : priorit√© √† la regex "Je suis / Je m'appelle"
    match_name = re.search(
        r"(?:Je m'appelle|Je suis)\s+([A-Z][a-z]+(?:[-\s][A-Z][a-z]+)*)",
        text
    )
    if match_name:
        profile["name"] = match_name.group(1)
    else:
        # Sinon, prendre le premier PER valide d√©tect√© par spaCy
        for ent in doc.ents:
            if ent.label_ == "PER":
                if ent.text.lower() not in invalid_person_words:
                    profile["name"] = ent.text
                    break

    # Email
    email_match = re.search(r"[\w\.-]+@[\w\.-]+", text)
    if email_match:
        profile["email"] = email_match.group(0)
    
    # T√©l√©phone
    phone_match = re.search(r"\+?\d[\d\s]{7,}", text)
    if phone_match:
        profile["phone"] = phone_match.group(0)
    
    # Date de naissance
    date_match = re.search(r"(\d{2}-\d{2}-\d{4})", text)
    if date_match:
        profile["birthdate"] = date_match.group(1)

    return profile


In [27]:
text="Je suis Achraf Sakka Rouis etudiant √† Polytechnique, j'habite √† Sousse.Mon email est achraf.sr@gmail.com Je suis n√© le 12-08-2002. Mon num√©ro : +216 22 333 444."
profile=build_profile(text)
profile

{'name': 'Achraf Sakka Rouis',
 'location': 'Sousse',
 'university': 'Polytechnique',
 'email': 'achraf.sr@gmail.com',
 'phone': '+216 22 333 444',
 'birthdate': '12-08-2002'}

In [17]:
nlp = spacy.load("fr_core_news_lg")
df['profile'] = df['text_clear'].apply(build_profile)

In [18]:
print(df[['text_clear', 'profile']].head(5))

                                          text_clear  \
0  Je suis Honore Allard, ne(e) le 30-01-2005, et...   
1  Je suis Roland Teixeira, ne(e) le 17-06-1995, ...   
2  Je suis Francoise Gosselin, ne(e) le 15-08-199...   
3  Je suis Olivier Seguin, ne(e) le 18-10-1995, e...   
4  Je suis Alfred Lopes-Dupont, ne(e) le 22-03-19...   

                                             profile  
0  {'name': 'Honore Allard', 'location': 'Bizerte...  
1  {'name': 'Roland Teixeira', 'location': 'Bizer...  
2  {'name': 'Francoise Gosselin', 'location': 'Sf...  
3  {'name': 'Olivier Seguin', 'location': 'Tunis'...  
4  {'name': 'Alfred Lopes-Dupont', 'location': 'S...  


In [19]:
df.tail()

Unnamed: 0,text,text_clear,entities,profile
295,"Je suis Anouk Roche, n√©(e) le 20 March 2001, √©...","Je suis Anouk Roche, ne(e) le 20-03-2001, etud...","{'PER': ['Anouk Roche'], 'LOC': ['Sousse'], 'O...","{'name': 'Anouk Roche', 'location': 'Sousse', ..."
296,"Je suis No√©mi-Anne Buisson, n√©(e) le 8 Februar...","Je suis Noemi-Anne Buisson, ne(e) le 08-02-200...","{'PER': ['Anne Buisson'], 'LOC': ['etudier a F...","{'name': 'Noemi-Anne Buisson', 'location': 'Bi..."
297,"Je suis Matthieu Leblanc, n√©(e) le 19 February...","Je suis Matthieu Leblanc, ne(e) le 19-02-1995,...","{'PER': ['Matthieu Leblanc', 'etudiant(e'], 'L...","{'name': 'Matthieu Leblanc', 'location': 'Sfax..."
298,"Je suis Hortense Robert, n√©(e) le 08/03/1995, ...","Je suis Hortense Robert, ne(e) le 08-03-1995, ...","{'PER': ['Hortense Robert', 'etudiant(e'], 'LO...","{'name': 'Hortense Robert', 'location': 'Sfax'..."
299,"Je suis Vincent Vidal, n√©(e) le 19/03/1996, √©t...","Je suis Vincent Vidal, ne(e) le 19-03-1996, et...","{'PER': ['Vincent Vidal', 'etudiant(e'], 'LOC'...","{'name': 'Vincent Vidal', 'location': 'Tunis',..."


In [20]:
# EXPANSION : transformer les dictionnaires en colonnes
profiles_df = pd.json_normalize(df["profile"])

In [21]:
profiles_df

Unnamed: 0,name,location,university,email,phone,birthdate
0,Honore Allard,Bizerte,ENIT,unknown,30-01-2005,30-01-2005
1,Roland Teixeira,Bizerte,unknown,unknown,17-06-1995,17-06-1995
2,Francoise Gosselin,Sfax,ENIT,unknown,15-08-1996,15-08-1996
3,Olivier Seguin,Tunis,ENIT,unknown,18-10-1995,18-10-1995
4,Alfred Lopes-Dupont,Sfax,unknown,unknown,22-03-1998,22-03-1998
...,...,...,...,...,...,...
295,Anouk Roche,Sousse,ENIT,unknown,20-03-2001,20-03-2001
296,Noemi-Anne Buisson,Bizerte,unknown,unknown,08-02-2007,08-02-2007
297,Matthieu Leblanc,Sfax,Universite de Tunis,unknown,19-02-1995,19-02-1995
298,Hortense Robert,Sfax,IHEC,unknown,08-03-1995,08-03-1995


In [22]:
profiles_df.to_csv("profiles.csv", index=False)
print("profiles.csv g√©n√©r√© avec succ√®s !")

profiles.csv g√©n√©r√© avec succ√®s !


### on est besoin du texte original avant / apres le process de nettoyage et NER NLP technologies 
car Si j'exporte juste profiles_df, je perd la r√©f√©rence au texte original.

In [23]:
final_df = pd.concat([df["text"], profiles_df], axis=1) #concatenation horizantal (axis=1)

In [24]:
final_df

Unnamed: 0,text,name,location,university,email,phone,birthdate
0,"Je suis Honor√© Allard, n√©(e) le 30 January 200...",Honore Allard,Bizerte,ENIT,unknown,30-01-2005,30-01-2005
1,"Je suis Roland Teixeira, n√©(e) le 17 June 1995...",Roland Teixeira,Bizerte,unknown,unknown,17-06-1995,17-06-1995
2,"Je suis Fran√ßoise Gosselin, n√©(e) le 15 August...",Francoise Gosselin,Sfax,ENIT,unknown,15-08-1996,15-08-1996
3,"Je suis Olivier Seguin, n√©(e) le 18 October 19...",Olivier Seguin,Tunis,ENIT,unknown,18-10-1995,18-10-1995
4,"Je suis Alfred Lopes-Dupont, n√©(e) le 22 March...",Alfred Lopes-Dupont,Sfax,unknown,unknown,22-03-1998,22-03-1998
...,...,...,...,...,...,...,...
295,"Je suis Anouk Roche, n√©(e) le 20 March 2001, √©...",Anouk Roche,Sousse,ENIT,unknown,20-03-2001,20-03-2001
296,"Je suis No√©mi-Anne Buisson, n√©(e) le 8 Februar...",Noemi-Anne Buisson,Bizerte,unknown,unknown,08-02-2007,08-02-2007
297,"Je suis Matthieu Leblanc, n√©(e) le 19 February...",Matthieu Leblanc,Sfax,Universite de Tunis,unknown,19-02-1995,19-02-1995
298,"Je suis Hortense Robert, n√©(e) le 08/03/1995, ...",Hortense Robert,Sfax,IHEC,unknown,08-03-1995,08-03-1995


In [25]:
final_df.to_csv("profiles.csv", index=False)