In [9]:
# Imports used throughout the notebook (data loading, spaCy NER, HTTP requests).
import pandas as pd
from spacy.lang.fr import French
import spacy
import re
import csv
from joblib import Parallel, delayed
import requests


In [None]:
!python -m spacy download fr_core_news_sm
!pip install joblib

# 1. Preprocessing
Preprocessing was already done on the dataset.


# 2. Entity Recognition


In [11]:
# Load the dataset; the only non-default option is the tab separator
# (the file is tab-delimited despite its .csv extension).
data_df = pd.read_csv("data.csv", sep="\t")

# Display the first few rows to understand the data structure, especially the 'description' column.
data_df.head()


Unnamed: 0,title,description,date,order,presenter,editor,url,urlTvNews,containsWordGlobalWarming,media,month,day
0,,,2013-11-02T00:00:00.000Z,21.0,Laurent Delahousse,,https://www.francetvinfo.fr/sujet_449888.html,https://www.francetvinfo.fr/replay-jt/france-2...,False,France 2,11.0,2.0
1,"\Cash Investigation\"" ce soir à 22h30""",On avait parlé de l'évasion fiscale des multin...,2013-09-18T00:00:00.000Z,10.0,Elise Lucet,,https://www.francetvinfo.fr/cash-investigation...,https://www.francetvinfo.fr/replay-jt/france-2...,False,France 2,9.0,18.0
2,"\Elle Man\"", nouveau magazine en kiosque""",Voici leurs portraits. Nous ne les oublions pa...,2013-10-02T00:00:00.000Z,22.0,David Pujadas,,https://www.francetvinfo.fr/elle-man-nouveau-m...,https://www.francetvinfo.fr/replay-jt/france-2...,False,France 2,10.0,2.0
3,11 novembre : incidents pendant les commémorat...,"Autre titre, les cérémonies du 1. Novembre. On...",2013-11-11T00:00:00.000Z,9.0,David Pujadas,,https://www.francetvinfo.fr/11-novembre-incide...,https://www.francetvinfo.fr/replay-jt/france-2...,False,France 2,11.0,11.0
4,132 millions d'euros : coût et rapport,"132 millions d'euros, c'est bien sûr un gain q...",2013-03-29T23:00:00.000Z,6.0,Laurent Delahousse,,https://www.francetvinfo.fr/132-millions-d-eur...,https://www.francetvinfo.fr/replay-jt/france-2...,False,France 2,3.0,29.0


In [12]:
# Instantiate spaCy's French language class.
# NOTE(review): `nlp` is not referenced by any other cell visible here (the NER
# cells use `nlp_fr`) — confirm whether this object is still needed.
nlp = French()

In [13]:
# TODO: not yet available — once the 'description_cleaned' column exists, run NER on
# data_df['description_cleaned'] instead of the raw 'description' column.

# Make every description a string. Missing values are filled with '' first,
# because astype(str) alone would turn NaN into the literal text 'nan', which
# would then be processed as if it were real content.
data_df['description'] = data_df['description'].fillna('').astype(str)
# Load the French NLP model from Spacy
nlp_fr = spacy.load("fr_core_news_sm")

def extract_entities(text):
    """Return the person entities found in `text`.

    The text is run through the module-level `nlp_fr` pipeline and every
    entity labelled 'PER' is returned, in document order, as a
    (entity text, entity label) tuple.
    """
    parsed = nlp_fr(text)
    people = []
    for span in parsed.ents:
        # Only person entities are of interest; every other label is skipped.
        if span.label_ in ['PER']:
            people.append((span.text, span.label_))
    return people

# Run entity extraction over the whole 'description' column, one row at a time
sample_descriptions = data_df['description']
data_df['entities_in_descriptions'] = sample_descriptions.map(extract_entities)

# Show the person entities found for each description
data_df['entities_in_descriptions']


0                                                      []
1                                                      []
2                                           [(Man\, PER)]
3       [(François Hollande, PER), (Manuel Valls, PER)...
4              [(David DOUKHAN, PER), (Daniel LEVY, PER)]
                              ...                        
3837                                                   []
3838       [(Toni Muselin, PER), (Francesco Pisano, PER)]
3839                            [(Vladimir Poutine, PER)]
3840                            [(Francesco Pisanu, PER)]
3841           [(David Lefort, PER), (David Lefort, PER)]
Name: entities_in_descriptions, Length: 3842, dtype: object

In [17]:
# Reload the French model with the "tagger" and "parser" components disabled
nlp_fr = spacy.load(
    "fr_core_news_sm",
    disable=["tagger", "parser"],
)

# Feed all descriptions through the pipeline in batches via nlp.pipe
# (batch_size can be tuned to the machine running the notebook)
texts = data_df['description'].astype(str).tolist()
entities = list(
    nlp_fr.pipe(texts, batch_size=50, disable=["tagger", "parser"])
)

# For each processed document keep only the (text, label) pairs of person entities
data_df['entities_in_descriptions'] = [
    [(span.text, span.label_) for span in parsed.ents if span.label_ == 'PER']
    for parsed in entities
]




In [19]:
# Save the frame (including the 'entities_in_descriptions' column) without the index.
# NOTE(review): the entity lists are presumably written as their string
# representation — confirm how they are meant to be parsed when read back.
data_df.to_csv('data_ner.csv', index=False)

# 3. Gender Prediction


In [None]:
# Cache of name -> predicted gender, so each distinct name costs at most one
# successful HTTP request while the kernel is alive.
gender_cache = {}

def gender_predictor(name):
    """Predict the gender associated with a first name using the genderize.io API.

    Parameters
    ----------
    name : str
        The first name to look up.

    Returns
    -------
    str
        The gender reported by the API (e.g. 'female' or 'male'), or
        'unknown' when the API gives no prediction, answers with a non-200
        status code, or cannot be reached.
    """
    # Check cache first
    if name in gender_cache:
        return gender_cache[name]

    try:
        # Passing the name through `params` lets requests URL-encode it
        # (spaces, accents, '&', ...); the timeout keeps one stalled
        # connection from blocking a long batch of lookups.
        response = requests.get(
            'https://api.genderize.io',
            params={'name': name},
            timeout=10,
        )
    except requests.RequestException:
        # Network failure: treated like a non-200 answer and, like it, not
        # cached, so a later call can retry.
        return 'unknown'

    if response.status_code != 200:
        return 'unknown'

    # The API can answer with "gender": null for a name it cannot classify.
    # dict.get's default only applies to a *missing* key, so None is
    # normalised to 'unknown' explicitly.
    gender = response.json().get('gender') or 'unknown'
    # Store the result in the cache
    gender_cache[name] = gender
    return gender


# Example of usage: look up a single first name and show the returned label
name_gender = gender_predictor('Alice')
print(name_gender)  # Prints the value returned by gender_predictor, e.g. 'female'


female


In [18]:
# Split every detected person entity into a first name and a last name.
# Heuristic: the first whitespace-separated token is taken as the first name
# and everything after it as the last name; a single-token entity is stored
# as a first name with an empty last name.
first_names = []
last_names = []

# The loop variables are deliberately not called `entities`, so the
# notebook-level `entities` list from the NER cell is not overwritten.
for row_entities in data_df['entities_in_descriptions']:
    for person in row_entities:
        name_parts = person[0].split()
        if not name_parts:
            # Entity text was empty or whitespace-only: there is no name to
            # record, and indexing name_parts[0] would raise IndexError.
            continue
        first_names.append(name_parts[0])
        # ' '.join([]) == '', so a single-token entity gets an empty last name.
        last_names.append(' '.join(name_parts[1:]))

# Create a new DataFrame with one row per extracted entity
names_df = pd.DataFrame({'First Name': first_names, 'Last Name': last_names})


In [16]:
# Keep one row per distinct first name; the first occurrence is the one retained
unique_names_df = names_df.drop_duplicates(subset='First Name', keep='first')

# Write the de-duplicated names to disk without the index column
unique_names_df.to_csv(
    'names_dataset.csv',
    index=False,
)

In [17]:
unique_names_df  # TODO: reset the index — the displayed index has gaps after drop_duplicates

Unnamed: 0,First Name,Last Name
0,Mélenchon,
2,Thierry,Pouret
3,JeanMarie,Bachelet
4,François,Fillon
5,Alain,Juppé
...,...,...
13718,Huckabee,Sanders
13719,Reince,Priebus
13733,Riyad,Arabie
13737,Europe,Vatican
