In [44]:
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import pandas as pd
from faker import Faker
import numpy as np
import random
from textblob import TextBlob
import requests
import re

## Define Helper Functions
This section includes functions to simulate the generation of person attributes such as age, weight, geographic code, sentiment analysis, and more.


In [45]:
def find_random_age_by_gender(sexe):
    age_data = {
        "25-29 ans": {"Femme": 1897867, "Homme": 1887312},
    }
    age_group = random.choice(list(age_data.keys()))
    return random.randint(25, 29), age_group

def estimer_poids(taille_cm, imc_cible=22):
    taille_m = taille_cm / 100
    return imc_cible * (taille_m ** 2)

def get_iris_code_from_coordinates(latitude, longitude):
    return "IRIS_code_example"

def analyze_sentiment_textblob(text):
    blob = TextBlob(text)
    return {"polarity": blob.sentiment.polarity, "subjectivity": blob.sentiment.subjectivity}

def statcouple(iris_code, basecouple):
    return {"unionlibre": 0.1, "pacsée": 0.2, "marier": 0.3, "veuf": 0.1, "divorce": 0.2, "celib": 0.1, "inconnu": 0}

def random_status(proba_statuts):
    statuts = list(proba_statuts.keys())
    probabilities = list(proba_statuts.values())
    return np.random.choice(statuts, p=probabilities)

def age_to_interval(age_str):
    try:
        # Utiliser des expressions régulières pour extraire les nombres
        numbers = re.findall(r'\d+', age_str)
        if age_str == "Ensemble" or "en millions" in age_str:
            return None
        if "plus" in age_str:
            # Gérer les cas comme "65 ans ou plus"
            return (int(numbers[0]), float(150))
        elif len(numbers) >= 2:
            # Gérer les cas avec deux nombres, comme "15 à 19 ans"
            return (int(numbers[0]), int(numbers[1]))
        else:
            raise ValueError("Format d'âge non reconnu")
    except ValueError as e:
        print(f"Erreur avec l'entrée : '{age_str}' - {e}")
        raise

def find_age_interval(age, df):
    for interval in df['Age Range']:
        if interval != None and interval[0] <= age and age <= interval[1]:
            return interval
    return "Âge non trouvé dans les intervalles"

def get_proportion(age, gender, df):
    # Trouver l'intervalle d'âge
    interval = find_age_interval(age, df)
    if interval == "Âge non trouvé dans les intervalles":
        return interval
    
    # Sélectionner la ligne correspondante à l'intervalle d'âge
    row = df[df['Age Range'] == interval]
    
    # Sélectionner la colonne en fonction du sexe
    if gender.lower() == 'femme':
        proportion = row['Femmes'].values[0]
    elif gender.lower() == 'homme':
        proportion = row['Hommes'].values[0]
    else:
        return "Sexe non reconnu"
    
    # Construire la phrase récapitulative
    return proportion

## Define the Main Function for Person Creation
This function utilizes the Faker library to generate fake personal information and incorporates previously defined helper functions to simulate complex attributes.

In [46]:
def create_person(id, basecouple, datacouple, personnevivantseul):
    fake = Faker('fr_FR')
    fake.seed_instance(id)
    
    sexe = random.choice(["Homme", "Femme"])
    taille = np.random.normal(175, 7) if sexe == "Homme" else np.random.normal(162, 7)
    age, age_group = find_random_age_by_gender(sexe)
    
    bio = "This is a sample biography."
    # Emotion probabilities
    emotions = {
        'angry': 0.10,   # 10% chance
        'disgust': 0.05,  # 5% chance
        'fear': 0.10,    # 10% chance
        'happy': 0.40,   # 40% chance
        'sad': 0.20,     # 20% chance
        'surprise': 0.10, # 10% chance
        'neutral': 0.05   # 5% chance
    }
    emotion = random.choices(list(emotions.keys()), weights=emotions.values(), k=1)[0]

    # Race probabilities
    races = ['asian', 'indian', 'black', 'white', 'middle eastern', 'latino hispanic']
    weights = [0.05, 0.02, 0.03, 0.85, 0.03, 0.02]  # Example values
    race = random.choices(races, weights=weights, k=1)[0]

    # Education and family status (simplified logic)
    pourcentage_seul = get_proportion(age, sexe, personnevivantseul)   
    seul = random.random() < ((pourcentage_seul+20) / 100)
    couple_meme_sexe = random.choice([True, False])
    enfant = random.choice([True, False])
    education_bac = random.choice([True, False])
    education_bac_plus_3 = education_bac and random.choice([True, False])

    latitude, longitude = 0, 0
    iris_code = get_iris_code_from_coordinates(latitude, longitude)
    sentiment = analyze_sentiment_textblob(bio)
    proba_statuts = statcouple(iris_code, basecouple)
    statut_relationnel = random_status(proba_statuts)
    pourcentage_seul = get_proportion(age, sexe, personnevivantseul)
    seul = random.random() < pourcentage_seul / 100
    
    person_data = {
        'id': id,
        'nom': fake.last_name(),
        'prenom': fake.first_name(),
        'age': age,
        'sexe': sexe,
        'taille': taille,
        'poids': estimer_poids(taille),
        'bio': bio,
        'emotion': emotion,
        'race': race,
        'seul': seul,
        'couple_meme_sexe': couple_meme_sexe,
        'enfant': enfant,
        'education_bac': education_bac,
        'education_bac_plus_3': education_bac_plus_3,
        'latitude': latitude,
        'longitude': longitude,
        'IRIS code': iris_code,
        'sentiment': sentiment,
        'statut relationnel': statut_relationnel,
        'vivre seul': seul,
    }
    return person_data

## Generate Person Data
This section describes the use of parallel processing to efficiently generate data for a large number of individuals.

In [ ]:
basecouple = pd.read_excel("data//base-ic-couples-familles-menages-2020.xlsx", sheet_name="IRIS")
basecouple = basecouple.drop(index=0)

In [ ]:
datacouple = pd.read_excel("data//ip1774.xls", sheet_name="Figure 1")
datacouple = datacouple.dropna()

In [ ]:
personnevivantseul = pd.read_excel("data//demo-couple-pers-seul-log-age.xlsx")
if not isinstance(personnevivantseul['Age'].iloc[0], pd.Interval):
    personnevivantseul['Age Range'] = personnevivantseul['Age'].apply(age_to_interval)

if not isinstance(datacouple['Age'].iloc[0], pd.Interval):
    datacouple['Age Range'] = datacouple['Age'].apply(age_to_interval)

In [48]:
def generate_person_data(num_defunts, basecouple, datacouple, personnevivantseul):
    iterable = tqdm(range(1, num_defunts + 1), desc="Generating persons")
    results = Parallel(n_jobs=-1)(delayed(create_person)(i, basecouple, datacouple, personnevivantseul) for i in iterable)
    return results

if __name__ == "__main__":
    num_defunts = 100
    defunts_data = generate_person_data(num_defunts, basecouple, datacouple, personnevivantseul)
    df_defunts = pd.DataFrame(defunts_data)
    df_defunts.to_excel("MultiGeneratedData.xlsx", index=False)
    print(df_defunts.head())


Generating persons:   0%|          | 0/100 [00:00<?, ?it/s]

   id      nom   prenom  age   sexe      taille      poids  \
0   1    Lucas   Hélène   27  Femme  170.581370  64.015608   
1   2  Lefèvre     Éric   26  Femme  166.379516  60.900716   
2   3    Gomez   Jeanne   27  Homme  177.644718  69.426820   
3   4     Huet  Thibaut   27  Homme  171.246588  64.515867   
4   5  Loiseau   Pierre   27  Femme  161.460959  57.353211   

                           bio emotion   race  ...  couple_meme_sexe  enfant  \
0  This is a sample biography.     sad  white  ...              True    True   
1  This is a sample biography.    fear  white  ...             False   False   
2  This is a sample biography.   angry  white  ...              True    True   
3  This is a sample biography.   angry  white  ...              True    True   
4  This is a sample biography.   happy  white  ...              True    True   

   education_bac  education_bac_plus_3  latitude  longitude  \
0           True                 False         0          0   
1          False    