In [88]:
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import pandas as pd
from faker import Faker
import numpy as np
import random
from textblob import TextBlob
import requests
import re
from transformers import pipeline

## Define Helper Functions
This section includes functions to simulate the generation of person attributes such as age, weight, geographic code, sentiment analysis, and more.


In [89]:
def find_random_age_by_gender(sexe):
    age_data = {
        "25-29 ans": {"Femme": 1897867, "Homme": 1887312},
    }
    age_group = random.choice(list(age_data.keys()))
    return random.randint(25, 29), age_group

def estimer_poids(taille_cm, imc_cible=22):
    taille_m = taille_cm / 100
    return imc_cible * (taille_m ** 2)

def get_iris_code_from_coordinates(latitude, longitude):
    return "IRIS_code_example"

def analyze_sentiment_textblob(text):
    blob = TextBlob(text)
    return {"polarity": blob.sentiment.polarity, "subjectivity": blob.sentiment.subjectivity}

def analyze_sentiment_transformers(text):
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    classifier = pipeline("sentiment-analysis", model=model_name)
    return classifier(text)

def statcouple(iris_code, basecouple):
    return {"union_libre": 0.1, "pacsée": 0.2, "marier": 0.3, "veuf": 0.1, "divorce": 0.2, "celib": 0.1, "inconnu": 0}

def random_status(proba_statuts):
    statuts = list(proba_statuts.keys())
    probabilities = list(proba_statuts.values())
    return np.random.choice(statuts, p=probabilities)

def age_to_interval(age_str):
    try:
        # Utiliser des expressions régulières pour extraire les nombres
        numbers = re.findall(r'\d+', age_str)
        if age_str == "Ensemble" or "en millions" in age_str:
            return None
        if "plus" in age_str:
            # Gérer les cas comme "65 ans ou plus"
            return (int(numbers[0]), float(150))
        elif len(numbers) >= 2:
            # Gérer les cas avec deux nombres, comme "15 à 19 ans"
            return (int(numbers[0]), int(numbers[1]))
        else:
            raise ValueError("Format d'âge non reconnu")
    except ValueError as e:
        print(f"Erreur avec l'entrée : '{age_str}' - {e}")
        raise

def find_age_interval(age, df):
    for interval in df['Age Range']:
        if interval != None and interval[0] <= age and age <= interval[1]:
            return interval
    return "Âge non trouvé dans les intervalles"

def get_proportion(age, gender, df):
    # Trouver l'intervalle d'âge
    interval = find_age_interval(age, df)
    if interval == "Âge non trouvé dans les intervalles":
        return interval
    
    # Sélectionner la ligne correspondante à l'intervalle d'âge
    row = df[df['Age Range'] == interval]
    
    # Sélectionner la colonne en fonction du sexe
    if gender.lower() == 'femme':
        proportion = row['Femmes'].values[0]
    elif gender.lower() == 'homme':
        proportion = row['Hommes'].values[0]
    else:
        return "Sexe non reconnu"
    
    # Construire la phrase récapitulative
    return proportion

def random_with_proba(probability):
    """Renvoie True avec la probabilité indiquée, sinon False."""
    return random.random() < (round(float(probability)) / 100)

def education_famille(data_for_gender):
    output = {}
    # Déterminons si la personne fait partie d'un couple de même sexe
    
    output["couple_meme_sexe"] = random_with_proba(data_for_gender[data_for_gender.index[0]])

    # Déterminons le niveau d'éducation de la personne
    # On suppose que l'éducation est indépendante du fait d'être en couple de même sexe ou non
    output["education_bac_plus_3"] = False
    output["education_bac"] = random_with_proba(data_for_gender[data_for_gender.index[2]]) if output["couple_meme_sexe"] \
                    else random_with_proba(data_for_gender[data_for_gender.index[3]])
    if output["education_bac"]:
        output["education_bac_plus_3"] = random_with_proba(data_for_gender[data_for_gender.index[4]]) if output["couple_meme_sexe"] \
                            else random_with_proba(data_for_gender[data_for_gender.index[5]])

    # On suppose ici que le fait d'avoir un enfant est indépendant du niveau d'éducation et du type de couple
    output["enfant"] = random_with_proba(data_for_gender[data_for_gender.index[-1]]) if output["couple_meme_sexe"] \
            else False  # Nous n'avons pas de données pour les couples de sexe différent avec enfants
    return output

def get_data_couple_age(age, gender, df):
    # Adjust the filter to account for both singular and plural forms of "Homme/Hommes"
    gender_str = 'Homme' if gender.lower() == 'homme' else 'Femme'
    
    # Find the age interval
    interval = find_age_interval(age, df)
    if interval == "Âge non trouvé dans les intervalles":
        return interval
    
    # Select the row corresponding to the age interval
    row = df[df['Age Range'] == interval]
    
    # Filter the columns based on gender, allowing for both 'Homme' and 'Hommes'
    gender_columns = [col for col in df.columns if gender_str in col]
    
    # Select only the columns that match the gender
    gender_data = row[gender_columns].iloc[0]  # Use iloc[0] to select the first (and only) row as a Series
    
    return gender_data  # Convert to dictionary for easier readability

## Define the Main Function for Person Creation
This function utilizes the Faker library to generate fake personal information and incorporates previously defined helper functions to simulate complex attributes.

In [90]:
def create_person(id, basecouple, datacouple, personnevivantseul):
    fake = Faker('fr_FR')
    fake.seed_instance(id)
    
    sexe = random.choice(["Homme", "Femme"])
    taille = np.random.normal(175, 7) if sexe == "Homme" else np.random.normal(162, 7)
    age, age_group = find_random_age_by_gender(sexe)
    
    bio = random.choice(bios).strip()
    
    sentiment = analyze_sentiment_transformers(bio)
    # Emotion probabilities
    emotions = {
        'angry': 0.10,   # 10% chance
        'disgust': 0.05,  # 5% chance
        'fear': 0.10,    # 10% chance
        'happy': 0.40,   # 40% chance
        'sad': 0.20,     # 20% chance
        'surprise': 0.10, # 10% chance
        'neutral': 0.05   # 5% chance
    }
    emotion = random.choices(list(emotions.keys()), weights=emotions.values(), k=1)[0]

    # Race probabilities
    races = ['asian', 'indian', 'black', 'white', 'middle eastern', 'latino hispanic']
    weights = [0.05, 0.02, 0.03, 0.85, 0.03, 0.02]  # Example values
    race = random.choices(races, weights=weights, k=1)[0]

    # Education and family status (simplified logic)
    pourcentage_seul = get_proportion(age, sexe, personnevivantseul)   
    seul = random.random() < ((pourcentage_seul+20) / 100)
    
    data_for_gender = get_data_couple_age(age,  sexe, datacouple)
    result = education_famille(data_for_gender)
    couple_meme_sexe = result["couple_meme_sexe"]
    enfant = result["enfant"]
    education_bac = result["education_bac"]
    education_bac_plus_3 = result["education_bac_plus_3"]

    latitude, longitude = 0, 0
    iris_code = get_iris_code_from_coordinates(latitude, longitude)
    
    
    proba_statuts = statcouple(iris_code, basecouple)
    statut_relationnel = random_status(proba_statuts)
    
    pourcentage_seul = get_proportion(age, sexe, personnevivantseul)
    seul = random.random() < pourcentage_seul / 100
    
    person_data = {
        'id': id,
        'nom': fake.last_name(),
        'prenom': fake.first_name(),
        'age': age,
        'sexe': sexe,
        'taille': taille,
        'poids': estimer_poids(taille),
        'bio': bio,
        'sentiment': sentiment,
        'photo': '',
        'emotion': emotion,
        'race': race,
        'seul': seul,
        'couple_meme_sexe': couple_meme_sexe,
        'enfant': enfant,
        'education_bac': education_bac,
        'education_bac_plus_3': education_bac_plus_3,
        'latitude': latitude,
        'longitude': longitude,
        'IRIS code': iris_code,
        'statut relationnel': statut_relationnel,
        'vivre seul': seul,
    }
    return person_data

## Load Data for Person Generation

In [91]:
basecouple = pd.read_excel("data//base-ic-couples-familles-menages-2020.xlsx", sheet_name="IRIS")
basecouple = basecouple.drop(index=0)

In [92]:
datacouple = pd.read_excel("data//ip1774.xls", sheet_name="Figure 1")
datacouple = datacouple.dropna()

In [93]:
personnevivantseul = pd.read_excel("data//demo-couple-pers-seul-log-age.xlsx")
if not isinstance(personnevivantseul['Age'].iloc[0], pd.Interval):
    personnevivantseul['Age Range'] = personnevivantseul['Age'].apply(age_to_interval)

if not isinstance(datacouple['Age'].iloc[0], pd.Interval):
    datacouple['Age Range'] = datacouple['Age'].apply(age_to_interval)

In [94]:
with open('Biotinder.txt', 'r', encoding='utf-8') as file:
        bios = file.readlines()

## Generate Person Data
This section describes the use of parallel processing to efficiently generate data for a large number of individuals.

In [95]:
def generate_person_data(num_defunts, basecouple, datacouple, personnevivantseul):
    iterable = tqdm(range(1, num_defunts + 1), desc="Generating persons")
    results = Parallel(n_jobs=-1)(delayed(create_person)(i, basecouple, datacouple, personnevivantseul) for i in iterable)
    return results

if __name__ == "__main__":
    num_defunts = 100
    defunts_data = generate_person_data(num_defunts, basecouple, datacouple, personnevivantseul)
    df_defunts = pd.DataFrame(defunts_data)
    df_defunts.to_excel("MultiGeneratedData.xlsx", index=False)
    print(df_defunts.head())


Generating persons:   0%|          | 0/100 [00:00<?, ?it/s]

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint 

   id      nom   prenom  age   sexe      taille      poids  \
0   1    Lucas   Hélène   29  Homme  169.530573  63.229353   
1   2  Lefèvre     Éric   27  Femme  157.846593  54.814204   
2   3    Gomez   Jeanne   25  Homme  180.047973  71.318000   
3   4     Huet  Thibaut   29  Homme  195.528549  84.109110   
4   5  Loiseau   Pierre   28  Homme  176.678304  68.673490   

                                                 bio  \
0  Amateur d'astronomie, cherche étoile pour obse...   
1  Mélomane de comédies musicales, cherche quelqu...   
2  Photographe de nature, cherche regard pour cap...   
3  Fan de mode et de design, cherche âme sœur pou...   
4  Amateur de livres, de jazz et de conversations...   

                                           sentiment photo  ...   seul  \
0  [{'label': 'POSITIVE', 'score': 0.703760206699...        ...   True   
1  [{'label': 'NEGATIVE', 'score': 0.682659327983...        ...  False   
2  [{'label': 'POSITIVE', 'score': 0.997094035148...        ...   Tr

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
