### Extraction des données brutes

In [81]:
import pandas as pd
from langdetect import detect

def charger_donnees(chemin, encoding=None):
    """Load a delimited text file into a DataFrame, sniffing the separator.

    Args:
        chemin: Path or file-like object handed to ``pandas.read_csv``.
        encoding: Optional text encoding forwarded to ``pandas.read_csv``.
            ``None`` (the default) keeps pandas' own default, so existing
            one-argument calls behave exactly as before.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    # sep=None asks the python engine to detect the delimiter itself, which
    # is why engine='python' is required here.
    return pd.read_csv(chemin, sep=None, engine='python', encoding=encoding)

# Load the three raw inputs used by the rest of the notebook: the article
# export, the editions table and the first-name -> gender lookup table.
df = charger_donnees('sources/export_articles_EGC_2004_2018.csv')
dfEditions = charger_donnees('sources/edition_EGC.csv')
# Read directly with pd.read_csv because this file needs an explicit encoding.
# NOTE(review): 'ansi' appears to be a Windows-only codec alias (mbcs), so this
# line would presumably fail with LookupError on Linux/macOS — confirm the
# file's real encoding (e.g. cp1252) before changing it.
dfGenrePrenoms = pd.read_csv("table_genres_clean.csv", sep=None, engine='python', encoding='ansi')


### Table Articles

In [None]:
def identifier_langue(texte):
    """Return the language detected for *texte*.

    Args:
        texte: Value to analyse; it is converted with ``str()`` before
            being passed to ``detect``.

    Returns:
        ``"vide"`` when *texte* is missing (``pd.isna``) or blank after
        stripping, ``"erreur"`` when anything in the detection raises, and
        otherwise whatever ``detect`` returns.
    """
    try:
        if pd.isna(texte) or str(texte).strip() == "":
            return "vide"
        return detect(str(texte))
    except Exception:
        # Best-effort labelling: a failed detection must not abort a whole
        # column-wise .apply(). Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit still stop a long-running cell.
        return "erreur"

### Table Authors

In [92]:
def authors_info(df, col='authors'):
    """Build the table of distinct authors from a comma-separated column.

    Each cell of ``df[col]`` is split on ``', '``; the resulting names are
    stripped, missing values are discarded and duplicates removed (first
    occurrence wins, so ids follow order of first appearance).

    Args:
        df: DataFrame holding the author lists.
        col: Name of the column containing the ``', '``-separated names.

    Returns:
        A DataFrame with columns ``name``, ``id_author`` (1-based running
        number) and ``gender`` (placeholder value ``"pas encore"``).
    """
    # One row per individual name, without the missing entries.
    exploded_names = df[col].str.split(', ').explode().dropna()

    # Normalise whitespace before de-duplicating so "X" and " X" collapse.
    distinct_names = (
        exploded_names.str.strip().drop_duplicates().reset_index(drop=True)
    )

    authors = distinct_names.to_frame(name='name')
    return authors.assign(
        id_author=range(1, len(authors) + 1),
        gender="pas encore",
    )


def authors_gender(dfAuthors, dfGenrePrenoms):
    """Fill the ``gender`` column of the authors table from a first-name lookup.

    The first whitespace-separated token of ``name`` is lower-cased and looked
    up in ``dfGenrePrenoms`` (key column ``prenom_cle``, value column
    ``genre_estime``). Neither input frame is modified.

    Args:
        dfAuthors: DataFrame with at least a ``name`` column.
        dfGenrePrenoms: DataFrame with ``prenom_cle`` and ``genre_estime``
            columns.

    Returns:
        A copy of ``dfAuthors`` whose ``gender`` column holds the looked-up
        value, or ``"pas encore"`` when the first name is unknown.
    """
    dfAuthors = dfAuthors.copy()

    # Strip BEFORE splitting: stripping the first token afterwards cannot
    # recover from leading whitespace (" Jean Dupont" would yield "").
    first_names = dfAuthors['name'].str.strip().str.split().str[0].str.lower()

    lookup = dfGenrePrenoms[['prenom_cle', 'genre_estime']].copy()
    lookup['prenom_cle'] = lookup['prenom_cle'].str.lower().str.strip()
    # A missing key must never be matched against a missing first name.
    lookup = lookup.dropna(subset=['prenom_cle'])

    # With duplicated keys the last row wins, as with any dict construction.
    prenom_to_genre = lookup.set_index('prenom_cle')['genre_estime'].to_dict()

    dfAuthors['gender'] = first_names.map(prenom_to_genre).fillna('pas encore')

    return dfAuthors



### Table de liaison entre Articles et Authors

In [90]:
def correspondance_table_authors_and_articles(dfAu, dfAr) :
    """Build the article/author link table.

    Each ``authors`` cell of ``dfAr`` is split on ``', '`` into one row per
    name, and every name is matched against ``dfAu`` to obtain its id. A
    preview of the exploded names and of the merged result is printed.

    Args:
        dfAu: Authors table with ``id_author`` and ``name`` columns.
        dfAr: Articles table with ``id_article`` and ``authors`` columns.

    Returns:
        A DataFrame with columns ``id_author`` and ``id_article``, one row per
        (article, author) pair whose name exists in ``dfAu``.
    """
    df_tmp = (
        dfAr[['id_article', 'authors']]
        .assign(authors=dfAr['authors'].str.split(', '))
        .explode('authors')
        .rename(columns={'authors': 'name'})
        # Articles without authors explode to a missing name: no link to build.
        .dropna(subset=['name'])
        # Strip surrounding whitespace so the names are normalised the same
        # way as in authors_info; otherwise the inner merge below silently
        # drops links for names with stray spaces.
        .assign(name=lambda frame: frame['name'].str.strip())
    )

    print(df_tmp.head())

    df_link = df_tmp.merge(
        dfAu[['id_author', 'name']],
        on='name',
        how='inner'
    )

    print(df_link.head())

    return df_link[['id_author', 'id_article']].reset_index(drop=True)

### Table Editions

In [4]:
def nb_articles_per_edition(df_art, df_edit):
    """Attach the number of articles per year to the editions table.

    Args:
        df_art: Articles DataFrame with a ``year`` column.
        df_edit: Editions DataFrame with a ``year`` column. It is not
            modified; a pre-existing ``nb_articles`` column is recomputed.

    Returns:
        A new DataFrame with the rows of ``df_edit`` plus an integer
        ``nb_articles`` column (0 for years without any article).
    """
    counts = df_art['year'].value_counts().reset_index()
    # Name the columns explicitly: the labels produced by reset_index() on a
    # value_counts() result are not relied upon here.
    counts.columns = ['year', 'nb_articles']

    # Drop any previous count so the function is idempotent. Merging into a
    # frame that already has nb_articles would create nb_articles_x/_y and
    # make the lookup below raise KeyError (e.g. when a cell is re-run on
    # its own output).
    editions = df_edit.drop(columns=['nb_articles'], errors='ignore')

    df_result = editions.merge(counts, on='year', how='left')
    df_result['nb_articles'] = df_result['nb_articles'].fillna(0).astype(int)

    return df_result

### Export des tables vers des fichiers CSV

In [None]:
# Articles DataFrame: one row per article, keyed by a 1-based id_article.
# Language detection on the title
dfArticles = df[['title', 'year']].copy()
dfArticles.insert(0, 'id_article', range(1, len(dfArticles) + 1))
dfArticles['language'] = df['title'].apply(identifier_langue)
# Number of authors per article
# NOTE(review): count_authors is not defined in the visible part of this
# file — confirm it is defined in another cell before running this one.
dfArticles['Nb_authors'] = df['authors'].apply(count_authors)
#dfArticles['Nb_male'] = dfArticles['Nb_authors']
#dfArticles['Nb_female'] = dfArticles['Nb_authors']
# Placeholder values ("pas encore") until these columns are computed.
dfArticles['theme'] = "pas encore"
dfArticles['keywords'] = "pas encore"

dfArticles.to_csv('tables/articles.csv', index=False)

In [96]:
# Authors DataFrame: distinct author names with ids, then gender filled in
# from the first-name lookup table. (The former empty placeholder frame was
# removed: it was overwritten by authors_info() before ever being used.)
dfAuthors = authors_info(df)
dfAuthors = authors_gender(dfAuthors, dfGenrePrenoms)

dfAuthors.to_csv('tables/authors.csv', index=False)

In [97]:
# Articles_Authors DataFrame: link table between article ids and author ids.
# The raw frame needs an id_article column for the join; it is added in place
# only once so that re-running the cell does not fail on a duplicate insert.
if 'id_article' not in df.columns:
    df.insert(0, 'id_article', range(1, len(df) + 1))
dfArticles_Authors = correspondance_table_authors_and_articles(dfAuthors, df)

dfArticles_Authors.to_csv('tables/articles_authors.csv', index=False)

   id_article                 name
0           1     Claudia Marinica
0           1        Julien Longhi
0           1        Nader Hassine
0           1  Abdulhafiz Alkhouli
0           1         Boris Borzic
   id_article                 name  id_author
0           1     Claudia Marinica          1
1           1        Julien Longhi          2
2           1        Nader Hassine          3
3           1  Abdulhafiz Alkhouli          4
4           1         Boris Borzic          5


In [None]:
# Editions DataFrame: editions table enriched with the article count per year.
# NOTE: dfEditions is rebound to the enriched result, so re-running this cell
# passes a frame that already carries nb_articles back into the function.
dfEditions = nb_articles_per_edition(df, dfEditions)

dfEditions.to_csv('tables/editions.csv', index=False)