# Extraction et chargement des données

In [24]:
import os
# Display the current working directory: the relative '../01.Data/...' paths
# used when reading and writing CSV files are resolved against it.
os.getcwd()

'c:\\Users\\aitha\\Documents\\master\\M2\\Text_Mining_Challenge_EGC\\Scripts'

### Extraction des données brutes

In [None]:
import pandas as pd
from langdetect import detect

def charger_donnees(chemin):
    """Read the delimited text file at *chemin* into a DataFrame.

    The separator is not given explicitly: ``sep=None`` together with the
    Python parsing engine asks pandas to detect it from the file content.
    """
    options_lecture = {"sep": None, "engine": "python"}
    return pd.read_csv(chemin, **options_lecture)

# Raw article export and list of editions, both read through charger_donnees.
df = charger_donnees('../01.Data/Sources/export_articles_EGC_2004_2018.csv')
dfEditions = charger_donnees('../01.Data/Sources/edition_EGC.csv')
# First-name -> gender lookup table. It is read directly with pd.read_csv
# because it needs an explicit cp1252 encoding, which charger_donnees does
# not accept as a parameter.
dfGenrePrenoms = pd.read_csv("../01.Data/Tables/table_genres_clean.csv", sep=None, engine='python', encoding='cp1252')  

### Table Articles

In [26]:
def identifier_langue(texte):
    """Return the language detected for *texte*.

    Returns:
        "vide" when the value is missing (``pd.isna``) or blank once
        converted to ``str`` and stripped; otherwise whatever ``detect``
        returns for the text; "erreur" when any step raises an exception.
    """
    try:
        if pd.isna(texte) or str(texte).strip() == "":
            return "vide"
        return detect(str(texte))
    except Exception:
        # Best-effort labelling: a failed detection must not abort a whole
        # ``.apply`` pass. ``Exception`` (rather than a bare ``except``)
        # still lets KeyboardInterrupt / SystemExit propagate so a long run
        # can be interrupted.
        return "erreur"

def count_authors(df_articles, df_authors, df_articles_authors):
    """Attach per-article author counts to the articles table.

    Args:
        df_articles: articles table with an ``id_article`` column.
        df_authors: authors table with ``id_author`` and ``gender`` columns.
        df_articles_authors: link table with ``id_article`` and
            ``id_author`` columns.

    Returns:
        A new DataFrame: ``df_articles`` with integer columns
        ``Nb_authors`` (non-null ``id_author`` values per article),
        ``Nb_male`` and ``Nb_female`` (authors whose ``gender`` equals
        ``"male"`` / ``"female"``). Articles absent from the link table
        get 0 in all three columns.
    """
    # NOTE(review): only the exact labels "male"/"female" are counted;
    # confirm these match the values stored in the gender column.
    count_cols = ["Nb_authors", "Nb_male", "Nb_female"]

    # One row per (article, author) pair, enriched with the author's gender.
    links = df_articles_authors.merge(df_authors, on="id_author", how="left")

    per_article = links.groupby("id_article").agg(
        Nb_authors=("id_author", "count"),
        Nb_male=("gender", lambda g: g.eq("male").sum()),
        Nb_female=("gender", lambda g: g.eq("female").sum()),
    )
    per_article = per_article.reset_index()

    # Discard stale count columns first so the merge does not create
    # suffixed duplicates.
    base = df_articles.drop(columns=count_cols, errors="ignore")
    result = base.merge(per_article, on="id_article", how="left")

    # The left merge leaves NaN for articles without authors.
    result[count_cols] = result[count_cols].fillna(0).astype(int)
    return result

### Table Authors

In [27]:
def authors_info(df, col='authors'):
    """Build the table of distinct authors from a ', '-separated column.

    Args:
        df: DataFrame holding the author lists.
        col: name of the column whose values are author names joined
            by ``', '``.

    Returns:
        A DataFrame with a ``name`` column (stripped, de-duplicated, in
        order of first appearance) and an ``id_author`` column numbered
        from 1.
    """
    names = (
        df[col]
        .str.split(', ')
        .explode()
        .dropna()
        .str.strip()
    )
    # A trailing or doubled separator ("A, B, ") leaves an empty fragment;
    # drop it so it does not become an author with an empty name.
    names = names[names != '']

    dfAuthors = (
        names
        .drop_duplicates()
        .reset_index(drop=True)
        .to_frame(name='name')
    )
    # The frame was just created with a single 'name' column, so the id
    # column is always added here.
    dfAuthors['id_author'] = range(1, len(dfAuthors) + 1)

    return dfAuthors


def authors_gender(dfAuthors, dfGenrePrenoms):
    """Return a copy of *dfAuthors* with a ``gender`` column added.

    The first space-separated token of ``name`` (lower-cased, stripped) is
    looked up in *dfGenrePrenoms*, whose ``prenom_cle`` column (normalised
    the same way) gives the key and ``genre_estime`` the value. Names with
    no match get ``'pas encore'``. Neither input frame is modified.
    """
    # Lookup dict: normalised first name -> estimated gender.
    genre_par_prenom = (
        dfGenrePrenoms
        .assign(prenom_cle=lambda t: t['prenom_cle'].str.lower().str.strip())
        .set_index('prenom_cle')['genre_estime']
        .to_dict()
    )

    # 'prenom' is a temporary working column, removed before returning.
    resultat = dfAuthors.assign(
        prenom=lambda t: t['name'].str.split(' ').str[0].str.lower().str.strip()
    )
    resultat['gender'] = resultat['prenom'].map(genre_par_prenom).fillna('pas encore')

    return resultat.drop(columns=['prenom'])



### Table lien entre Articles et Authors

In [28]:
def correspondance_table_authors_and_articles(dfAu, dfAr):
    """Build the article/author link table.

    Args:
        dfAu: authors table with ``id_author`` and ``name`` columns.
        dfAr: articles table with ``id_article`` and ``authors`` columns,
            ``authors`` being names joined by ``', '``.

    Returns:
        A DataFrame with columns ``id_author``, ``id_article`` and
        ``position_author`` (1-based rank of the author in the article's
        list), sorted by article then position. Name fragments that match
        no row of *dfAu* are left out (inner join).
    """
    # One row per (article, author-name fragment).
    df_tmp = (
        dfAr[['id_article', 'authors']]
        .assign(name=dfAr['authors'].str.split(', '))
        .explode('name')
    )

    # Rank is computed before the join so it reflects the original order
    # of the list even if some fragments are dropped afterwards.
    df_tmp['position_author'] = (
        df_tmp.groupby('id_article').cumcount() + 1
    )

    # authors_info() strips the names it stores; strip here as well,
    # otherwise a fragment with surrounding whitespace would not match and
    # the author would silently vanish in the inner join.
    df_tmp['name'] = df_tmp['name'].str.strip()

    df_link = df_tmp.merge(
        dfAu[['id_author', 'name']],
        on='name',
        how='inner'
    )

    return (
        df_link[['id_author', 'id_article', 'position_author']]
        .sort_values(['id_article', 'position_author'])
        .reset_index(drop=True)
    )

### Table Editions

In [29]:
def nb_articles_per_edition(df_art, df_edit):
    """Return *df_edit* with an integer ``nb_articles`` column.

    Args:
        df_art: articles table with a ``year`` column.
        df_edit: editions table with a ``year`` column.

    Returns:
        A new DataFrame: the rows of *df_edit* plus ``nb_articles``, the
        number of rows of *df_art* sharing the edition's ``year`` (0 when
        there is none).
    """
    counts = df_art['year'].value_counts().reset_index()
    counts.columns = ['year', 'nb_articles']

    # Drop a pre-existing count column first (e.g. when the function is
    # applied again to its own output); otherwise the merge would produce
    # 'nb_articles_x' / 'nb_articles_y' and the lookup below would fail.
    df_result = (
        df_edit
        .drop(columns=['nb_articles'], errors='ignore')
        .merge(counts, on='year', how='left')
    )
    # Editions without any article come out of the left merge as NaN.
    df_result['nb_articles'] = df_result['nb_articles'].fillna(0).astype(int)

    return df_result

### Export des tables vers des fichiers CSV

In [None]:
# DataFrame Authors
# Build the distinct-authors table from the raw export, then attach the
# gender estimated from each author's first name. (No placeholder frame is
# created beforehand: authors_info returns a fresh DataFrame with its own
# id_author column.)
dfAuthors = authors_info(df)
dfAuthors = authors_gender(dfAuthors, dfGenrePrenoms)

dfAuthors.to_csv('../01.Data/Tables/authors.csv', index=False)

In [None]:
# DataFrame Articles_Authors
# The link table is keyed on df['id_article']; create a 1-based id when the
# raw export does not already provide one.
if 'id_article' not in df.columns:
    df.insert(0, 'id_article', range(1, len(df) + 1))
dfArticles_Authors = correspondance_table_authors_and_articles(dfAuthors, df)

dfArticles_Authors.to_csv('../01.Data/Tables/articles_authors.csv', index=False)

In [None]:
# DataFrame Articles
dfArticles = df[['title', 'year']].copy()
# Reuse the article ids already carried by df when present: the link table
# is built from df['id_article'], so regenerating ids here could make the
# two tables disagree. Fall back to 1-based numbering otherwise.
if 'id_article' in df.columns:
    ids_articles = df['id_article']
else:
    ids_articles = range(1, len(dfArticles) + 1)
dfArticles.insert(0, 'id_article', ids_articles)

# Language detected from each title
dfArticles['language'] = df['title'].apply(identifier_langue)

# Number of authors per article (total / male / female)
dfArticles = count_authors(
    dfArticles,
    dfAuthors,
    dfArticles_Authors
)

# Attach the theme and cleaned title from the clustering output, matched on
# its 'id' column; the redundant key is dropped after the merge.
dfClustered = charger_donnees('../01.Data/Tables/clustered_articles.csv')
dfArticles = dfArticles.merge(
    dfClustered[['id', 'theme', 'title_clean']],
    left_on='id_article',
    right_on='id',
    how='left'
).drop(columns='id')

dfArticles.to_csv('../01.Data/Tables/articles.csv', index=False)

In [None]:
# DataFrame Editions
# Add the per-year article count to the editions table (dfEditions is
# rebound to the enriched frame), then write it out.
dfEditions = nb_articles_per_edition(df, dfEditions)

dfEditions.to_csv('../01.Data/Tables/editions.csv', index=False)