In [2]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [34]:
def extract_author_title(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    title = file_path.split('/')[4]
    
    author_pattern = re.compile(r'\b(Mme|Madame|M|Mr|Dr|Docteur|docteur|Monsieur|monsieur|madame|professeur|Professeur)\b\s+(\S+\s+\S+\s+\S+)')
    author_match = author_pattern.search(content[:500])
    author = author_match.group(0) if author_match else 'auteur inconnu'
    
    return title, author


In [35]:
directory_mixte = '../data/txt/ouvrages_mixtes'

data_mixtes = []

for subdirectory in os.listdir(directory_mixte):
    subdirectory_path = os.path.join(directory_mixte, subdirectory)
    if os.path.isdir(subdirectory_path):

        num_pages = len([filename for filename in os.listdir(subdirectory_path) if filename.endswith('.txt')])

        for filename in os.listdir(subdirectory_path):
            if filename.endswith('.txt'):
                file_path = os.path.join(subdirectory_path, filename)
                title, author = extract_author_title(file_path)
                if author != 'auteur inconnu':
                    data_mixtes.append([title, author, num_pages])

df = pd.DataFrame(data_mixtes, columns=['Titre', 'Auteur', 'Nbr_pages'])
df.to_csv('../data/contexte/contexte_mixtes.csv', index=False)

In [25]:
directory_feminins = '../data/txt/ouvrages_feminins'

data_feminins = []

for subdirectory in os.listdir(directory_feminins):
    subdirectory_path = os.path.join(directory_feminins, subdirectory)
    if os.path.isdir(subdirectory_path):

        num_pages = len([filename for filename in os.listdir(subdirectory_path) if filename.endswith('.txt')])

        for filename in os.listdir(subdirectory_path):
            if filename.endswith('.txt'):
                file_path = os.path.join(subdirectory_path, filename)
                title, author = extract_author_title(file_path)
                if author != 'auteur inconnu':
                    data_feminins.append([title, author, num_pages])

df = pd.DataFrame(data_feminins, columns=['Titre', 'Auteur', 'Nbr_pages'])
df.to_csv('../data/contexte/contexte_feminins.csv', index=False)

In [29]:
directory_masculins = '../data/txt/ouvrages_masculins'

data_masculins = []

for subdirectory in os.listdir(directory_masculins):
    subdirectory_path = os.path.join(directory_masculins, subdirectory)
    if os.path.isdir(subdirectory_path):

        num_pages = len([filename for filename in os.listdir(subdirectory_path) if filename.endswith('.txt')])

        for filename in os.listdir(subdirectory_path):
            if filename.endswith('.txt'):
                file_path = os.path.join(subdirectory_path, filename)
                title, author = extract_author_title(file_path)
                if author != 'auteur inconnu':
                    data_masculins.append([title, author, num_pages])

df = pd.DataFrame(data_masculins, columns=['Titre', 'Auteur', 'Nbr_pages'])
df.to_csv('../data/contexte/contexte_masculins.csv', index=False)

In [49]:
df_mixtes = pd.read_csv('../data/contexte/contexte_mixtes.csv')
df_feminins = pd.read_csv('../data/contexte/contexte_feminins.csv')
df_masculins = pd.read_csv('../data/contexte/contexte_masculins.csv')
df_mixtes

Unnamed: 0,Titre,Auteur,Nbr_pages
0,03,Professeur de Quatriéme au,152
1,03,Professeur a ‘a Faculté,152
2,03,professeur peut faire ranger,152
3,03,professeur 4a la Faculté,152
4,03,Professeur au lycée Montaigne.,152
5,03,professeur au lycée de,152
6,03,"Docteur és lettres, professeur",152
7,03,professeur se trouve en,152
8,03,professeur doit premierement enseigner,152
9,03,Professeur de Géographie coloniale,152


In [48]:
df_mixtes = df_mixtes.groupby('Titre').count().reset_index()
df_mixtes

Unnamed: 0,Titre,Auteur,Nbr_pages
0,01,2,2
1,02_part_1,3,3
2,02_part_3,2,2
3,03,27,27


In [6]:
def extract_years(content):
    return re.findall(r'\b(17[0-9]{2}|18[0-9]{2}|19[0-9]{2})', content)

def process_directory(directory, output_csv):
    data = []

    for subdirectory in os.listdir(directory):
        subdirectory_path = os.path.join(directory, subdirectory)
        if os.path.isdir(subdirectory_path):
            for filename in os.listdir(subdirectory_path):
                if filename.endswith('.txt'):
                    file_path = os.path.join(subdirectory_path, filename)
                    with open(file_path, 'r', encoding='utf-8') as file:
                        content = file.read()
                    title = file_path.split('/')[4]
                    years = extract_years(content)
                    if years:
                        data.append([title, ', '.join(years)])

    df_years = pd.DataFrame(data, columns=['Titre', 'Années'])
    df_years.to_csv(output_csv, index=False)

# Process each directory and create the corresponding CSV files
process_directory('../data/txt/ouvrages_mixtes', '../data/contexte/years_mixtes.csv')
process_directory('../data/txt/ouvrages_feminins', '../data/contexte/years_feminins.csv')
process_directory('../data/txt/ouvrages_masculins', '../data/contexte/years_masculins.csv')