In [1]:
from datasets import load_dataset
import ast

dataset = load_dataset("hipe/2022-data", dataset='hipe2020', language='fr')

# NOTE: each document's 'metadata' field is the string repr of a dict.
# Indexing the split (dataset['train'][i]) hands back a freshly built Python
# dict, so assigning into it never reaches the dataset itself -- which is why
# the later cells still call ast.literal_eval(document['metadata']).
# The former "convert in place" loop was therefore a full pass over the split
# with no effect; metadata is parsed at the point of use instead.
first_document = dataset['train'][0]

# Example usage
print(first_document['sentences']['tokens'][0])  # Tokens of the first sentence in the first article
print(first_document['sentences']['NE-COARSE-LIT'][0])  # Coarse labels for the tokens in the first sentence
print(ast.literal_eval(first_document['metadata']))  # Metadata parsed into a dictionary


['NOUVELLES', 'SUISSES', '—', 'En', '1887', ',', 'la', 'Société', 'suisse', 'du', 'Grutli', 's', "'", 'est', 'accrue', 'de', '40', 'sections', ';', 'l', "'", 'association', 'compte', 'actuellement', '12,000', 'membres']
['O', 'O', 'O', 'B-time', 'I-time', 'O', 'O', 'B-org', 'I-org', 'I-org', 'I-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
{'date': '1888-01-09', 'language': 'fr', 'document_type': 'newspaper', 'dataset': 'hipe2020', 'original_source': 'v1.4.1/fr/HIPE-data-v1.4.1-train-fr.tsv', 'doi': 'https://zenodo.org/record/6046853', 'version': 'v1.4', 'original_license': 'CC-BY-NC-SA 4.0', 'publication_title': 'EXP'}


In [2]:
# Per-language whitespace rules used to undo tokenisation.
#   punctuation_nows_before      : no space is inserted BEFORE these tokens
#   punctuation_nows_after       : no space is inserted AFTER these tokens
#   punctuation_nows_beforeafter : no space on either side of these tokens
#   punctuation_ciffre           : separators glued when sitting between two all-digit tokens
WHITESPACE_RULES = {
    "fr": {
        "punctuation_nows_before": [".", ",", ")", "]", "}", "°", "..."],
        "punctuation_nows_after": ["(", "[", "{"],
        "punctuation_nows_beforeafter": ["'", "-"],
        "punctuation_ciffre": [".", ","],
    }
}


def reglue_tokenized_text(tokens: list[str], language: str = "fr") -> str:
    """Reglue a tokenized text back together.

    Args:
        tokens (list[str]): List of tokens.
        language (str): Key into ``WHITESPACE_RULES``. Defaults to ``"fr"``,
            which is what this function always used before the parameter
            existed. Languages without an entry are joined with single spaces.

    Returns:
        str: Reglued text; the empty string for an empty token list.
    """
    if not tokens:
        return ""

    wsrules = WHITESPACE_RULES.get(language)
    if wsrules is None:
        # Default behavior for languages without specific rules: join with space
        return " ".join(tokens)

    glue_both_sides = wsrules["punctuation_nows_beforeafter"]
    glue_to_previous = wsrules["punctuation_nows_before"]
    glue_to_next = wsrules["punctuation_nows_after"]
    digit_separators = wsrules["punctuation_ciffre"]

    # Collect the pieces and join once instead of growing a string in the loop.
    pieces = [tokens[0]]

    for i in range(1, len(tokens)):
        prev_token = tokens[i - 1]
        curr_token = tokens[i]

        suppress_space = (
            prev_token in glue_both_sides
            or curr_token in glue_both_sides
            or curr_token in glue_to_previous
            or prev_token in glue_to_next
            # "12 , 000" -> "12,000": a separator between two all-digit tokens
            # is also glued to the token that follows it.
            or (
                prev_token in digit_separators
                and i >= 2
                and tokens[i - 2].isdigit()
                and curr_token.isdigit()
            )
        )

        if not suppress_space:
            pieces.append(" ")
        pieces.append(curr_token)

    return "".join(pieces)


In [3]:
from transformers import Pipeline
import numpy as np
import torch
from nltk.chunk import conlltags2tree
from nltk import pos_tag
from nltk.tree import Tree
import string
import torch.nn.functional as F
import re

In [4]:
from wikidata.client import Client

# Module-level Wikidata client; get_entities() reads this global for its QID lookups.
client = Client()
# Smoke test: load item Q70 and print the URL of its French Wikipedia sitelink.
entity = client.get('Q70', load=True)

url = entity.data['sitelinks']['frwiki']['url']

print(url)

https://fr.wikipedia.org/wiki/Berne


In [14]:
# Memo of Wikidata label lookups: QID -> (french_label_or_None, error_text_or_None).
# The same QID is requested many times (once per tag layer and once per nested
# mention), and every miss is a network round trip.
_FR_LABEL_CACHE = {}


def _to_iob_prefix(tag):
    """Map an IOBES tag to IOB by rewriting only the leading ``S-`` / ``E-``."""
    if tag.startswith("S-"):
        return "B-" + tag[2:]
    if tag.startswith("E-"):
        return "I-" + tag[2:]
    return tag


def _lookup_fr_label(qid):
    """Return ``(label, error)`` for the French label of Wikidata item ``qid``.

    ``label`` is None when the lookup failed, in which case ``error`` holds the
    text of the exception. Successful lookups and missing-key failures are
    memoised; any other exception is not cached, so a later call tries again.
    """
    if qid in _FR_LABEL_CACHE:
        return _FR_LABEL_CACHE[qid]
    try:
        wd_entity = client.get(qid, load=True)
        outcome = (wd_entity.data['labels']['fr']['value'], None)
    except KeyError as ex:
        # The item loaded but has no labels/fr/value entry.
        outcome = (None, str(ex))
    except Exception as ex:
        # Not cached: may be a temporary failure.
        return None, str(ex)
    _FR_LABEL_CACHE[qid] = outcome
    return outcome


def get_entities(tokens, tags, nel_lit, nel_meto):
    """Group one layer of token-level NE tags into entity mentions.

    Args:
        tokens: Tokens of one sentence.
        tags: One tag per token (``O``, ``B-x``, ``I-x``; a leading ``S-`` /
            ``E-`` is treated as ``B-`` / ``I-``).
        nel_lit: One entity-link value per token; ``'NIL'`` and ``'_'`` mean
            "no link", anything else is used as a Wikidata QID.
        nel_meto: Accepted for call-site symmetry; not used.

    Returns:
        list[dict]: One dict per mention, in sentence order, with keys
        ``index`` (token offset of the mention's first token), ``entity``
        (tag without its B-/I- prefix) and ``word`` (mention tokens joined by
        single spaces). Linked mentions additionally carry ``qid`` and
        ``wikipedia`` (the item's French label, or ``word`` when the label
        could not be fetched).

    Side effects:
        Prints a ``Could not decode ...`` line for every mention whose label
        lookup failed, and queries Wikidata through the module-level ``client``.
    """
    iob_tags = [_to_iob_prefix(tag) for tag in tags]

    # conlltags2tree wants (word, pos, chunk) triples but the POS slot is only
    # carried along and never read here, so a placeholder replaces the former
    # (English) POS-tagger pass.
    conlltags = [(token, "", tag) for token, tag in zip(tokens, iob_tags)]
    ne_tree = conlltags2tree(conlltags)

    entities = []
    idx = 0  # token offset of the current subtree within `tokens`

    for subtree in ne_tree:
        if not isinstance(subtree, Tree):
            # Plain (token, pos) leaf outside any entity: advance one token.
            idx += 1
            continue

        original_label = subtree.label()
        original_string = " ".join(token for token, _pos in subtree.leaves())

        mention = {
            "index": idx,
            "entity": original_label,
            "word": original_string,
        }

        # The link is read at the mention's first token, whichever tag layer
        # `tags` comes from.
        # NOTE(review): for nested/component layers this attaches the enclosing
        # entity's QID to the inner mention -- confirm that is intended.
        qid = nel_lit[idx]
        if qid not in ['NIL', '_']:
            correct_name, error = _lookup_fr_label(qid)
            if correct_name is None:
                print('Could not decode', qid, original_string, '---', error)
                correct_name = original_string
            mention["qid"] = qid
            mention["wikipedia"] = correct_name

        entities.append(mention)
        # A chunk subtree has one child per token it spans.
        idx += len(subtree)

    return entities

In [15]:
def get_detail(original_string, entities, detail_type):
    """
    Find the first entity of a given kind whose text occurs inside a string.

    :param original_string: Text to search in (e.g. the full mention of a person).
    :param entities: Iterable of entity dicts; each must provide the keys
                     'word' (surface text) and 'entity' (type label).
    :param detail_type: Substring that the entity's type label must contain
                        (e.g. "function" or "name").
    :return: The 'word' of the first entity, in iteration order, whose type
             contains ``detail_type`` and whose text is a substring of
             ``original_string``; None when there is no such entity.
    """
    # Lazily pair each entity's text with its type, reading both keys up front.
    candidates = ((item['word'], item['entity']) for item in entities)
    hits = (
        text
        for text, kind in candidates
        if detail_type in kind and text in original_string
    )
    return next(hits, None)

In [16]:
# import ast

# for document in dataset['train']:
#     sentences = document['sentences']
#     metadata = ast.literal_eval(document['metadata'])
#     print(type(metadata))
    
#     # Extract the date from the metadata
#     date = metadata.get('date', 'Unknown Date')
    
#     for tokens, coarse_lit, coarse_meto, fine_lit, file_meto, fine_comp, ne_nested in zip(sentences['tokens'], 
#                              sentences['NE-COARSE-LIT'], sentences['NE-COARSE-METO'], sentences['NE-FINE-LIT'], 
#                              sentences['NE-FINE-METO'], sentences['NE-FINE-COMP'], sentences['NE-NESTED']):
#         entities = [get_entities(tokens, tags) for tags in [coarse_lit, coarse_meto, fine_lit, file_meto, fine_comp, ne_nested]]

#         print(f"Date: {date}, Entities: {entities}")

In [17]:
import ast
from collections import defaultdict
from tqdm import tqdm

# Assuming dataset, get_entities and get_detail are already defined
# NOTE: the output recorded below this cell ends in a KeyboardInterrupt at 0/158,
# so the dictionaries filled here were incomplete in the saved run.

# Dictionaries to store entity statistics
# Both counters are keyed by (word, year-or-decade, entity type, function, name).
entities_per_year = defaultdict(int)
entities_per_decade = defaultdict(int)
entity_decade_appearances = defaultdict(set)  # word -> set of decades it appears in
entity_year_appearances = defaultdict(set)  # To track in which years each entity appears
unique_entity_types = set()  # To track unique entity types

for document in tqdm(dataset['train'], total=len(dataset['train'])):
    # 'metadata' is stored as the string repr of a dict, hence the parse.
    metadata = ast.literal_eval(document['metadata'])
    date = metadata.get('date')
    
    if not date:
        continue  # Skip documents without a date

    # The first four characters of the date are used as the year.
    year = int(date[:4])
    decade = year - (year % 10)

    sentences = document['sentences']
    # All mentions of the whole document, across sentences and tag layers.
    entities = []
    for tokens, coarse_lit, coarse_meto, fine_lit, fine_meto, fine_comp, ne_nested, nel_lit, nel_meto in zip(
        sentences['tokens'], sentences['NE-COARSE-LIT'], sentences['NE-COARSE-METO'],
        sentences['NE-FINE-LIT'], sentences['NE-FINE-METO'], sentences['NE-FINE-COMP'], sentences['NE-NESTED'],
        sentences['NEL-LIT'], sentences['NEL-METO']
    ):
        # Only the fine-grained, component and nested layers are extracted;
        # the coarse layers are unpacked but not used.
        for tags in [fine_lit, fine_meto, fine_comp, ne_nested]:
            entities += get_entities(tokens, tags, nel_lit, nel_meto)

    for entity in entities:
        entity_type = entity['entity']
        entity_name = entity['word']
        unique_entity_types.add(entity_type)  # Track the entity type

        # For person mentions, look for a "function" / "name" mention from the
        # same document whose text is contained in the person's text.
        function, name = None, None
        if "pers" in entity_type:
            function = get_detail(entity_name, entities, "function")
            name = get_detail(entity_name, entities, "name")

        entities_per_year[(entity_name, year, entity_type, function, name)] += 1
        entities_per_decade[(entity_name, decade, entity_type, function, name)] += 1
        entity_decade_appearances[entity_name].add(decade)
        entity_year_appearances[entity_name].add(year)

# Print statistics
# print("Entities per year:")
# for (entity, year, entity_type, function, name), count in entities_per_year.items():
#     print(f"{entity} in {year} ({entity_type}, {function}, {name}): {count}")

# print("Entities per decade:")
# for (entity, decade, entity_type, function, name), count in entities_per_decade.items():
#     print(f"{entity} in {decade}s ({entity_type}, {function}, {name}): {count}")

# print("Entities mentioned in multiple decades:")
# for entity, decades in entity_decade_appearances.items():
#     if len(decades) > 1:
#         print(f"{entity} appears in decades: {sorted(decades)}")

# print("Entities mentioned in multiple years:")
# for entity, years in entity_year_appearances.items():
#     if len(years) > 1:
#         print(f"{entity} appears in years: {sorted(years)}")

# Print unique entity types
print("Unique entity types:")
for entity_type in unique_entity_types:
    print(entity_type)


  0%|                                                                                                                                                                                     | 0/158 [00:01<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# List every entity seen in more than one distinct year, with its
# space-separated tokens glued back together for display.
print("Entities mentioned in multiple years:")
for entity, years in entity_year_appearances.items():
    if len(years) <= 1:
        continue
    entity = reglue_tokenized_text(entity.split())
    print(f"{entity} appears in years: {sorted(years)}")


In [None]:
import matplotlib.pyplot as plt

# Filter entities mentioned in multiple years and prepare data for plotting:
# display name -> {year: count}.
entity_year_counts = {}
# Keys of entities_per_year are (word, year, entity type, function, name) --
# the unpacking below follows that order.
for (entity, year, entity_type, function, name), count in entities_per_year.items():
    # Keep only entities that occur in more than one distinct year.
    if len(entity_year_appearances.get(entity, ())) <= 1:
        continue

    entity = reglue_tokenized_text(entity.split())
    # Drop very short names (<= 2 characters) and sparsely mentioned keys (<= 5).
    if len(entity) > 2 and count > 5:
        # NOTE(review): one (entity, year) pair can occur under several keys
        # (different type/function/name); the last one seen wins here rather
        # than the counts being summed -- confirm that is intended.
        entity_year_counts.setdefault(entity, {})[year] = count

# Plotting each entity's mentions per year
plt.figure(figsize=(15, 10))
for entity, year_counts in entity_year_counts.items():
    years = sorted(year_counts.keys())
    counts = [year_counts[year] for year in years]
    plt.plot(years, counts, marker='o', label=entity)

plt.xlabel('Year')
plt.ylabel('Number of Mentions')
plt.title('Entities Mentioned in Multiple Years')
plt.legend(title='Entities', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
# Surface forms of every counted entity whose type contains 'loc'.
# Keys of entities_per_year are (word, year, entity type, function, name); a
# comprehension is used so the loop variables do not overwrite notebook
# globals such as `entity` and `year`.
places = [
    word
    for (word, _year, entity_type, _function, _name) in entities_per_year
    if 'loc' in entity_type
]

print(list(set(places)))

In [None]:
# Every counted key whose entity type contains 'pers', kept as the full
# (word, year, entity type, function, name) tuple -- the field order in which
# entities_per_year is keyed.
persons = [
    (word, year, entity_type, function, name)
    for (word, year, entity_type, function, name) in entities_per_year
    if 'pers' in entity_type
]

for person in list(set(persons)):
    print(person)

In [None]:
# Show the top-level keys of the data attached to `entity`.
# NOTE(review): this needs `entity` to be a Wikidata item, but several earlier
# loops rebind `entity` to a dict or a string -- confirm the execution order.
entity.data.keys()

In [None]:
# Load Wikidata item Q70 and read its French label -- the same access path
# get_entities() uses for linked mentions.
entity = client.get('Q70', load=True)
correct_name = entity.data['labels']['fr']['value']

In [18]:
import pandas as pd
import ast
from tqdm import tqdm

# Assuming dataset and get_entities are already defined
# NOTE: the progress bar recorded below this cell shows about 1h40 for the 158
# training documents.

# List to collect all the row data
# One dict per sentence; turned into a DataFrame in a single step at the end.
data = []

for document in tqdm(dataset['train'], total=len(dataset['train'])):
    # 'metadata' is stored as the string repr of a dict, hence the parse.
    metadata = ast.literal_eval(document['metadata'])
    date = metadata.get('date')
    source = metadata.get('publication_title')
    language = metadata.get('language')
    doc_type = metadata.get('document_type')

    if not date:
        continue  # Skip documents without a date

    # The first four characters of the date are used as the year.
    year = int(date[:4])
    decade = year - (year % 10)
    sentences_data = document['sentences']
    # Tokens are joined with plain spaces (no re-gluing of punctuation).
    article_text = " ".join([" ".join(sentence) for sentence in sentences_data['tokens']])  # Concatenate all sentences

    for tokens, coarse_lit, coarse_meto, fine_lit, fine_meto, fine_comp, ne_nested, nel_lit, nel_meto in zip(
        sentences_data['tokens'], sentences_data['NE-COARSE-LIT'], sentences_data['NE-COARSE-METO'],
        sentences_data['NE-FINE-LIT'], sentences_data['NE-FINE-METO'], sentences_data['NE-FINE-COMP'], sentences_data['NE-NESTED'],
        sentences_data['NEL-LIT'], sentences_data['NEL-METO']
    ):
        sentence_text = " ".join(tokens)
        # Mentions of this sentence only, across four tag layers; the coarse
        # layers are unpacked but not used.
        entities = []
        for tags in [fine_lit, fine_meto, fine_comp, ne_nested]:
            entities += get_entities(tokens, tags, nel_lit, nel_meto)

        # Append data for each sentence in the document
        # (the full article text is repeated on every row of that article)
        data.append({
            'date': date,
            'source': source,
            'language': language,
            'doc_type': doc_type,
            'year': year,
            'decade': decade,
            'sentence': sentence_text,
            'entities': entities,
            'article': article_text
        })

# Create a DataFrame
df = pd.DataFrame(data)

  0%|                                                                                                                                                                                     | 0/158 [00:00<?, ?it/s]

Could not decode Q1226218 Ortafrontière --- 'fr'
Could not decode Q5553647 Gessler --- 'fr'


  4%|██████▍                                                                                                                                                                    | 6/158 [04:03<1:28:06, 34.78s/it]

Could not decode Q22389485 Signal --- 'fr'


  6%|██████████▊                                                                                                                                                               | 10/158 [06:00<1:26:18, 34.99s/it]

Could not decode Q22386089 Molard --- 'fr'
Could not decode Q22386089 Molard --- 'fr'


  7%|███████████▊                                                                                                                                                              | 11/158 [07:18<1:57:52, 48.11s/it]

Could not decode Q3646917 Gouille --- 'fr'
Could not decode Q22532416 Sonmartel --- 'fr'


  8%|████████████▉                                                                                                                                                             | 12/158 [07:28<1:28:46, 36.48s/it]

Could not decode Q2543411 fabrique de peignes Walter S . A --- 'fr'
Could not decode Q2543411 Walter S . A --- 'fr'


  9%|████████████████▏                                                                                                                                                         | 15/158 [08:42<1:10:55, 29.76s/it]

Could not decode Q7400620 M . Sugimura , représentant du Japon --- 'fr'
Could not decode Q7400620 M . --- 'fr'
Could not decode Q7400620 Sugimura --- 'fr'
Could not decode Q7400620 représentant du Japon --- 'fr'
Could not decode Q7400620 Japon --- 'fr'


 11%|██████████████████▎                                                                                                                                                       | 17/158 [11:40<2:03:21, 52.49s/it]

Could not decode Q6455769 L ' Indépendance roumaine --- 'fr'
Could not decode Q7807661 Timpul --- 'fr'
Could not decode Q518617 Reich --- 'fr'
Could not decode Q518617 Reich --- 'fr'


 16%|██████████████████████████▉                                                                                                                                               | 25/158 [16:46<1:10:24, 31.76s/it]

Could not decode Q71715484 René BRXICHET --- 'fr'


 18%|███████████████████████████████▌                                                                                                                                            | 29/158 [18:37<57:29, 26.74s/it]

Could not decode Q21480909 gouvernement despotique de Marcos - Perez Jimenez --- HTTP Error 404: Not Found
Could not decode Q21480909 Marcos - Perez Jimenez --- HTTP Error 404: Not Found
Could not decode Q21480909 gouvernement Perez Jimenez --- HTTP Error 404: Not Found
Could not decode Q21480909 Perez Jimenez --- HTTP Error 404: Not Found
Could not decode Q24904677 Radio - New - York --- 'fr'
Could not decode Q24904677 New - York --- 'fr'


 28%|███████████████████████████████████████████████▎                                                                                                                          | 44/158 [27:19<1:20:03, 42.14s/it]

Could not decode Q56540279 Planckaert --- 'fr'
Could not decode Q14849524 Bosberg --- 'fr'


 43%|█████████████████████████████████████████████████████████████████████████▏                                                                                                | 68/158 [38:12<1:00:16, 40.18s/it]

Could not decode Q692253 Wagram --- 'fr'


 47%|████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 74/158 [42:30<59:53, 42.78s/it]

Could not decode Q56399581 Georges II --- 'fr'
Could not decode Q56399581 Georges --- 'fr'
Could not decode Q56399581 II --- 'fr'


 51%|██████████████████████████████████████████████████████████████████████████████████████                                                                                    | 80/158 [47:59<1:31:32, 70.42s/it]

Could not decode Q22701451 dumontCenere --- 'fr'
Could not decode Q22701451 dumontCenere --- 'fr'


 51%|███████████████████████████████████████████████████████████████████████████████████████▏                                                                                  | 81/158 [50:00<1:49:47, 85.56s/it]

Could not decode Q7736796 Globe --- 'fr'


 52%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 82/158 [51:00<1:38:47, 78.00s/it]

Could not decode Q3226542 tunnel du Hauenstein --- 'fr'
Could not decode Q3226542 Hauenstein --- 'fr'


 56%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                            | 88/158 [53:44<42:26, 36.37s/it]

Could not decode Q22468133 Finstermunz --- 'fr'
Could not decode Q22468133 Finstermunz --- 'fr'


 56%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 89/158 [55:45<1:10:58, 61.72s/it]

Could not decode Q8077554 Tchataldja --- 'fr'


 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 94/158 [1:00:23<48:25, 45.41s/it]

Could not decode Q1196981 Sozialdemokrat --- 'fr'
Could not decode Q15732510 Intelligenzblalt --- 'fr'
Could not decode Q1196981 Sozialdemokral --- 'fr'
Could not decode Q1196981 Sozialdemokrat --- 'fr'
Could not decode Q1196981 Sozialdemokrat --- 'fr'
Could not decode Q1196981 Sozutkkmokrat --- 'fr'


 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                  | 96/158 [1:02:18<53:33, 51.84s/it]

Could not decode Q1807425 Club national libéral de Londres --- 'fr'
Could not decode Q1807425 Londres --- 'fr'
Could not decode Q7848267 Trulh --- 'fr'
Could not decode Q5095721 Grand - Opéra --- 'fr'


 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 98/158 [1:06:12<1:18:09, 78.15s/it]

Could not decode Q8527698 Fisistock --- 'fr'
Could not decode Q1495421 vallée de Gastern --- 'fr'
Could not decode Q1495421 Gastern --- 'fr'
Could not decode Q15194031 Ausserberg --- 'fr'


 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 99/158 [1:06:51<1:05:27, 66.57s/it]

Could not decode Q60174481 Dovere --- 'fr'
Could not decode Q1529377 Glasbrunnen --- 'fr'
Could not decode Q29015698 col d ' Orsirora --- 'fr'
Could not decode Q29015698 Orsirora --- 'fr'


 66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                        | 105/158 [1:11:30<33:11, 37.58s/it]

Could not decode Q18628882 Lutzelmurg --- 'fr'
Could not decode Q22702345 Wilhof --- 'fr'
Could not decode Q18628882 Lù ! zel - Murg --- 'fr'
Could not decode Q18628882 Lulzel - Murg --- 'fr'


 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 107/158 [1:12:30<28:47, 33.88s/it]

Could not decode Q28112686 Deutsche Tageszeitung --- 'fr'
Could not decode Q18028808 Tœgllche Rundschau --- 'fr'


 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                | 113/158 [1:16:07<23:39, 31.55s/it]

Could not decode Q13638293 Union internationale de secours aux enfants --- 'fr'
Could not decode Q13638293 U --- 'fr'
Could not decode Q13638293 I --- 'fr'
Could not decode Q13638293 S . E . --- 'fr'
Could not decode Q16524004 Dr Doxiadès , ancien ministre , président de la Ligue patriotique d ' assistance aux enfants --- 'fr'
Could not decode Q13638293 U --- 'fr'
Could not decode Q16524004 Dr --- 'fr'
Could not decode Q16524004 Doxiadès --- 'fr'
Could not decode Q16524004 ancien ministre --- 'fr'
Could not decode Q16524004 président de la Ligue patriotique d ' assistance aux enfants --- 'fr'
Could not decode Q16524004 Ligue patriotique d ' assistance aux enfants --- 'fr'
Could not decode Q13638293 I --- 'fr'
Could not decode Q13638293 S . E . --- 'fr'


 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                               | 114/158 [1:16:22<19:39, 26.81s/it]

Could not decode Q13638293 Union internationale de secours aux enfants --- 'fr'


 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 117/158 [1:18:44<33:16, 48.69s/it]

Could not decode Q31975594 Dresde --- 'fr'


 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                          | 118/158 [1:19:29<31:33, 47.33s/it]

Could not decode Q55367360 Christian ; Kaufmann --- 'fr'


 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 121/158 [1:22:35<33:07, 53.71s/it]

Could not decode Q1562949 H --- 'fr'
Could not decode Q1562949 C . Olten --- 'fr'
Could not decode Q1562949 Olten --- 'fr'
Could not decode Q2679675 Stade Lausanwe --- 'fr'
Could not decode Q2679675 Lausanwe --- 'fr'
Could not decode Q2679675 Stade Lausanne --- 'fr'
Could not decode Q2679675 Lausanne --- 'fr'
Could not decode Q2679675 Stade --- 'fr'
Could not decode Q2679675 Stade --- 'fr'
Could not decode Q2679675 Stade --- 'fr'
Could not decode Q2679675 Stade --- 'fr'
Could not decode Q2679675 Stade Lausanne --- 'fr'
Could not decode Q2679675 Stade --- 'fr'
Could not decode Q2679675 Stade --- 'fr'
Could not decode Q28231423 Urania --- 'fr'
Could not decode Q2679675 Stade Lausanne --- 'fr'
Could not decode Q2679675 Lausanne --- 'fr'


 78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 123/158 [1:24:42<35:05, 60.16s/it]

Could not decode Q1632773 L ' Aixmt - Garde --- 'fr'


 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                 | 127/158 [1:25:45<13:11, 25.54s/it]

Could not decode Q27924254 Znamia --- 'fr'
Could not decode Q21641201 Literatournaïa Moskva --- 'fr'
Could not decode Q21641201 Moskva --- 'fr'


 87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 138/158 [1:30:53<08:02, 24.12s/it]

Could not decode Q14214519 partie orientale du Cambodge --- 'fr'
Could not decode Q14214519 Cambodge --- 'fr'


 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 139/158 [1:32:03<12:00, 37.94s/it]

Could not decode Q28101962 Mundschin --- 'fr'
Could not decode Q28101962 Mundschin --- 'fr'


 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 143/158 [1:34:42<08:00, 32.05s/it]

Could not decode Q14685139 Vail --- 'fr'
Could not decode Q14685139 Vail --- 'fr'


 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 144/158 [1:35:03<06:43, 28.82s/it]

Could not decode Q1566819 Infranor --- 'fr'
Could not decode Q682685 BSI --- 'fr'
Could not decode Q2526106 Villars --- 'fr'
Could not decode Q30273339 Œrlikon Buchrle --- 'fr'


 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 145/158 [1:36:42<10:46, 49.76s/it]

Could not decode Q32066354 Kobel --- 'fr'


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 158/158 [1:40:25<00:00, 38.14s/it]


In [19]:
# Persist the sentence-level table as CSV.
# NOTE(review): the 'entities' column holds lists of dicts, which CSV can only
# store as text -- presumably readers parse it back; confirm against consumers.
df.to_csv('../world-models/data/entity_datasets/hipe-fr.csv')

In [20]:
# Display the sentence-level DataFrame (one row per sentence).
df

Unnamed: 0,date,source,language,doc_type,year,decade,sentence,entities,article
0,1888-01-09,EXP,fr,newspaper,1888,1880,"NOUVELLES SUISSES — En 1887 , la Société suiss...","[{'index': 3, 'entity': 'time.date.abs', 'word...","NOUVELLES SUISSES — En 1887 , la Société suiss..."
1,1888-01-09,EXP,fr,newspaper,1888,1880,"Environ 30,000 fr . ont été versés dans la cai...",[],"NOUVELLES SUISSES — En 1887 , la Société suiss..."
2,1888-01-09,EXP,fr,newspaper,1888,1880,Ecoles de fonctionnaires,[],"NOUVELLES SUISSES — En 1887 , la Société suiss..."
3,1888-01-09,EXP,fr,newspaper,1888,1880,— On parle de fonder deux facultés de sciences...,"[{'index': 15, 'entity': 'loc.adm.town', 'word...","NOUVELLES SUISSES — En 1887 , la Société suiss..."
4,1888-01-09,EXP,fr,newspaper,1888,1880,Chemins de fer,[],"NOUVELLES SUISSES — En 1887 , la Société suiss..."
...,...,...,...,...,...,...,...,...,...
5674,2018-01-03,IMP,fr,newspaper,2018,2010,Quelques courts ex - traits un peu au hasard,[],COURTS EXTRAITS Le carnet de bord des quatre b...
5675,2018-01-03,IMP,fr,newspaper,2018,2010,A l ’ entrée en Bulgarie : « La douane se pass...,"[{'index': 5, 'entity': 'loc.adm.nat', 'word':...",COURTS EXTRAITS Le carnet de bord des quatre b...
5676,2018-01-03,IMP,fr,newspaper,2018,2010,Sur un bateau au large de Bahreïn : « Les port...,"[{'index': 6, 'entity': 'loc.adm.nat', 'word':...",COURTS EXTRAITS Le carnet de bord des quatre b...
5677,2018-01-03,IMP,fr,newspaper,2018,2010,Impossible de leur faire comprendre qu ’ en Eu...,"[{'index': 8, 'entity': 'loc.adm.sup', 'word':...",COURTS EXTRAITS Le carnet de bord des quatre b...


In [21]:
# Entities extracted for the first sentence (all tag layers concatenated).
df.iloc[0].entities

[{'index': 3, 'entity': 'time.date.abs', 'word': 'En 1887'},
 {'index': 7,
  'entity': 'org.ent',
  'word': 'Société suisse du Grutli',
  'qid': 'Q683672',
  'wikipedia': 'Société du Grütli'},
 {'index': 10,
  'entity': 'loc.phys.geo',
  'word': 'Grutli',
  'qid': 'Q683672',
  'wikipedia': 'Société du Grütli'}]

In [25]:
# Text of the last sentence row; tokens are space-joined, so punctuation stays detached.
df.iloc[-1].sentence

'» Chez le médecin en Inde : « Il y a un dispensaire , avec un toubib indigène , genre charlatan , nous y amenons « Scipion » ; il fait une ordonnance en hindou , la passe à son pharmacien assis un peu plus loin entouré de flacons de poudres de toutes les couleurs ( . . . ) »'

In [50]:
import wikipediaapi

def get_historical_events(year, month, day, language='en'):
    """Return the lines of a Wikipedia day-of-year page that start with ``year``.

    Args:
        year: Year to filter on (anything ``str()`` turns into the year digits).
        month: Month name as used in the page title, e.g. ``"July"``.
        day: Day of the month.
        language: Wikipedia language edition to query. Defaults to ``'en'`` so
            that three-argument calls keep working. The page title is always
            built as ``"<month> <day>"``.

    Returns:
        list[str] | str: The matching lines, or an explanatory message string
        when the page does not exist or no line matches.
    """
    # Initialize the Wikipedia API for the requested language edition
    wiki_wiki = wikipediaapi.Wikipedia('HistoricalEvents (emanuela.boros@gmail.com)', language)

    # Construct the page title in the format "Month Day"
    date = f"{month} {day}"

    # Try to get the Wikipedia page for the given date
    page = wiki_wiki.page(date)

    if not page.exists():
        return "No page found for this date."

    # Extract the content of the page
    text = page.text

    # Keep lines that begin with exactly this year: the character right after
    # the year must not be another digit, otherwise 196 would match "1969 ...".
    year_prefix = str(year)
    after_year = slice(len(year_prefix), len(year_prefix) + 1)
    events = [
        line
        for line in text.split('\n')
        if line.startswith(year_prefix) and not line[after_year].isdigit()
    ]

    return events if events else "No historical events found for this date."


In [39]:
# Example usage: entries for 1969 on the English Wikipedia page "July 20".
year = 1969
month = "July"
day = 20
# The language is passed explicitly and positionally so the call is valid for
# both definitions of get_historical_events in this notebook (their fourth
# parameter is named `language` in one and `lang` in the other).
events = get_historical_events(year, month, day, 'en')
print(events)

["1969 – Apollo program: Apollo 11's crew successfully makes the first human landing on the Moon in the Sea of Tranquility. Americans Neil Armstrong and Buzz Aldrin become the first humans to walk on the Moon six and a half hours later.", '1969   – A cease fire is announced between Honduras and El Salvador, six days after the beginning of the "Football War".', '1969 – Josh Holloway, American actor', '1969   – Kreso Kovacec, Croatian-German footballer', '1969   – Giovanni Lombardi, Italian cyclist', '1969   – Joon Park, South Korean-American singer', '1969   – Tobi Vail, American singer and guitarist', '1969   – Vitamin C, American singer-songwriter']


In [54]:
import pandas as pd
import wikipediaapi
from datetime import datetime

# Wikipedia API setup with language support
# NOTE: this redefines get_historical_events from an earlier cell; whichever
# cell ran last wins.
def get_historical_events(year, month, day, lang='en'):
    """Return the lines of a Wikipedia day-of-year page that start with ``year``.

    Args:
        year: Year to filter on (anything ``str()`` turns into the year digits).
        month: Month name already in the target language (e.g. ``"Juillet"``
            for ``lang='fr'``).
        day: Day of the month.
        lang: Wikipedia language edition. Selects the page-title format:
            ``'fr'`` -> ``"<day> <month lowercased>"``, ``'de'`` ->
            ``"<day>. <month>"``, anything else -> ``"<month> <day>"``.

    Returns:
        list[str] | str: The matching lines, or an explanatory message string
        when the page does not exist or no line matches.
    """
    wiki_wiki = wikipediaapi.Wikipedia('HistoricalEvents (emanuela.boros@gmail.com)', lang)

    # Ensure correct date format for French and German
    if lang == 'fr':
        date = f"{day} {month.lower()}"  # Day first, month lowercase
    elif lang == 'de':
        date = f"{day}. {month}"  # Day followed by a period
    else:
        date = f"{month} {day}"  # Month first for English

    page = wiki_wiki.page(date)
    if not page.exists():
        return f"No page found for this date in {lang}."

    text = page.text
    # Keep lines that begin with exactly this year: the character right after
    # the year must not be another digit, otherwise 196 would match "1969 ...".
    year_prefix = str(year)
    after_year = slice(len(year_prefix), len(year_prefix) + 1)
    events = [
        line
        for line in text.split('\n')
        if line.startswith(year_prefix) and not line[after_year].isdigit()
    ]
    return events if events else "No historical events found for this date."

# Maps for translating English month names to French and German
month_map_fr = {
    'January': 'Janvier', 'February': 'Février', 'March': 'Mars',
    'April': 'Avril', 'May': 'Mai', 'June': 'Juin',
    'July': 'Juillet', 'August': 'Août', 'September': 'Septembre',
    'October': 'Octobre', 'November': 'Novembre', 'December': 'Décembre'
}
month_map_de = {
    'January': 'Januar', 'February': 'Februar', 'March': 'März',
    'April': 'April', 'May': 'Mai', 'June': 'Juni',
    'July': 'Juli', 'August': 'August', 'September': 'September',
    'October': 'Oktober', 'November': 'November', 'December': 'Dezember'
}

# Query each distinct date once, in order of first appearance.
# `Series.unique()` replaces a row-by-row `iterrows()` scan with a
# list-membership test, which visited every row just to dedupe dates.
dates = df['date'].unique().tolist()
for date_str in dates:
    date_obj = datetime.strptime(date_str, '%Y-%m-%d')
    year = date_obj.year
    # NOTE: '%B' is locale-dependent; the maps above are keyed by the English names
    month = date_obj.strftime('%B')
    day = date_obj.day

    # Handle English
    events_en = get_historical_events(year, month, day, lang='en')
    print(f"English Events on {day} {month}:")
    print(events_en)

    # Handle French
    month_fr = month_map_fr[month]
    events_fr = get_historical_events(year, month_fr, day, lang='fr')
    print(f"French Events on {day} {month_fr}:")
    print(events_fr)

    # Handle German
    month_de = month_map_de[month]
    events_de = get_historical_events(year, month_de, day, lang='de')
    print(f"German Events on {day}. {month_de}:")
    print(events_de)


English Events on 9 January:
No historical events found for this date.
French Events on 9 Janvier:
No historical events found for this date.
German Events on 9. Januar:
No historical events found for this date.
English Events on 21 January:
['1908 – New York City passes the Sullivan Ordinance, making it illegal for women to smoke in public, only to have the measure vetoed by the mayor.']
French Events on 21 Janvier:
No historical events found for this date.
German Events on 21. Januar:
['1908: In Stockholm findet die Uraufführung des Kammerspiels Die Gespenstersonate von August Strindberg nach der gleichnamigen Sonate von Ludwig van Beethoven statt. Die Uraufführung floppt, das Stück wird erst vier Jahre nach dem Tod des Dichters ein Erfolg.', '1908: Am Theater an der Wien in Wien erfolgt die Uraufführung der Operette Der Mann mit den drei Frauen von Franz Lehár.', '1908: Raymond D. Gary, US-amerikanischer Politiker', '1908: Louise Rosenbaum, US-amerikanische Mathematikerin und Hochsch

German Events on 1. April:
['1938: In der Schweiz wird erstmals der von Nestlé hergestellte Instantkaffee Nescafé verkauft.', '1938: Erich Müller, Schweizer Industriemanager und Politiker', '1938: Ingrid Spors, deutsche Politikerin', '1938: Richard Du Moulin-Eckart, deutscher Historiker', '1938: Rafaela Serrano Rodríguez, kubanische Pianistin und Musikpädagogin spanischer Herkunft']
English Events on 11 May:
['1938 – Narendra Patel, Baron Patel, Tanzanian-English obstetrician, academic, and politician', '1938 – George Lyon, Canadian golfer and cricketer (b. 1858)']
French Events on 11 Mai:
['1938 : début de la construction du canal Rhin-Main-Danube.']
German Events on 11. Mai:
['1938: Sig Ohlemann, kanadischer Mittelstreckenläufer und Sprinter', '1938: Friedrich Eckenfelder, deutscher Maler', '1938: Jewgeni Karlowitsch Miller, General im Russischen Bürgerkrieg', '1938: Friedrich Knutzen, deutscher Politiker']
English Events on 21 June:
['1938 – Don Black, English songwriter', '1938   –

KeyboardInterrupt: 

In [96]:
# Peek at the first element of `entities`
entities[0]

{'index': 5, 'entity': 'loc.adm.nat', 'word': 'Inde', 'qid': 'Q668'}

In [80]:
# Preview the first rows of the DataFrame
df.head()

Unnamed: 0,date,source,language,doc_type,year,decade,sentence,entities,article
0,1888-01-09,EXP,fr,newspaper,1888,1880,"NOUVELLES SUISSES — En 1887 , la Société suiss...","[{'index': 3, 'entity': 'time.date.abs', 'word...","NOUVELLES SUISSES — En 1887 , la Société suiss..."
1,1888-01-09,EXP,fr,newspaper,1888,1880,"Environ 30,000 fr . ont été versés dans la cai...",[],"NOUVELLES SUISSES — En 1887 , la Société suiss..."
2,1888-01-09,EXP,fr,newspaper,1888,1880,Ecoles de fonctionnaires,[],"NOUVELLES SUISSES — En 1887 , la Société suiss..."
3,1888-01-09,EXP,fr,newspaper,1888,1880,— On parle de fonder deux facultés de sciences...,"[{'index': 15, 'entity': 'loc.adm.town', 'word...","NOUVELLES SUISSES — En 1887 , la Société suiss..."
4,1888-01-09,EXP,fr,newspaper,1888,1880,Chemins de fer,[],"NOUVELLES SUISSES — En 1887 , la Société suiss..."


In [81]:
# Inspect the `entities` value stored on the first row
df.iloc[0].entities

[{'index': 3, 'entity': 'time.date.abs', 'word': 'En 1887'},
 {'index': 7, 'entity': 'org.ent', 'word': 'Société suisse du Grutli'},
 {'index': 10, 'entity': 'loc.phys.geo', 'word': 'Grutli'}]

In [82]:
# Preview the last rows of the DataFrame
df.tail()

Unnamed: 0,date,source,language,doc_type,year,decade,sentence,entities,article
5674,2018-01-03,IMP,fr,newspaper,2018,2010,Quelques courts ex - traits un peu au hasard,[],COURTS EXTRAITS Le carnet de bord des quatre b...
5675,2018-01-03,IMP,fr,newspaper,2018,2010,A l ’ entrée en Bulgarie : « La douane se pass...,"[{'index': 5, 'entity': 'loc.adm.nat', 'word':...",COURTS EXTRAITS Le carnet de bord des quatre b...
5676,2018-01-03,IMP,fr,newspaper,2018,2010,Sur un bateau au large de Bahreïn : « Les port...,"[{'index': 6, 'entity': 'loc.adm.nat', 'word':...",COURTS EXTRAITS Le carnet de bord des quatre b...
5677,2018-01-03,IMP,fr,newspaper,2018,2010,Impossible de leur faire comprendre qu ’ en Eu...,"[{'index': 8, 'entity': 'loc.adm.sup', 'word':...",COURTS EXTRAITS Le carnet de bord des quatre b...
5678,2018-01-03,IMP,fr,newspaper,2018,2010,» Chez le médecin en Inde : « Il y a un dispen...,"[{'index': 5, 'entity': 'loc.adm.nat', 'word':...",COURTS EXTRAITS Le carnet de bord des quatre b...


In [None]:
# save in world_models/data/

In [None]:
import json
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Step 1: Load and organize the data by decade
def load_data(file_path):
    """Group the texts of a JSON Lines file by their 'time_period' value.

    Args:
        file_path: Path to a file holding one JSON object per line; each
            object must provide 'time_period' and 'text' keys.

    Returns:
        A ``defaultdict(list)`` mapping each time period to its texts, in
        file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks 'time_period' or 'text'.
    """
    data_by_decade = defaultdict(list)
    # Explicit UTF-8 so accented text does not depend on the platform default
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines (e.g. trailing newlines)
            record = json.loads(line)
            data_by_decade[record['time_period']].append(record['text'])
    return data_by_decade

# Step 2: Analyze lexical diversity and word frequency for each decade
def analyze_language(data_by_decade):
    """Print lexical diversity and the ten most frequent tokens per decade.

    Args:
        data_by_decade: Mapping from a decade label to a list of text strings.

    Texts are lowercased before tokenization. A decade that yields no tokens
    is reported and skipped instead of raising ZeroDivisionError.
    """
    for decade, texts in data_by_decade.items():
        # NOTE(review): word_tokenize is called with its default language —
        # confirm that is appropriate for the corpus being analysed.
        all_words = [word for text in texts for word in word_tokenize(text.lower())]

        print(f"Decade: {decade}")
        if not all_words:
            print("No tokens found; skipping analysis.\n")
            continue

        # Type/token ratio: distinct tokens over total tokens
        lexical_diversity = len(set(all_words)) / len(all_words)
        freq_dist = FreqDist(all_words)

        print(f"Lexical Diversity: {lexical_diversity:.4f}")
        print(f"Most Common Words: {freq_dist.most_common(10)}\n")

# Run the analysis on a JSON Lines file. Each line is expected to be an object
# with 'time_period' and 'text' fields (the keys `load_data` reads).
file_path = 'data.jsonl'
data_by_decade = load_data(file_path)
analyze_language(data_by_decade)


In [None]:
import json
import nltk
from collections import defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
from textstat.textstat import textstatistics

# Load and organize the data by decade
def load_data(file_path):
    """Group the texts of a JSON Lines file by their 'time_period' value.

    Args:
        file_path: Path to a file holding one JSON object per line; each
            object must provide 'time_period' and 'text' keys.

    Returns:
        A ``defaultdict(list)`` mapping each time period to its texts, in
        file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks 'time_period' or 'text'.
    """
    data_by_decade = defaultdict(list)
    # Explicit UTF-8 so accented text does not depend on the platform default
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines (e.g. trailing newlines)
            record = json.loads(line)
            data_by_decade[record['time_period']].append(record['text'])
    return data_by_decade

# Calculate average sentence length in words
def average_sentence_length(texts):
    """Return the mean number of word tokens per sentence across ``texts``.

    Args:
        texts: Iterable of text strings (iterated twice, so pass a list).

    Returns:
        Total word-token count divided by total sentence count, or 0.0 when
        no sentence is found (empty input or only empty strings).
    """
    sentence_count = sum(len(sent_tokenize(text)) for text in texts)
    if sentence_count == 0:
        # Nothing to average; avoid ZeroDivisionError
        return 0.0
    word_count = sum(len(word_tokenize(text)) for text in texts)
    return word_count / sentence_count

# N-gram frequency analysis
def n_gram_analysis(texts, n=2, top_k=10):
    """Return the most frequent n-grams over all texts.

    The texts are joined with single spaces and split on whitespace, so
    n-grams can span the boundary between two consecutive texts.

    Args:
        texts: Iterable of text strings.
        n: Size of the n-grams (2 = bigrams).
        top_k: How many of the most frequent n-grams to return.

    Returns:
        A list of ``(ngram_tuple, count)`` pairs, most frequent first.
    """
    tokens = ' '.join(texts).split()
    freq_dist = FreqDist(nltk.ngrams(tokens, n))
    return freq_dist.most_common(top_k)

# Sentiment analysis
def sentiment_analysis(texts):
    """Average the per-text polarity scores returned by the analyzer.

    Args:
        texts: Sequence of text strings (``len()`` is taken on it).

    Returns:
        A ``defaultdict(float)`` mapping each score key to its mean over all
        texts; empty when ``texts`` is empty.
    """
    analyzer = SentimentIntensityAnalyzer()
    # Score every text up front, then fold the scores into running means
    per_text_scores = list(map(analyzer.polarity_scores, texts))
    averages = defaultdict(float)
    for scores in per_text_scores:
        for label, value in scores.items():
            # Each text contributes its share of the mean directly
            averages[label] += value / len(texts)
    return averages

# Readability analysis (example: Flesch reading ease)
def readability_analysis(texts):
    """Return the Flesch reading-ease score of all texts joined by spaces.

    Args:
        texts: Iterable of text strings.

    Returns:
        Whatever ``textstatistics().flesch_reading_ease`` returns for the
        combined text.
    """
    combined = ' '.join(texts)
    scorer = textstatistics()
    return scorer.flesch_reading_ease(combined)

# Analyze language for each decade
def analyze_language(data_by_decade):
    """Print a set of language statistics for every decade.

    For each decade this prints lexical diversity, average sentence length,
    the top bigrams, the averaged sentiment scores and the Flesch reading ease.

    Args:
        data_by_decade: Mapping from a decade label to a list of text strings.

    A decade whose texts yield no tokens is reported and skipped instead of
    raising ZeroDivisionError.
    """
    for decade, texts in data_by_decade.items():
        print(f"Decade: {decade}")

        # Tokenize the joined corpus once and reuse it for both counts
        tokens = word_tokenize(' '.join(texts))
        if not tokens:
            print("No tokens found; skipping analysis.\n")
            continue

        print(f"Lexical Diversity: {len(set(tokens)) / len(tokens):.4f}")
        print(f"Average Sentence Length: {average_sentence_length(texts):.2f} words")
        print(f"Top Bigrams: {n_gram_analysis(texts)}")
        print(f"Sentiment Scores: {sentiment_analysis(texts)}")
        print(f"Flesch Reading Ease: {readability_analysis(texts):.2f}\n")

# Example usage
# The original cell was cut off after the bare name `file`, which raises
# NameError; completed here with the same 'data.jsonl' driver used by the
# simpler analysis cell.
file_path = 'data.jsonl'
data_by_decade = load_data(file_path)
analyze_language(data_by_decade)


In [None]:
import torch
from torch import nn
from transformers import BertModel, BertConfig

class HistoricalLLM(nn.Module):
    """Thin ``nn.Module`` wrapper around a BERT encoder.

    The encoder is built from a default ``BertConfig``; no checkpoint is
    loaded in this class.
    """

    def __init__(self):
        super().__init__()
        # Initialize with a pre-trained model or your own configuration
        self.language_model = BertModel(BertConfig())
        # Adapters could be added here if needed

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Run the wrapped language model and return its output unchanged."""
        return self.language_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

    def fine_tune(self, training_data_loader, learning_rate=1e-5, num_epochs=1):
        """Fine-tune all parameters with Adam.

        Args:
            training_data_loader: Iterable of batches; each batch must provide
                'input_ids' and 'attention_mask' entries.
            learning_rate: Learning rate passed to ``torch.optim.Adam``.
            num_epochs: Number of passes over ``training_data_loader``. This
                was previously read from an undefined global name.

        Note:
            ``compute_loss(outputs, batch)`` is not defined in this class; it
            must exist in the enclosing scope and return a scalar tensor.
        """
        self.train()
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

        for _ in range(num_epochs):
            for batch in training_data_loader:
                optimizer.zero_grad()
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                outputs = self(input_ids, attention_mask=attention_mask)
                loss = compute_loss(outputs, batch)  # Define compute_loss based on your task
                loss.backward()
                optimizer.step()

class TimeCapsuleAdapter:
    """Placeholder for the time-capsule adapter; no behavior is defined here."""
    # This is the adapter we discussed earlier
    pass

# Example usage: instantiate the wrapper (this builds the BERT encoder from a default BertConfig)
llm = HistoricalLLM()
# Here you would attach an adapter, fine-tune the model, etc.

# Fine-tuning the model (assuming you have a DataLoader for your historical data);
# left commented out because no `training_data_loader` is defined in this cell.
# llm.fine_tune(training_data_loader)


In [None]:
import torch
from transformers import GPT2Model, GPT2Tokenizer

class TimeCapsuleAdapter:
    """Stores free-text "memories" per period and uses them to prompt GPT-2.

    Attributes are set in ``__init__``:
      * ``model``: the causal language model used for generation
      * ``tokenizer``: the matching tokenizer
      * ``time_capsule_data``: dict mapping a period label to a list of texts
    """

    def __init__(self, model_name='gpt2', time_capsule_data=None):
        # Generation needs the language-modeling head; the bare GPT2Model
        # encoder cannot produce tokens. Imported locally because the cell's
        # import line does not bring this class into scope.
        from transformers import GPT2LMHeadModel

        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.time_capsule_data = time_capsule_data if time_capsule_data is not None else {}

    def add_memory(self, period, memory_text):
        """Append ``memory_text`` to the memories stored for ``period``."""
        self.time_capsule_data.setdefault(period, []).append(memory_text)

    def generate_from_period(self, period, prompt="", max_length=50):
        """Generate a continuation conditioned on a period's memories.

        Args:
            period: Key into ``time_capsule_data``.
            prompt: Text appended after the joined memories.
            max_length: Maximum number of tokens to generate beyond the prompt.

        Returns:
            Only the newly generated text, or a fixed message when the period
            has no stored memories.
        """
        if period not in self.time_capsule_data:
            return "No memories available for this period."

        historical_context = ' '.join(self.time_capsule_data[period])
        full_prompt = f"{historical_context} {prompt}"
        input_ids = self.tokenizer.encode(full_prompt, return_tensors='pt')
        prompt_length = len(input_ids[0])

        # Generating text from the model
        output_sequences = self.model.generate(
            input_ids=input_ids,
            max_length=max_length + prompt_length,
            do_sample=True,  # temperature only takes effect when sampling
            temperature=0.9,
            pad_token_id=self.tokenizer.eos_token_id,  # GPT-2 defines no pad token
        )

        # Decode only the tokens after the prompt. Slicing the decoded string
        # by len(full_prompt) is wrong whenever decoding does not reproduce
        # the prompt character-for-character.
        new_tokens = output_sequences[0][prompt_length:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True)

# Usage example: store one memory for the '1990s' and generate a continuation
# of the prompt conditioned on it. Constructing the adapter loads the 'gpt2'
# model and tokenizer via `from_pretrained`.
time_capsule = TimeCapsuleAdapter()
time_capsule.add_memory('1990s', 'The rise of the internet. The emergence of mobile phones.')
generated_text = time_capsule.generate_from_period('1990s', prompt="In the late 90s,")
print(generated_text)


In [None]:
class PeriodicalAdapter:
    """Pairs a model with the historical period it is responsible for."""

    def __init__(self, model, period):
        # `period` is the label this adapter is matched on; `model` must
        # expose a `generate(text)` method.
        self.period = period
        self.model = model

    def process(self, text):
        """Return whatever the wrapped model's ``generate`` yields for ``text``."""
        generate = self.model.generate
        return generate(text)

class Router:
    """Dispatches a text to the adapter registered for its detected period."""

    def __init__(self, adapters):
        # Iterable of objects exposing `.period` and `.process(text)`
        self.adapters = adapters

    def route(self, text):
        """Process ``text`` with the first adapter whose period matches.

        Returns:
            The adapter's result, or the string "Period not covered" when no
            adapter has the detected period.
        """
        detected = self.analyze_period(text)
        chosen = next(
            (candidate for candidate in self.adapters if candidate.period == detected),
            None,
        )
        if chosen is None:
            return "Period not covered"
        return chosen.process(text)

    def analyze_period(self, text):
        """Determine the period of ``text``.

        Not implemented yet: this placeholder returns ``None``; the detection
        criteria still have to be defined.
        """
        return None


In [None]:
def analyze_period(text, period_keywords=None):
    """Guess the period of ``text`` from case-insensitive keyword matches.

    Args:
        text: Text to inspect.
        period_keywords: Optional mapping from a period label to a list of
            keywords. Defaults to a small built-in table.

    Returns:
        The first period (in mapping order) for which any keyword occurs as a
        substring of ``text``, or "Unknown period" when nothing matches.
    """
    if period_keywords is None:
        period_keywords = {
            '1800s': ['Industrial Revolution', 'Victorian Era'],
            '1900s': ['World War I', 'World War II', 'Cold War'],
            '2000s': ['Internet', 'smartphone', 'social media']
        }

    # Lowercase once instead of once per keyword
    lowered_text = text.lower()
    for period, keywords in period_keywords.items():
        if any(keyword.lower() in lowered_text for keyword in keywords):
            return period

    return "Unknown period"  # Default if no period-specific keywords are found


In [None]:
def determine_period(text, models):
    """Pick the period whose model gives ``text`` the lowest perplexity.

    Args:
        text: Text to score.
        models: Mapping from a period label to a model accepted by
            ``calculate_perplexity(model, text)``.

    Returns:
        The period label with the smallest perplexity; the first one in
        mapping order wins ties.

    Raises:
        ValueError: If ``models`` is empty.
    """
    scored = {}
    for label, candidate in models.items():
        scored[label] = calculate_perplexity(candidate, text)
    # Lowest perplexity = best fit
    return min(scored, key=lambda label: scored[label])

# Example usage
# NOTE: `model_1800s`, `model_1900s` and `model_2000s` are not defined in this
# cell; they must already exist in the kernel (one language model per period)
# before this cell can run.
models = {
    '1800s': model_1800s,
    '1900s': model_1900s,
    '2000s': model_2000s
}

text = "The rise of the internet has transformed society."
period = determine_period(text, models)
print(f"The text likely belongs to the {period} period.")


In [63]:
# import geopy
# from geopy.geocoders import Nominatim
# from collections import defaultdict

# # Assuming you have a dataset where entities and their years are already extracted
# place_mentions_per_year = defaultdict(list)  # {year: [(place_name, entity)]}

# # Geolocator setup
# geolocator = Nominatim(user_agent="geoapiExercises")

# # Example dataset iteration
# for document in tqdm(dataset['train'], total=len(dataset['train'])):
#     metadata = ast.literal_eval(document['metadata'])
#     date = metadata.get('date')
    
#     if not date:
#         continue  # Skip documents without a date

#     year = int(date[:4])
#     decade = year - (year % 10)
    
#     sentences = document['sentences']
#     for tokens, coarse_lit, coarse_meto, fine_lit, fine_meto, fine_comp, ne_nested in zip(
#         sentences['tokens'], sentences['NE-COARSE-LIT'], sentences['NE-COARSE-METO'],
#         sentences['NE-FINE-LIT'], sentences['NE-FINE-METO'], sentences['NE-FINE-COMP'], sentences['NE-NESTED']
#     ):
#         for tags in [coarse_lit, coarse_meto, fine_lit, fine_meto, fine_comp, ne_nested]:
#             entities = get_entities(tokens, tags)
#             print(entities)
#             for entity in entities:
#                 if entity['entity'] == 'loc':  # Assuming 'type' field indicating the entity type
#                     location = geolocator.geocode(entity['word'], exactly_one=True)
#                     if location:
#                         lat, lon = location.latitude, location.longitude
#                         place_mentions_per_year[year].append((entity['word'], (lat, lon)))

# # Now place_mentions_per_year contains place names and their coordinates mentioned per year

# # Example: print places and their coordinates mentioned in 2020
# for place, coords in place_mentions_per_year[2020]:
#     print(f"{place} at coordinates {coords}")
