In [41]:
import os
import requests
import pandas as pd
import random
from bs4 import BeautifulSoup
import functools

from typing import Iterable, Callable, Any, Optional

pd.set_option('display.max_columns', 100)

In [42]:
# splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
# df = pd.read_parquet("hf://datasets/telord/ner-mountains-first-dataset/" + splits["train"])

In [43]:
# df.head()['labels']

In [44]:
def acquire_mountain_sentences(url: str, n_sentences: Optional[int]=None) -> Iterable[str]:
    '''
    potential sources:
    - https://sentence.yourdictionary.com/mountain
    - https://gikken.co/mate-translate/sentences/english/mountain
    '''
    
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        sentence_container = soup.find("div", class_="example-sentences")
        sentences = [p.get_text(strip=True) for p in sentence_container.find_all("p")]
        cleaned_sentences = [s.replace("Translate from English to English", "").strip() for s in sentences]
        return cleaned_sentences if n_sentences is None else cleaned_sentences[:n_sentences]
    
    return []

In [45]:
urls = [
    'https://gikken.co/mate-translate/sentences/english/mountain',
    'https://gikken.co/mate-translate/sentences/english/mountains']

# merge list of lists of sentences into single list of sentences
sentences = sum(list(map(acquire_mountain_sentences, urls)), [])

In [46]:
len(sentences), sentences[:10]

(202,
 ["We think it is very dangerous that you're climbing the mountain alone.",
  'Seen against the sky, the mountain looked really beautiful.',
  'The contrast between the sky and the mountain is striking.',
  'They hate him because he gives them a mountain of homework.',
  'John would often go mountain climbing when he was a student.',
  'We headed for the mountain cottage.',
  'We crawled like so many ants along the mountain pass.',
  'We stood on the top of the mountain.',
  'We climbed up the stream in the mountain.',
  'We had native guides on our trip to the mountain.'])

In [47]:
def random_replace_all(sentences, keywords: str, replacements):
    '''replace all keywords with random word from replacements'''
    
    def replace_keyword(sentences, keyword):
        '''special case of outer function'''
        return tuple(map(lambda s: s.lower().replace(keyword, random.choice(replacements)), sentences))

    return functools.reduce(replace_keyword, keywords, sentences)

In [59]:
# used chatgpt to generate mountain names tuple
mountain_names = (
    'Mount Everest', 'K2', 'Denali', 'Kangchenjunga', 'Lhotse', 'Makalu', 
    'Cho Oyu', 'Dhaulagiri', 'Manaslu', 'Annapurna', 'Nanga Parbat', 
    'Mount Elbrus', 'Mount Kilimanjaro', 'Mount Fuji', 'Matterhorn', 
    'Mont Blanc', 'Pico de Orizaba', 'Mount Rainier', 'Mount Aconcagua', 
    'Mount Vinson', 'Puncak Jaya')

not_mountain_names = (
    'mountains', 'mountain')

replacements = list(mountain_names) + list(not_mountain_names)

words_to_replace = ('mountains', 'mountain')

# keyword order matters
# mountain_sentences = random_replace_all(sentences, words_to_replace, mountain_names)
mountain_sentences = random_replace_all(sentences, words_to_replace, replacements)

In [60]:
mountain_sentences[:10]

("we think it is very dangerous that you're climbing the Cho Oyu alone.",
 'seen against the sky, the Manaslu looked really beautiful.',
 'the contrast between the sky and the mountains is striking.',
 'they hate him because he gives them a Mont Blanc of homework.',
 'john would often go Matterhorn climbing when he was a student.',
 'we headed for the Manaslu cottage.',
 'we crawled like so many ants along the mountains pass.',
 'we stood on the top of the Dhaulagiri.',
 'we climbed up the stream in the Nanga Parbat.',
 'we had native guides on our trip to the Mount Vinson.')

In [50]:
def save_sentences_to_csv(sentences, filename: str='sentences.csv') -> None:
    import csv
    
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
    
        for sentence in sentences:
            writer.writerow([sentence])
        
        print(f'successfully saved sentences to {filename}')

In [61]:
save_sentences_to_csv(mountain_sentences)

successfully saved sentences to sentences.csv


In [62]:
df = pd.read_csv('sentences.csv', header=None, names=['sentence'])

print(df.shape)
df.head()

(202, 1)


Unnamed: 0,sentence
0,we think it is very dangerous that you're clim...
1,"seen against the sky, the Manaslu looked reall..."
2,the contrast between the sky and the mountains...
3,they hate him because he gives them a Mont Bla...
4,john would often go Matterhorn climbing when h...


In [63]:
def annotate_mountains(sentence, mountains):
    '''annotate a single sentence with mountain names using B-MOUNT and I-MOUNT tags for BERT'''
    
    words = sentence.split()
    annotations = ["O"] * len(words)
    
    # filter to skip iteration if no mountain found in sentence
    for mountain_name in filter(lambda m: m in sentence, mountains):
        mountain_words = mountain_name.split()
        # acquire next (first) index of the word in sentence that matches whole mountain name
        start_index = next(
            (i for i, _ in enumerate(words) if words[i:i+len(mountain_words)] == mountain_words), 
            None  # to avoid stopiteration exception if iterator exhausted
        )
        
        if start_index is not None:
            for i in range(len(mountain_words)):
                # set B-MOUNT if its the first word of a mount name I-MOUNT otherwise
                annotations[start_index + i] = "B-MOUNT" if i == 0 else "I-MOUNT"
    
    return " ".join(annotations)

In [64]:
annotate_ = functools.partial(annotate_mountains, mountains=mountain_names)

df['annotation'] = df['sentence'].apply(annotate_)

In [65]:
df.head()

Unnamed: 0,sentence,annotation
0,we think it is very dangerous that you're clim...,O O O O O O O O O O B-MOUNT I-MOUNT O
1,"seen against the sky, the Manaslu looked reall...",O O O O O B-MOUNT O O O
2,the contrast between the sky and the mountains...,O O O O O O O O O O
3,they hate him because he gives them a Mont Bla...,O O O O O O O O B-MOUNT I-MOUNT O O
4,john would often go Matterhorn climbing when h...,O O O O B-MOUNT O O O O O O


In [66]:
save_path = 'sentences_annotated.csv'

df.to_csv(save_path)

In [67]:
new_df = pd.read_csv(save_path)

In [68]:
new_df.head()

Unnamed: 0.1,Unnamed: 0,sentence,annotation
0,0,we think it is very dangerous that you're clim...,O O O O O O O O O O B-MOUNT I-MOUNT O
1,1,"seen against the sky, the Manaslu looked reall...",O O O O O B-MOUNT O O O
2,2,the contrast between the sky and the mountains...,O O O O O O O O O O
3,3,they hate him because he gives them a Mont Bla...,O O O O O O O O B-MOUNT I-MOUNT O O
4,4,john would often go Matterhorn climbing when h...,O O O O B-MOUNT O O O O O O
