In [1]:
import csv
import random
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

The dataset was built by scraping three web pages, which together yielded 1101 sentences containing the word "mountain" or "mountains". Each of these keywords was then replaced with a mountain name taken from a predefined list. Only the lowercase forms "mountain" and "mountains" are replaced: some sentences already contain real mountain names, but labeling those automatically would be difficult. To keep the mountain names balanced, each replacement picks a name at random from those not yet used and records it; once every name has been used, the record is reset and the random selection starts over. Finally, the mountain names are labeled with BIO tags: 'B-MOUNTAIN' marks the first token of a mountain name (or the whole name, if it consists of a single word that the tokenizer does not split), 'I-MOUNTAIN' marks each following token of the same name (when the name consists of several words or the tokenizer splits it), and 'O' marks every other token that is not part of a mountain name.


In [2]:
# Scraping sentences from the first website
url_1 = 'https://gikken.co/mate-translate/sentences/english/mountain'
# A timeout keeps a stalled connection from hanging the kernel indefinitely
response_1 = requests.get(url_1, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the sentence list
response_1.raise_for_status()
soup_1 = BeautifulSoup(response_1.text, 'html.parser')  # Parsing HTML response

# The example sentences are the <p> tags inside the div with class 'info-block example-sentences'
examples_div_1 = soup_1.find('div', class_='info-block example-sentences')
if examples_div_1 is None:
    # find() returns None when the container is missing (e.g. the page layout changed)
    raise RuntimeError(f"Could not find the example-sentences block on {url_1}")

# List to store sentences containing the word "mountain"
sentences_with_mountains_1 = []
for p_tag in examples_div_1.find_all('p'):
    # Drop anchor tags so their link text does not end up inside the sentence
    for a_tag in p_tag.find_all('a'):
        a_tag.extract()
    sentences_with_mountains_1.append(p_tag.text.strip())  # Appending cleaned sentence text
len(sentences_with_mountains_1)  # Checking the number of sentences scraped

101

In [3]:
# Scraping sentences from the second website
url_2 = 'https://sentence.yourdictionary.com/mountain'
# A timeout keeps a stalled connection from hanging the kernel indefinitely
response_2 = requests.get(url_2, timeout=30)
# Fail loudly on 4xx/5xx instead of silently scraping zero sentences from an error page
response_2.raise_for_status()
soup_2 = BeautifulSoup(response_2.text, 'html.parser')  # Parsing HTML response

# Sentences containing the word "mountain": one per <p class="sentence-item__text">
sentences_with_mountains_2 = [
    sentence.text.strip()
    for sentence in soup_2.find_all('p', class_='sentence-item__text')
]
len(sentences_with_mountains_2)  # Checking the number of sentences scraped

500

In [4]:
# Scraping sentences from the third website
url_3 = 'https://sentence.yourdictionary.com/mountains'
# A timeout keeps a stalled connection from hanging the kernel indefinitely
response_3 = requests.get(url_3, timeout=30)
# Fail loudly on 4xx/5xx instead of silently scraping zero sentences from an error page
response_3.raise_for_status()
soup_3 = BeautifulSoup(response_3.text, 'html.parser')  # Parsing HTML response

# Sentences containing the word "mountains": one per <p class="sentence-item__text">
sentences_with_mountains_3 = [
    sentence.text.strip()
    for sentence in soup_3.find_all('p', class_='sentence-item__text')
]
len(sentences_with_mountains_3)  # Checking the number of sentences scraped

500

In [6]:
# Combining all scraped sentences into a single list
all_sentences_with_mountains = sentences_with_mountains_1 + sentences_with_mountains_2 + sentences_with_mountains_3

# Writing the sentences into a CSV file
output_file = "./data/all.csv"
# Create the target directory first so open() does not fail with FileNotFoundError on a fresh checkout
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
# newline='' lets the csv module control line endings itself
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Sentence"])  # Writing header row
    # One single-column row per sentence
    writer.writerows([sentence] for sentence in all_sentences_with_mountains)

In [10]:
# Names substituted for the keywords "mountain"/"mountains" and later tagged as MOUNTAIN entities.
# Every entry is unique: a repeated entry would be drawn more often by random.choice.
# NOTE(review): "Kermadec Islands", "Hubbard Glacier" and "Punakha" do not look like mountains or
# mountain ranges - confirm whether they should stay in the list.
mountains_names = ['Goverla',
    "Mount Everest", "K2", "Kangchenjunga", "Lhotse", "Makalu",
    "Cho Oyu", "Dhaulagiri", "Manaslu", "Nanga Parbat", "Annapurna",
    "Kilimanjaro", "Elbrus", "Fuji", "Denali", "Matterhorn",
    "McKinley", "Rainier", "Whitney", "Mont Blanc", "Aconcagua",
    "Roraima", "St. Helens", "Hood", "Kosciuszko", "Ararat",
    "Vinson", "Erebus", "Vesuvius", "Etna", "Table Mountain",
    "Adams", "Baker", "Goliath", "Taranaki", "Shasta", "Olympus",
    "Toubkal", "Cervino", "Andes", "Huascarán", "Namjagbarwa",
    "Bogda", "Trikora", "Damavand", "Musala", "Siguniang",
    "Kinabalu", "Meru", "Puncak Jaya", "Kailash", "Rinjani",
    "Pico", "Nanda Devi", "Khao Luang", "Holy Cross", "Waddington",
    "Asgard", "Tambora", "Chimborazo", "Sorata", "Tien Shan",
    "Krn", "Helicon", "Darwin", "Inyangani", "Sawtooth",
    "Aiguille", "Takao", "Sonnblick", "Olomana", "Cuyamaca",
    "Bardsey", "Foraker", "Crillon", "Meile", "Bastille",
    "Perisher", "Selkirk", "Brazeau", "Cathedral", "Chirripó",
    "Batur", "Okmok", "Wind River Mountains", "Alps", "Rockies",
    "Pyrenees", "Carpathians", "Himalayas", "Caucasus Mountains",
    "Vanoise Massif", "Sierra Nevada", "Titicaca Mountains", "Cotswolds",
    "Blue Ridge Mountains", "Cascade Range", "Appalachians", "Ural Mountains",
    "Tian Shan", "Atlas Mountains", "Dolomites", "Great Smoky Mountains",
    "Zagros Mountains", "Alborz Mountains", "Ayrshire Hills", "Beartooth Mountains",
    "Sierra Madre", "Cordillera Blanca", "Sierra de Guadarrama", "Black Hills",
    "Wasatch Range", "Saskatchewan Mountains", "Sierra de los Cuchumatanes",
    "Sayan Mountains", "Giant Mountains", "Massif Central", "Pennines",
    "Apennines", "White Mountains", "Chugach Mountains", "Tatra Mountains",
    "Pindus Mountains", "Fitz Roy", "Mount Cook", "Sandia Mountains",
    "Red Mountains", "Picos de Europa", "Wuling Mountains", "Cerro Torre",
    "Cerro Fitz Roy", "Rangitoto", "Mount Wilhelm", "Yushan", "Lennox Hills",
    "Hindu Kush", "Nimba Mountains", "Khumbu Mountains", "Sierra Nevada de Santa Marta",
    "Timor Mountains", "Kermadec Islands", "Hindu Kush Mountains", "Hubbard Glacier",
    "Mount Lemmington", "Mount Apo", "Mount Khao Luang", "Vinales Mountains",
    "Mount Tai", "Sierra de San Pedro Martir", "Yangtze Mountains", "Caribbean Mountains",
    "Whistler Mountain", "Mount Banahaw", "Balkan Mountains", "Tianmu Mountains",
    "Pico Duarte", "San Juan Mountains", "Hohe Tauern", "Mauna Kea",
    "Sangre de Cristo Mountains", "Punakha", "Sierra Madres", "Chugach Range", "Karakol Mountains",
    "Mount Fuji", "Mount Kilimanjaro", "Mount McKinley", "Mount Rainier",
    "Mount Denali", "Mount Blanc", "Mount Ararat", "Mount Elbrus", "Mount St. Helens",
    "Mount Hood", "Mount Taranaki", "Mount Shasta", "Mount Olympus", "Mount Puncak Jaya",
    "Mount Meru", "Mount Aconcagua", "Mount Kosciuszko", "Mount Whitney", "Mount Matterhorn",
    "Mount Fuego", "Mount Toubkal", "Mount Thor", "Mount Goliath", "Mount Adams",
    "Mount Cerro Torre", "Mount Vesuvius", "Mount Vinson",
    'Red Mountain', 'Blue Mountains', 'Rocky Mountains'
]



In [None]:
# Load the scraped sentences from disk and expose the 'Sentence' column as a plain Python list
all_data = pd.read_csv('./data/all.csv')
mountains_sentences = all_data['Sentence'].tolist()

# Function to replace keywords ('mountain', 'mountains') in sentences with random mountain names
def replace_with_mountains(sentences, mountain_names, seed=None):
    """
    Replaces occurrences of 'mountain' or 'mountains' in sentences with random mountain names.

    Matching is case-sensitive, so capitalized forms such as 'Mountain' are left untouched.
    Names are drawn without repetition: a name is not reused until every distinct name in
    `mountain_names` has been used once, after which the bookkeeping is reset. The bookkeeping
    is shared across all sentences of one call.

    Parameters:
        sentences (list): A list of sentences containing the keywords to be replaced.
        mountain_names (list): A list of mountain names to use as replacements.
        seed (optional): If given, names are drawn from a private `random.Random(seed)`, so the
            result is reproducible and the global `random` state is not touched. If None
            (default), the global `random` module is used, exactly as before.

    Returns:
        list: A list of sentences with keywords replaced by mountain names. Sentences without
        a keyword are returned unchanged, and the order of `sentences` is preserved.

    Raises:
        IndexError: If a keyword is found while `mountain_names` is empty (raised by `choice`).
    """
    # The `random` module itself exposes choice(), so it can stand in for a Random instance
    rng = random if seed is None else random.Random(seed)

    keywords = ['mountain', 'mountains']  # Keywords to replace
    # Whole-word match only; the optional trailing 's' is redundant with the 'mountains' keyword
    keyword_pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, keywords)) + r')s?\b')
    used_mountains = set()  # Names already handed out in the current round
    all_sentences = []  # List to store modified sentences

    for sentence in sentences:
        new_sentence = sentence
        # Replace right-to-left so the offsets of the earlier matches stay valid
        for matching in reversed(list(keyword_pattern.finditer(sentence))):
            available_mountains = [m for m in mountain_names if m not in used_mountains]
            if not available_mountains:
                # Every name has been used once: start a new round over the full list
                used_mountains.clear()
                available_mountains = mountain_names
            mountain = rng.choice(available_mountains)
            used_mountains.add(mountain)

            start, end = matching.span()  # Get start and end indices of the match
            new_sentence = new_sentence[:start] + mountain + new_sentence[end:]

        all_sentences.append(new_sentence)

    return all_sentences

# Function to generate BIO tags for mountain names in sentences
def make_bio_tags(sentences, mountain_names):
    """
    Generates BIO tags for identifying mountain names in sentences.

    Each sentence is split into tokens (runs of word characters, plus '.' as its own token).
    Every occurrence of every mountain name is tagged: the first token of a name gets
    'B-MOUNTAIN', its remaining tokens get 'I-MOUNTAIN', and all other tokens get 'O'.
    Longer names are matched before shorter ones and already-tagged tokens are never
    re-tagged, so the result does not depend on the order of `mountain_names` when one
    name contains another (e.g. 'Mount Fuji' and 'Fuji').

    Parameters:
        sentences (list): A list of sentences containing mountain names.
        mountain_names (list): A list of mountain names to tag in the sentences.

    Returns:
        list: A list of BIO tags corresponding to the tokens in each sentence.
    """
    token_pattern = re.compile(r'\b\w+\b|\.')

    # Tokenize every name once instead of once per sentence, drop duplicates, and order the
    # names longest-first (the sort is stable, so equal-length names keep their list order).
    unique_names = dict.fromkeys(
        tuple(token_pattern.findall(mountain)) for mountain in mountain_names
    )
    # A name that yields no tokens (e.g. an empty string) would "match" everywhere; skip it
    name_token_lists = [
        list(name) for name in sorted(unique_names, key=len, reverse=True) if name
    ]

    all_labels = []  # List to store BIO tags for all sentences

    for sentence in sentences:
        tokens = token_pattern.findall(sentence)  # Tokenize sentence
        labels = ["O"] * len(tokens)  # Initialize all labels as "O" (outside)

        for mountain_tokens in name_token_lists:
            width = len(mountain_tokens)
            i = 0
            while i + width <= len(tokens):
                # Only claim a span that no longer name has tagged already
                span_is_free = all(label == "O" for label in labels[i:i + width])
                if span_is_free and tokens[i:i + width] == mountain_tokens:
                    labels[i] = "B-MOUNTAIN"
                    labels[i + 1:i + width] = ["I-MOUNTAIN"] * (width - 1)
                    i += width  # Continue after this occurrence to find further ones
                else:
                    i += 1

        all_labels.append(labels)  # Add labels for the sentence

    return all_labels

In [8]:
# Hold out 20% of the scraped sentences for validation; the fixed random_state makes the split repeatable
sentences = pd.read_csv('./data/all.csv')
train_sentences, val_sentences = train_test_split(
    sentences,
    test_size=0.2,
    random_state=42,
)

# Persist both parts without the pandas index column
train_sentences.to_csv('./data/train_data.csv', index=False)
val_sentences.to_csv('./data/val_data.csv', index=False)

In [11]:
# Reading back the split datasets
train = pd.read_csv('./data/train_data.csv')
val = pd.read_csv('./data/val_data.csv')

# replace_with_mountains draws names with the global `random` module; seeding it here makes
# re-running this cell regenerate exactly the same dataset
random.seed(42)

# Replacing keywords in sentences with random mountain names for training and validation datasets
train_replaced = replace_with_mountains(train['Sentence'].tolist(), mountains_names)
val_replaced = replace_with_mountains(val['Sentence'].tolist(), mountains_names)

# Generating BIO tags for the modified sentences
train_tags = make_bio_tags(train_replaced, mountains_names)
val_tags = make_bio_tags(val_replaced, mountains_names)

# Adding modified sentences and tags as new columns to the training and validation datasets
train['new_sentence'] = train_replaced
train['tags'] = train_tags
val['new_sentence'] = val_replaced
val['tags'] = val_tags

# Tokenizing the modified sentences and adding as a new column
# (same pattern as make_bio_tags, so tokens and tags line up one-to-one)
train['tokens'] = train['new_sentence'].apply(lambda sentence: re.findall(r'\b\w+\b|\.', sentence))
val['tokens'] = val['new_sentence'].apply(lambda sentence: re.findall(r'\b\w+\b|\.', sentence))

# Every token must have exactly one tag; catch tokenizer/tagger drift before writing the files
assert all(len(tokens) == len(tags) for tokens, tags in zip(train['tokens'], train['tags'])), \
    "train: tokens and tags differ in length"
assert all(len(tokens) == len(tags) for tokens, tags in zip(val['tokens'], val['tags'])), \
    "val: tokens and tags differ in length"

# Saving the modified datasets to new CSV files
# (the list-valued 'tags' and 'tokens' columns are stored as their string representation)
train.to_csv('./data/train_data_modified.csv', index=False)
val.to_csv('./data/val_data_modified.csv', index=False)