In [10]:
# !pip install -r requirements.txt

In [24]:
# !pip show torch transformers pandas

In [23]:
import string

import pandas as pd
from sklearn.model_selection import train_test_split

In [13]:
# Sample sentences (generated via ChatGPT) that serve as the raw text to be BIO-tagged
sentences = [
    "The Himalayas are a majestic mountain range in Asia.",
    "Denali, formerly known as Mount McKinley, is the highest peak in North America.",
    "The Andes run through several South American countries, including Chile and Peru.",
    "The Rocky Mountains are a famous range in North America, stretching from Canada to New Mexico.",
    "Mount Everest is the largest peak in the world.",
    "K2 is known as the Savage Mountain.",
    "The Alps are famous for their breathtaking landscapes and winter sports.",
    "Mount Godwin-Austen , often called K2 , is a challenging peak to climb.",
    "Climbers from around the world gather to ascend Mount Everest each year.",
    "Mount Kilimanjaro is located in Tanzania and is Africa's highest peak.",
    "Mount Fuji is a sacred mountain in Japan and a popular tourist attraction.",
    "The Pyrenees form a natural border between Spain and France.",
    "Climbers dream of reaching the summit of Mount Elbrus, the highest peak in Europe.",
    "The Karakoram range, home to K2 , is renowned for its rugged terrain and high peaks.",
    "Aconcagua is the tallest mountain in South America and is part of the Andes."
]

In [14]:
# Gazetteer of mountain names to look for in the sentences.
# create_bio_tags splits each name on whitespace and matches it as a
# consecutive token sequence, so multi-word names are listed in full.
mountain_names = [
    "Mount Everest", "K2", "The Alps", "Mount Godwin-Austen", "Savage Mountain", "Mount Kilimanjaro", 
    "Himalayas", "Denali", "Mount McKinley", "Andes", "Rocky Mountains", "Mount Fuji", 
    "Pyrenees", "Mount Elbrus", "Karakoram Range", "Aconcagua", "Karakoram"
]

In [15]:
def create_bio_tags(sentences, mountain_names):
    """Tag the whitespace-split tokens of each sentence with BIO mountain labels.

    Each sentence is split on whitespace. A run of tokens that matches a
    mountain name gets "B-MOUNTAIN" on its first token and "I-MOUNTAIN" on the
    remaining ones; every other token is labelled "O".

    Matching ignores punctuation attached to the edges of a token, so
    "Denali," and "Mountain." still match "Denali" and "Mountain". The tokens
    returned are the original, unmodified words. Matching is case-sensitive.

    Longer names are matched before shorter ones and a token that already
    belongs to a mountain span is never re-labelled, so "Everest" cannot
    overwrite part of an earlier "Mount Everest" match.

    Args:
        sentences: Iterable of sentence strings.
        mountain_names: Iterable of mountain names; blank names are ignored.

    Returns:
        A list with one entry per sentence, each entry being a list of
        (token, label) tuples in sentence order.
    """

    def _normalize(token):
        # Strip punctuation glued to the token edges; keep a pure-punctuation
        # token as-is so it can still only match itself.
        return token.strip(string.punctuation) or token

    # Tokenize every name once. Blank names are dropped because an empty
    # pattern would match at every position. The sort is stable, so names of
    # equal length keep their original relative order.
    name_patterns = sorted(
        (
            [_normalize(part) for part in name.split()]
            for name in mountain_names
            if name.split()
        ),
        key=len,
        reverse=True,
    )

    bio_dataset = []
    for sentence in sentences:
        words = sentence.split()  # Simple tokenization (by spaces)
        normalized = [_normalize(word) for word in words]
        labels = ["O"] * len(words)  # Default label is "O" (Outside)

        for pattern in name_patterns:
            width = len(pattern)
            for start in range(len(words) - width + 1):
                end = start + width
                if normalized[start:end] != pattern:
                    continue
                # Skip spans that overlap an already-tagged mountain.
                if any(label != "O" for label in labels[start:end]):
                    continue
                labels[start] = "B-MOUNTAIN"
                labels[start + 1:end] = ["I-MOUNTAIN"] * (width - 1)

        bio_dataset.append(list(zip(words, labels)))
    return bio_dataset

In [16]:
# Build the dataset in BIO format: one list of (token, label) pairs per sentence
bio_dataset = create_bio_tags(sentences, mountain_names)

In [26]:
# Reshape the tagged sentences into one record per sentence, holding the
# token list and the parallel label list, then load them into a DataFrame.
data = [
    {"tokens": list(tokens), "tags": list(tags)}
    for tokens, tags in (zip(*tagged_sentence) for tagged_sentence in bio_dataset)
]

df = pd.DataFrame(data)

In [27]:
# Encode the string tags as integer ids.
# The column is overwritten in place, so the conversion leaves already-encoded
# ids untouched: re-running this cell is then a no-op instead of a KeyError.
# Unknown labels still raise KeyError.
tag_map = {"O": 0, "B-MOUNTAIN": 1, "I-MOUNTAIN": 2}
tag_ids = set(tag_map.values())
df["tags"] = df["tags"].apply(
    lambda tags: [tag if tag in tag_ids else tag_map[tag] for tag in tags]
)

In [10]:
# Split the rows (one per sentence) into training and test sets, holding out
# 20% for testing; the fixed random_state makes the split reproducible.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [11]:
# Save the data as CSV files in the current working directory (no index column)
df.to_csv("processed_mountain_data.csv", index=False)  # Complete dataset
train_data.to_csv("train_data.csv", index=False)       # Training split
test_data.to_csv("test_data.csv", index=False)         # Test split

# Report which files were written
print("Data has been saved to the following files:")
print("- Full dataset: processed_mountain_data.csv")
print("- Training set: train_data.csv")
print("- Test set: test_data.csv")

Data has been saved to the following files:
- Full dataset: processed_mountain_data.csv
- Training set: train_data.csv
- Test set: test_data.csv
