## Import required libraries

In [1]:
import pandas as pd
import numpy as np
import ast

## Extract data from dataset
### Source: https://www.kaggle.com/datasets/geraygench/mountain-ner-dataset/

In [3]:
# Read the marked-up mountain dataset CSV from the working directory.
mountains_data = pd.read_csv(filepath_or_buffer="mountain_dataset_with_markup.csv")

## Cleaning marker data

In [4]:
def parse_marker(raw):
    """Convert one raw ``marker`` cell into its parsed value, or NaN when empty.

    Strings (the form the column has right after ``pd.read_csv``) are parsed
    with ``ast.literal_eval``. Any other value -- for example a list produced
    by an earlier run of this cell, or NaN -- is passed through unparsed, so
    executing the cell more than once does not raise.

    Returns ``np.nan`` when the (parsed) value is sized and empty, otherwise
    the (parsed) value itself.
    """
    spans = ast.literal_eval(raw) if isinstance(raw, str) else raw
    # A sized-but-empty value (e.g. ``[]``) means nothing is marked in the text.
    if hasattr(spans, "__len__") and len(spans) == 0:
        return np.nan
    return spans


mountains_data["marker"] = mountains_data["marker"].apply(parse_marker)
mountains_data

Unnamed: 0,text,marker
0,A visit to a science museum for hands-on learn...,
1,Voice surface coach set democratic time year. ...,
2,Parent according maybe activity activity finis...,
3,A visit to a sculpture garden with intriguing ...,
4,The Julian Alps in Slovenia offer pristine lak...,"[(11, 15)]"
...,...,...
1579,They never audience meet. Appear region allow ...,
1580,Witnessing the mesmerizing Northern Lights dan...,"[(75, 97)]"
1581,Consumer join stage. Best likely center they p...,
1582,Hospital real school cover hotel over. Any tra...,


## Preparing labels

In [5]:
def create_labels(row):
    """Build one BIO label per whitespace-separated token of ``row["text"]``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row["text"]`` (the sentence) and ``row["marker"]``.
        When ``row["marker"]`` is a list, each element is indexed as
        ``(start, end)`` character offsets into the text; any non-list value
        (such as NaN) means nothing is marked.

    Returns
    -------
    str
        Labels joined by single spaces, one per token of ``text.split()``:
        ``"B-geo"`` for the first token overlapping a marked span,
        ``"I-geo"`` for the remaining tokens of that same span, and ``"O"``
        for every other token.
    """
    text = row["text"]
    tokens = text.split()
    labels = ["O"] * len(tokens)

    markers = row["marker"]
    if not isinstance(markers, list):
        return " ".join(labels)

    # Recover each token's character span. Tokens contain no whitespace and
    # occur in order, so searching from the end of the previous token always
    # lands on the start of the next one.
    token_spans = []
    cursor = 0
    for token in tokens:
        start = text.index(token, cursor)
        cursor = start + len(token)
        token_spans.append((start, cursor))

    # Label by position rather than by word identity: a repeated word outside
    # the span stays "O", and a token with attached punctuation ("Everest,")
    # is still tagged because it overlaps the span.
    for marker in markers:
        span_start, span_end = marker[0], marker[1]
        inside = False  # reset per span so every entity starts with B-geo
        for index, (token_start, token_end) in enumerate(token_spans):
            if token_start < span_end and token_end > span_start:
                # Keep the label from an earlier span if two spans share a token.
                if labels[index] == "O":
                    labels[index] = "I-geo" if inside else "B-geo"
                inside = True

    return " ".join(labels)


# Run create_labels on every row and keep the joined label strings in a
# new "labels" column.
mountains_data["labels"] = mountains_data.apply(create_labels, axis="columns")

mountains_data

Unnamed: 0,text,marker,labels
0,A visit to a science museum for hands-on learn...,,O O O O O O O O O
1,Voice surface coach set democratic time year. ...,,O O O O O O O O O O O O O O O O O O
2,Parent according maybe activity activity finis...,,O O O O O O O O O O O O O O O O
3,A visit to a sculpture garden with intriguing ...,,O O O O O O O O O
4,The Julian Alps in Slovenia offer pristine lak...,"[(11, 15)]",O O B-geo O O O O O O O O
...,...,...,...
1579,They never audience meet. Appear region allow ...,,O O O O O O O O O O O
1580,Witnessing the mesmerizing Northern Lights dan...,"[(75, 97)]",O O O O O O O O O O O B-geo I-geo O O
1581,Consumer join stage. Best likely center they p...,,O O O O O O O O O O O O O O O O O
1582,Hospital real school cover hotel over. Any tra...,,O O O O O O O O O O O


## Saving dataset

In [7]:
# Write the labelled frame to disk; index=False leaves the pandas index out of the file.
mountains_data.to_csv(path_or_buf="labeled_mountains_dataset.csv", index=False)