In [26]:
# Mount Google Drive at /content/drive; the project paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
!pip install spacy
!python -m spacy download ru_core_news_lg

Collecting ru-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.8.0/ru_core_news_lg-3.8.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [28]:
import xml.etree.ElementTree as ET
import pandas as pd
import json
import re
import numpy as np
from pathlib import Path
from typing import Dict, List

# Project layout under the mounted Drive: raw XML input and the directory for processed output.
PROJECT_ROOT = Path("/content/drive/MyDrive/Metaphor-Classification-NLP")
XML_PATH = PROJECT_ROOT / "data" / "raw" / "ru_large.xml"
DATA_PROC = PROJECT_ROOT / "data" / "processed"

# Parse the whole XML file into memory; `root` is the document's root element.
print(f"Parsing {XML_PATH}")
tree = ET.parse(XML_PATH)
root = tree.getroot()

print(" Parsed XML tree")


Parsing /content/drive/MyDrive/Metaphor-Classification-NLP/data/raw/ru_large.xml
 Parsed XML tree


In [29]:
def parse_instance(elem):
    """Turn one instance element into a flat record.

    Args:
        elem: an ElementTree element carrying ``id`` / ``docid`` attributes and,
            somewhere below it, a ``Current`` element and zero or more
            ``MetaphoricityAnnotation`` elements.

    Returns:
        dict with the keys ``id``, ``docid``, ``current_raw`` (the serialized
        ``Current`` element including its nested tags, or ``''`` when absent),
        ``current`` (always ``''`` here; filled in by a later step) and
        ``met_score`` (mean of the annotations' ``score`` attributes, ``0.0``
        when there are no annotations).
    """
    # Serialize the first <Current> descendant with its markup intact so that
    # nested tags can still be inspected later.
    current_node = elem.find('.//Current')
    if current_node is None:
        raw_markup = ''
    else:
        raw_markup = ET.tostring(current_node, encoding='unicode')

    # Collect every annotation score; a missing `score` attribute counts as 0.
    annotation_scores = []
    for annotation in elem.findall('.//MetaphoricityAnnotation'):
        annotation_scores.append(float(annotation.get('score', 0)))

    if annotation_scores:
        mean_score = np.mean(annotation_scores)
    else:
        mean_score = 0.0

    return {
        'id': elem.get('id'),
        'docid': elem.get('docid'),
        'current_raw': raw_markup,
        'current': '',
        'met_score': mean_score,
    }

# Build one record per <LmInstance> element that is a direct child of the root.
instances = [parse_instance(inst) for inst in root.findall('LmInstance')]
df_ru_raw = pd.DataFrame(instances)
print(f" {len(df_ru_raw)} instances")
print("Sample raw:")
# Show at most the first 200 characters of the first instance's raw markup.
print(df_ru_raw['current_raw'].iloc[0][:200] + "...")


✅ 64019 instances
Sample raw:
<Current>Израильское <LmTarget>лобби</LmTarget> в США <LmSource>сильно</LmSource>, при этом Иран действительно напрямую угрожает еврейскому государству.</Current>
      ...


In [30]:
def clean_current(raw):
    """Return `raw` with every ``<...>`` tag deleted and outer whitespace trimmed.

    Only the tags themselves are removed; the text between them is kept as is.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    without_tags = tag_pattern.sub('', raw)
    return without_tags.strip()

def get_source_words(raw):
    """Return the whitespace-separated words inside the first ``<LmSource>`` tag.

    Only the first ``<LmSource>...</LmSource>`` pair in `raw` is considered
    (the content may span several lines). Returns an empty list when `raw`
    contains no such pair.
    """
    found = re.search(r'<LmSource>(.*?)</LmSource>', raw, re.DOTALL)
    if found is None:
        return []
    return found.group(1).split()

def assign_token_labels(words, source_words, instance_label):
    """Build one label per word.

    A word receives `instance_label` when it occurs in `source_words`
    (exact match, so every occurrence of such a word is labelled); all other
    words receive 0.
    """
    labels = []
    for word in words:
        if word in source_words:
            labels.append(instance_label)
        else:
            labels.append(0)
    return labels

# Token normalisation helper: removes punctuation glued to the end of a token.
def clean_source_words(words):
    """Strip trailing commas and periods from every string in `words`.

    Any run of ',' and '.' characters at the end of a string is removed;
    non-string items are passed through unchanged. Returns a new list.
    """
    cleaned = []
    for item in words:
        if isinstance(item, str):
            cleaned.append(item.rstrip(',.'))
        else:
            cleaned.append(item)
    return cleaned


# Work on a copy so df_ru_raw stays untouched and this cell can be re-run.
df_ru = df_ru_raw.copy()
df_ru['current'] = df_ru['current_raw'].apply(clean_current)

# Whitespace tokenisation, then strip trailing ',' / '.' from every token.
df_ru['words'] = df_ru['current'].str.split()
df_ru['words'] = df_ru['words'].apply(clean_source_words)

# Normalise the source words exactly like the tokens. Labels are assigned by
# exact string match, so a source span that ends in punctuation would
# otherwise never match its already-stripped token. Source words that consist
# only of punctuation become empty after stripping and are dropped.
df_ru['source_words'] = df_ru['current_raw'].apply(get_source_words)
df_ru['source_words'] = df_ru['source_words'].apply(
    lambda ws: [w for w in clean_source_words(ws) if w]
)


# Instance label first (met_score >= 0.5)
df_ru['label'] = (df_ru['met_score'] >= 0.5).astype(int)

# Token labels: copy instance label to source tokens
df_ru['token_labels'] = df_ru.apply(
    lambda r: assign_token_labels(r['words'], r['source_words'], r['label']),
    axis=1
)

# Document identifier: fixed corpus prefix plus the instance's docid.
df_ru['document_name'] = 'ru_large-' + df_ru['docid'].astype(str)

print(df_ru[['id', 'label', 'current', 'source_words', 'token_labels']].head())




    id  label                                            current  \
0   60      1  Израильское лобби в США сильно, при этом Иран ...   
1  296      0  Неофициально один из украинских чиновников поя...   
2  333      1       Раскол серьезный, но при этом раскол разный.   
3  340      0  Мы очень часто и много говорим об американском...   
4  451      0  «Никаких революций или горбачевских приступов ...   

     source_words                                       token_labels  
0        [сильно]            [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]  
1  [Неофициально]  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2     [серьезный]                              [0, 1, 0, 0, 0, 0, 0]  
3       [тянутся]  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4         [будет]                           [0, 0, 0, 0, 0, 0, 0, 0]  


Dataset source: https://github.com/lcc-api/metaphor/tree/main

In [31]:
# Keep only the columns used from here on: document_name, words, token_labels, met_score
df_ru = df_ru[['document_name', 'words', 'token_labels', 'met_score']]

In [32]:
# Keep only rows whose 'token_labels' list contains at least one 1.
# .copy() makes the result an independent frame: later cells assign and drop
# columns on df_ru, which would otherwise operate on a slice of the previous
# frame and raise SettingWithCopyWarning.
df_ru = df_ru[df_ru['token_labels'].apply(lambda x: 1 in x)].copy()

In [33]:
# (rows, columns) remaining after the filter above.
df_ru.shape

(11878, 4)

In [34]:
# Preview the first rows of the filtered frame.
df_ru.head()

Unnamed: 0,document_name,words,token_labels,met_score
0,ru_large-4,"[Израильское, лобби, в, США, сильно, при, этом...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",2.0
2,ru_large-18,"[Раскол, серьезный, но, при, этом, раскол, раз...","[0, 1, 0, 0, 0, 0, 0]",1.0
7,ru_large-48,"[В, дальнейшем, в, республике, будет, осуществ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.0
8,ru_large-8544,"[Называть, так, органы, исполнительной, власти...","[0, 0, 1, 0, 0, 0, 0, 0, 0]",1.0
10,ru_large-8558,"[а, дл, начала, сменить, воровскую, власть, на...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",2.0


In [37]:
# Token filter: the "words" lists should contain words only, no punctuation tokens.
def filter_words_and_labels(row):
    """Keep only purely alphabetic tokens together with their labels.

    Args:
        row: mapping with a "words" sequence of strings and a parallel
            "token_labels" sequence.

    Returns:
        Tuple ``(words, labels)`` restricted to the positions whose word
        satisfies ``str.isalpha()``. Tokens containing any non-letter
        character (digits, hyphens, quotes, attached punctuation) and empty
        strings are therefore removed along with their labels.
    """
    kept = [(pos, token) for pos, token in enumerate(row["words"]) if token.isalpha()]
    kept_words = [token for _, token in kept]
    kept_labels = [row["token_labels"][pos] for pos, _ in kept]
    return kept_words, kept_labels

# Apply the function row by row; result_type='expand' spreads the returned
# (words, labels) pair over two columns, which overwrite 'words' and 'token_labels'.
df_ru[['words', 'token_labels']] = df_ru.apply(filter_words_and_labels, axis=1, result_type='expand')

In [40]:
# Sanity check: every row must have exactly as many labels as words.
# The per-row result is stored in a temporary boolean column, reported, and
# the column is dropped again so the frame keeps its original columns.
df_ru['lengths_match'] = df_ru.apply(lambda row: len(row['words']) == len(row['token_labels']), axis=1)
print(f"All 'words' and 'token_labels' lengths match: {df_ru['lengths_match'].all()}")
df_ru.drop(columns=['lengths_match'], inplace=True, errors='ignore')

All 'words' and 'token_labels' lengths match: True


In [39]:
# Preview the first rows of df_ru.
df_ru.head()

Unnamed: 0,document_name,words,token_labels,met_score
0,ru_large-4,"[Израильское, лобби, в, США, сильно, при, этом...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",2.0
2,ru_large-18,"[Раскол, серьезный, но, при, этом, раскол, раз...","[0, 1, 0, 0, 0, 0, 0]",1.0
7,ru_large-48,"[В, дальнейшем, в, республике, будет, осуществ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.0
8,ru_large-8544,"[Называть, так, органы, исполнительной, власти...","[0, 0, 1, 0, 0, 0, 0, 0, 0]",1.0
10,ru_large-8558,"[а, дл, начала, сменить, воровскую, власть, на...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",2.0


# POS tagging

In [42]:
import spacy
# Load the large Russian pipeline downloaded at the top of the notebook;
# `nlp` is the shared pipeline object used by get_pos().
nlp = spacy.load("ru_core_news_lg")

In [43]:
def get_pos(tokens):
    """Return one spaCy part-of-speech tag (``token.pos_``) per input token.

    The tokens are wrapped in a pre-tokenized ``Doc`` and the loaded pipeline
    is run on that Doc. Joining the tokens into a string and letting spaCy
    tokenize it again could split or merge tokens, so the returned list would
    no longer be guaranteed to line up with `tokens`.

    Args:
        tokens: sequence of word strings (may be empty).

    Returns:
        List of POS tag strings with the same length and order as `tokens`.
    """
    from spacy.tokens import Doc  # local import keeps this cell self-contained

    # Build the Doc from the given words so spaCy's tokenizer is bypassed.
    doc = Doc(nlp.vocab, words=list(tokens))
    # Calling the pipeline on an existing Doc runs the components without re-tokenizing.
    doc = nlp(doc)
    return [token.pos_ for token in doc]


In [44]:
# Tag every row's word list; the new "pos" column holds one list of tags per row.
df_ru["pos"] = df_ru["words"].apply(get_pos)

In [45]:
# Collapse spaCy tags into four coarse classes. Verbs and auxiliaries share
# "verb"; common and proper nouns share "noun".
POS_MAP = dict(
    VERB="verb",
    AUX="verb",
    NOUN="noun",
    PROPN="noun",
    ADJ="adj",
    ADV="adv",
)
# Tags without an entry in POS_MAP are represented by NaN in the new column.
df_ru.loc[:, "pos_coarse"] = df_ru["pos"].apply(
    lambda tags: [POS_MAP[tag] if tag in POS_MAP else np.nan for tag in tags]
)

In [46]:
# Drop the original spaCy tag column; only the coarse classes are kept.
df_ru.drop(columns=["pos"], inplace=True)
# Rename "pos_coarse" to "pos" so the coarse classes take over the column name.
df_ru.rename(columns={"pos_coarse": "pos"}, inplace=True)

In [None]:
# Preview the first rows of df_ru including the "pos" column.
df_ru.head()

Unnamed: 0,document_name,words,token_labels,met_score,pos
0,ru_large-4,"[Израильское, лобби, в, США, сильно, при, этом...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",2.0,"[adj, noun, nan, noun, adv, nan, nan, noun, ad..."
2,ru_large-18,"[Раскол, серьезный, но, при, этом, раскол, раз...","[0, 1, 0, 0, 0, 0, 0]",1.0,"[noun, adj, nan, nan, nan, noun, adj]"
7,ru_large-48,"[В, дальнейшем, в, республике, будет, осуществ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.0,"[nan, adj, nan, noun, verb, verb, adj, noun, a..."
8,ru_large-8544,"[Называть, так, органы, исполнительной, власти...","[0, 0, 1, 0, 0, 0, 0, 0, 0]",1.0,"[verb, adv, noun, adj, noun, verb, nan, adj, n..."
10,ru_large-8558,"[а, дл, начала, сменить, воровскую, власть, на...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",2.0,"[nan, noun, verb, verb, adj, noun, nan, adv, n..."


# Save the final dataset

In [48]:
# Save VUAMC format: one JSON object per line (JSONL).
DATA_PROC.mkdir(parents=True, exist_ok=True)
OUT_PATH = DATA_PROC / "ru_large_token.jsonl"

with OUT_PATH.open("w", encoding="utf-8") as f:
    for _, row in df_ru.iterrows():
        rec = {
            "document_name": row["document_name"],
            "words": row["words"],
            "labels": row["token_labels"],
            # Unmapped tags are stored as NaN in the frame. JSON has no NaN
            # literal, so they are written as null instead of the
            # non-standard bare token `NaN`.
            "pos": [tag if isinstance(tag, str) else None for tag in row["pos"]],
            "met_score": row["met_score"]
        }
        # ensure_ascii=False writes Cyrillic text as UTF-8 instead of \uXXXX
        # escapes; allow_nan=False makes any remaining NaN/Infinity fail
        # loudly rather than produce invalid JSON.
        f.write(json.dumps(rec, ensure_ascii=False, allow_nan=False) + "\n")

print(f" Saved {len(df_ru)} → {OUT_PATH}")

 Saved 11878 → /content/drive/MyDrive/Metaphor-Classification-NLP/data/processed/ru_large_token.jsonl
