<a href="https://colab.research.google.com/github/Dona134/Metaphor-Classification-NLP/blob/main/ru_large_token.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the project paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import json
import re
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Dict, List
from xml.sax.saxutils import unescape

import numpy as np
import pandas as pd

# Project layout on the mounted Drive: raw XML input and the folder for processed output.
PROJECT_ROOT = Path("/content/drive/MyDrive/Metaphor-Classification-NLP")
XML_PATH = PROJECT_ROOT / "data" / "raw" / "ru_large.xml"
DATA_PROC = PROJECT_ROOT / "data" / "processed"

# Parse the whole XML file into memory; `root` is the element whose
# <LmInstance> children are read in the next cell.
print(f"Parsing {XML_PATH}")
tree = ET.parse(XML_PATH)
root = tree.getroot()

print("✅ Parsed XML tree")


Parsing /content/drive/MyDrive/Metaphor-Classification-NLP/data/raw/ru_large.xml
✅ Parsed XML tree


In [13]:
def parse_instance(elem):
    """Flatten one instance element into a plain record dict.

    Args:
        elem: an ElementTree element carrying ``id`` / ``docid`` attributes,
            optionally a ``<Current>`` descendant and any number of
            ``<MetaphoricityAnnotation score="...">`` descendants.

    Returns:
        dict with keys ``id``, ``docid``, ``current_raw`` (the serialised
        ``<Current>`` element, inline tags included, or ``''`` when the
        element is absent), ``current`` (empty placeholder, filled in later)
        and ``met_score`` (mean of the annotation scores, ``0.0`` when there
        are no annotations; an annotation without a ``score`` counts as 0).
    """
    # Serialise the element rather than reading .text so the nested
    # <LmSource>/<LmTarget> markup survives for later extraction.
    current_node = elem.find('.//Current')
    if current_node is None:
        raw_markup = ''
    else:
        raw_markup = ET.tostring(current_node, encoding='unicode')

    # Collect every annotator score as a float.
    annotation_scores = []
    for annotation in elem.findall('.//MetaphoricityAnnotation'):
        annotation_scores.append(float(annotation.get('score', 0)))

    if annotation_scores:
        mean_score = np.mean(annotation_scores)
    else:
        mean_score = 0.0

    return {
        'id': elem.get('id'),
        'docid': elem.get('docid'),
        'current_raw': raw_markup,  # tags kept for source extraction
        'current': '',  # cleaned text is filled in by a later step
        'met_score': mean_score,
    }

# Build one record per direct <LmInstance> child of the root and collect them in a DataFrame.
instances = [parse_instance(inst) for inst in root.findall('LmInstance')]
df_ru_raw = pd.DataFrame(instances)
print(f"✅ {len(df_ru_raw)} instances")
# Show the first 200 characters of the first raw (still tagged) sentence as a sanity check.
print("Sample raw:")
print(df_ru_raw['current_raw'].iloc[0][:200] + "...")


✅ 64019 instances
Sample raw:
<Current>Израильское <LmTarget>лобби</LmTarget> в США <LmSource>сильно</LmSource>, при этом Иран действительно напрямую угрожает еврейскому государству.</Current>
      ...


In [16]:
def clean_current(raw):
    """Turn serialised XML markup into plain text.

    Tags are removed first, then the XML entities that serialisation
    introduced (``&amp;``, ``&lt;``, ``&gt;``) are decoded, so text such as
    ``S&amp;P`` comes out as ``S&P``. Decoding happens after tag removal so a
    literal ``<`` in the text is never mistaken for a tag.

    Args:
        raw: markup string, e.g. a serialised ``<Current>`` element.

    Returns:
        The text content with surrounding whitespace stripped.
    """
    without_tags = re.sub(r'<[^>]+>', '', raw)
    return unescape(without_tags).strip()

def get_source_words(raw):
    """Extract the whitespace-separated words inside ``<LmSource>`` spans.

    Every ``<LmSource>...</LmSource>`` span in ``raw`` is used (not only the
    first one), and each span is cleaned exactly like the full sentence so the
    resulting words are comparable with the sentence tokens.

    Args:
        raw: markup string that may contain ``<LmSource>`` spans.

    Returns:
        List of words in document order; empty when there is no span.
    """
    source_words = []
    for span in re.findall(r'<LmSource>(.*?)</LmSource>', raw, re.DOTALL):
        source_words.extend(clean_current(span).split())
    return source_words

def assign_token_labels(words, source_words, instance_label):
    """Give the instance label to tokens that match a source word, 0 elsewhere.

    A token matches when it equals a source word exactly, or when both are
    equal after punctuation is stripped from their edges. The second rule
    keeps a source word from being missed just because the sentence token
    still carries a quote, bracket, ``!``, ``:`` and the like (or because the
    annotated span itself included punctuation).

    Matching is by token text only, so every occurrence of a source word in
    the sentence receives the label, not just the annotated position.

    Args:
        words: sentence tokens.
        source_words: words taken from the annotated source span(s).
        instance_label: label to copy onto matching tokens.

    Returns:
        List with one label per token, same length as ``words``.
    """
    # Punctuation that can cling to a whitespace-delimited token.
    edge_punct = '.,!?:;"\'()[]{}«»„“”…—–-'

    # Normalised source forms; the empty string is dropped so that
    # punctuation-only tokens never match through this path.
    normalised_sources = {
        s.strip(edge_punct) for s in source_words if isinstance(s, str)
    }
    normalised_sources.discard('')

    def is_source(token):
        if token in source_words:
            return True
        return isinstance(token, str) and token.strip(edge_punct) in normalised_sources

    return [instance_label if is_source(w) else 0 for w in words]

# Fix 1: strip trailing commas/periods from tokens
def clean_source_words(words):
    """Strip trailing ``,`` and ``.`` characters from each string token.

    A token made up only of those characters (``"."``, ``"..."``, ``","``) is
    returned unchanged rather than being reduced to an empty string, so the
    output never gains empty tokens. Non-string items pass through untouched.
    The result always has the same length as the input.

    Args:
        words: iterable of tokens.

    Returns:
        New list of cleaned tokens.
    """
    cleaned = []
    for w in words:
        if isinstance(w, str):
            # `or w` keeps the original token when stripping would empty it.
            cleaned.append(w.rstrip(',.') or w)
        else:
            cleaned.append(w)
    return cleaned


# Work on a copy so the raw frame stays intact and this cell can be re-run.
df_ru = df_ru_raw.copy()
df_ru['current'] = df_ru['current_raw'].apply(clean_current)
df_ru['words'] = df_ru['current'].str.split()
df_ru['source_words'] = df_ru['current_raw'].apply(get_source_words)

# Normalise BOTH token lists the same way. Cleaning only `words` would leave
# a source span that ends in ',' or '.' unable to match its own token.
df_ru['words'] = df_ru['words'].apply(clean_source_words)
df_ru['source_words'] = df_ru['source_words'].apply(clean_source_words)


# Instance label first (met_score >= 0.5)
df_ru['label'] = (df_ru['met_score'] >= 0.5).astype(int)

# Token labels: copy instance label to source tokens
df_ru['token_labels'] = df_ru.apply(
    lambda r: assign_token_labels(r['words'], r['source_words'], r['label']),
    axis=1
)

df_ru['document_name'] = 'ru_large-' + df_ru['docid'].astype(str)

print(df_ru[['id', 'label', 'current', 'source_words', 'token_labels']].head())




    id  label                                            current  \
0   60      1  Израильское лобби в США сильно, при этом Иран ...   
1  296      0  Неофициально один из украинских чиновников поя...   
2  333      1       Раскол серьезный, но при этом раскол разный.   
3  340      0  Мы очень часто и много говорим об американском...   
4  451      0  «Никаких революций или горбачевских приступов ...   

     source_words                                       token_labels  
0        [сильно]            [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]  
1  [Неофициально]  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2     [серьезный]                              [0, 1, 0, 0, 0, 0, 0]  
3       [тянутся]  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4         [будет]                           [0, 0, 0, 0, 0, 0, 0, 0]  


Link to the dataset source: <https://github.com/lcc-api/metaphor/tree/main>

In [18]:
# Keep only the columns that are exported: document_name, words, token_labels, met_score.
df_ru = df_ru[['document_name', 'words', 'token_labels', 'met_score']]

In [19]:
# Preview the first rows of the trimmed frame.
df_ru.head()

Unnamed: 0,document_name,words,token_labels,met_score
0,ru_large-4,"[Израильское, лобби, в, США, сильно, при, этом...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",2.0
1,ru_large-16,"[Неофициально, один, из, украинских, чиновнико...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-1.0
2,ru_large-18,"[Раскол, серьезный, но, при, этом, раскол, раз...","[0, 1, 0, 0, 0, 0, 0]",1.0
3,ru_large-18,"[Мы, очень, часто, и, много, говорим, об, амер...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0
4,ru_large-27,"[«Никаких, революций, или, горбачевских, прист...","[0, 0, 0, 0, 0, 0, 0, 0]",0.0


In [20]:
# Save VUAMC format: one JSON object per line (JSONL).
DATA_PROC.mkdir(parents=True, exist_ok=True)
OUT_PATH = DATA_PROC / "ru_large_token.jsonl"

with OUT_PATH.open("w", encoding="utf-8") as f:
    for _, row in df_ru.iterrows():
        rec = {
            "document_name": row["document_name"],
            "words": row["words"],
            "labels": row["token_labels"],
            "met_score": row["met_score"]
        }
        # ensure_ascii=False writes Cyrillic as real UTF-8 characters instead
        # of \uXXXX escapes: the file is smaller and human-readable, and
        # json.loads returns exactly the same data.
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print(f"✅ Saved {len(df_ru)} → {OUT_PATH}")

✅ Saved 64019 → /content/drive/MyDrive/Metaphor-Classification-NLP/data/processed/ru_large_token.jsonl
