In [None]:
# Mount Google Drive at /content/drive (Colab only); the project paths
# configured further down in this cell all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import pandas as pd
import numpy as np
import json

# Project layout on the mounted Drive: raw inputs and processed outputs
# sit in sibling folders under <project>/data.
PROJECT_ROOT = Path("/content/drive/MyDrive/Metaphor-Classification-NLP")
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")
DATA_PROC = PROJECT_ROOT.joinpath("data", "processed")

# Input parquet file and the token-level JSONL this notebook writes.
RAW_VUAMC = DATA_RAW.joinpath("vuamc.parquet")
OUT_JSONL = DATA_PROC.joinpath("vuamc_token.jsonl")

# Make sure the output folder exists before anything is written to it;
# exist_ok=True keeps this cell safe to re-run.
DATA_PROC.mkdir(parents=True, exist_ok=True)
print(f"Saving directly to Drive: {OUT_JSONL.absolute()}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saving directly to Drive: /content/drive/MyDrive/Metaphor-Classification-NLP/data/processed/vuamc_token.jsonl


In [None]:
# Load the raw corpus from the parquet file configured as RAW_VUAMC.
raw_df = pd.read_parquet(RAW_VUAMC)
# Preview the first rows to check which columns were loaded.
raw_df.head()


Unnamed: 0,document_name,words,pos_tags,met_type,meta
0,a1e-fragment01,"[Latest, corporate, unbundler, reveals, laid-b...","[AJS, AJ0, NN1, VVZ, AJ0, NN1, PUN, NP0, NP0, ...","[{'type': 'mrw/met', 'word_indices': [3]}, {'t...","[N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, ..."
1,a1e-fragment01,"[By, FRANK, KANE]","[PRP, NP0, NP0-NN1]",[],"[N/A, N/A, N/A]"
2,a1e-fragment01,"[IT, SEEMS, that, Roland, Franklin, ,, the, la...","[PNP, VVZ, CJT, NP0, NP0, PUN, AT0, AJS, NN1, ...","[{'type': 'mrw/met', 'word_indices': [16]}, {'...","[N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, ..."
3,a1e-fragment01,"[He, has, not, properly, investigated, the, ta...","[PNP, VHZ, XX0, AV0, VVN, AT0, NN1, POS, NN1, ...","[{'type': 'mrw/met', 'word_indices': [6]}]","[N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, ..."
4,a1e-fragment01,"[The, 63-year-old, head, of, Pembridge, Invest...","[AT0, AJ0, NN1, PRF, NP0, NN2, PUN, PRP, DTQ, ...","[{'type': 'mrw/met', 'word_indices': [2]}, {'t...","[N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, ..."


In [None]:
# Drop the 'meta' column, which nothing else in this notebook reads.
# errors='ignore' keeps the cell idempotent: because raw_df is reassigned in
# place, a second run would otherwise raise KeyError ('meta' is already gone).
raw_df = raw_df.drop(columns='meta', errors='ignore')

## Filtering for metaphor types

In [None]:
# Show how often each annotation type occurs across every entry of the
# met_type column (one count per annotation, not per sentence).
met_type = [
    annotation['type']
    for annotations in raw_df['met_type']
    for annotation in annotations
]
met_type_counts = pd.Series(met_type).value_counts()
print(met_type_counts)

mrw/met                21887
mrw/met/WIDLII          1825
mrw/met/PP              1099
mrw/lit                  337
mrw/met/double           211
mFlag/lex                102
mFlag/phrase              26
mrw/impl                  25
mrw/bridge                22
mrw/met/OMM               17
mrw/lit/WIDLII            16
mFlag/morph               12
mrw/met/M                  7
mFlag/lex/WIDLII           5
mrw/met/ANIM               2
mrw/impl/WIDLII            1
mrw/met/UNKNOWN            1
mFlag/phrase/WIDLII        1
Name: count, dtype: int64


Full dataset description can be found here: http://www.vismet.org/metcor/manual/index.php

In [None]:
# Annotation types this notebook keeps; every other type is ignored.
target_types = {'mrw/met', 'mrw/lit', 'mrw/met/double', 'mrw/met/PP'}

# Boolean mask: True for rows that carry at least one annotation whose type
# is in target_types (rows with no annotations evaluate to False).
has_target_type = raw_df["met_type"].apply(
    lambda items: not target_types.isdisjoint(item["type"] for item in items)
)

df_filtered = raw_df[has_target_type]

In [None]:
# Raw annotation type -> readable metaphor category used in the output.
met_map = dict([
    ('mrw/met', 'Indirect'),
    ('mrw/lit', 'Direct'),
    ('mrw/met/double', 'Double'),
    ('mrw/met/PP', 'Personification'),
])

# Set view of the mapped annotation types, for O(1) membership tests.
metaphorical_types = set(met_map)

def simple_pos_map(pos_tag):
    """Collapse a detailed POS tag into a coarse category.

    The tag is matched by prefix: "V" -> "verb", "N" -> "noun",
    "AJ" -> "adj", "AV" -> "adv". Any tag matching none of these prefixes
    yields the string "nan".
    """
    # Checked in order; the first matching prefix wins.
    prefix_categories = (
        ("V", "verb"),
        ("N", "noun"),
        ("AJ", "adj"),
        ("AV", "adv"),
    )
    for prefix, category in prefix_categories:
        if pos_tag.startswith(prefix):
            return category
    return "nan"

def process_row(row_data):
    """Build per-token labels, metaphor types and simplified POS tags for one row.

    Parameters
    ----------
    row_data : mapping-like (e.g. a DataFrame row)
        Must provide ``"pos_tags"`` (sequence of POS tag strings, one per
        token) and ``"met_type"`` (iterable of dicts with a ``"type"`` string
        and a ``"word_indices"`` list of token positions).

    Returns
    -------
    pd.Series
        Three positional entries, each a list with one item per token:
        binary labels (1 where a mapped metaphor annotation covers the token),
        metaphor type names (``"literal"`` where none applies), and
        simplified POS categories from ``simple_pos_map``.
    """
    pos_tags = row_data["pos_tags"]
    num_tokens = len(pos_tags)

    # Every token starts out as literal / non-metaphorical.
    token_metaphor_types = ["literal"] * num_tokens
    token_labels = [0] * num_tokens

    for annotation in row_data["met_type"]:
        met_type_category = annotation["type"]
        mapped_met_type = met_map.get(met_type_category)

        # Skip annotation types that are not in met_map. Writing "literal" for
        # them (as a plain default would) could overwrite a metaphor type that
        # another annotation already assigned to the same token while its
        # label stays 1, leaving label and type inconsistent.
        if mapped_met_type is None:
            continue

        is_truly_metaphorical = met_type_category in metaphorical_types

        for idx in annotation["word_indices"]:
            # Ignore out-of-range indices so malformed annotations cannot
            # raise IndexError (or silently wrap around via negative indices).
            if 0 <= idx < num_tokens:
                token_metaphor_types[idx] = mapped_met_type
                if is_truly_metaphorical:
                    token_labels[idx] = 1

    simplified_pos_tags = [simple_pos_map(tag) for tag in pos_tags]

    return pd.Series([token_labels, token_metaphor_types, simplified_pos_tags])

# Run process_row over every filtered sentence; the result has one column per
# entry of the Series that process_row returns (labels, metaphor type, POS).
token_features = df_filtered.apply(process_row, axis=1)

# Work on a copy so df_filtered itself is left untouched.
df_clean = df_filtered.copy()
df_clean[["labels", "metaphor_type", "pos"]] = token_features

# The raw annotation and POS columns are now redundant with the derived ones.
df_clean = df_clean.drop(columns=['met_type', 'pos_tags'])

In [None]:
# Preview the processed frame: the labels, metaphor_type and pos columns
# stand in for the dropped met_type / pos_tags columns.
df_clean.head()

Unnamed: 0,document_name,words,labels,metaphor_type,pos
0,a1e-fragment01,"[Latest, corporate, unbundler, reveals, laid-b...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[literal, literal, literal, Indirect, literal,...","[adj, adj, noun, verb, adj, noun, nan, noun, n..."
2,a1e-fragment01,"[IT, SEEMS, that, Roland, Franklin, ,, the, la...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[literal, literal, literal, literal, literal, ...","[nan, verb, nan, noun, noun, nan, nan, adj, no..."
3,a1e-fragment01,"[He, has, not, properly, investigated, the, ta...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[literal, literal, literal, literal, literal, ...","[nan, verb, nan, adv, verb, nan, noun, nan, no..."
4,a1e-fragment01,"[The, 63-year-old, head, of, Pembridge, Invest...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...","[literal, literal, Indirect, literal, literal,...","[nan, adj, noun, nan, noun, noun, nan, nan, na..."
5,a1e-fragment01,"[If, he, had, taken, his, own, rule, seriously...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[literal, literal, literal, Indirect, literal,...","[nan, nan, verb, verb, nan, nan, noun, adv, na..."


In [None]:
# Token count of every sentence in the filtered set, computed once and
# reused for all three summary statistics below.
sentence_lengths = df_filtered['words'].apply(len)

print(f"Average sentence length: {sentence_lengths.mean():.2f} tokens")
print(f"Max sentence length: {sentence_lengths.max()} tokens")
print(f"Min sentence length: {sentence_lengths.min()} tokens")

Average sentence length: 21.57 tokens
Max sentence length: 127 tokens
Min sentence length: 1 tokens


In [None]:
# Write one JSON object per sentence (JSON Lines) to OUT_JSONL.
export_columns = ["document_name", "words", "labels", "metaphor_type", "pos"]

with OUT_JSONL.open("w", encoding="utf-8") as out_file:
    rows = df_clean[export_columns].itertuples(index=False, name=None)
    for document_name, words, labels, metaphor_type, pos in rows:
        record = {
            "document_name": document_name,
            # words is a numpy array, which json cannot serialise directly
            "words": words.tolist(),
            "labels": labels,
            "metaphor_type": metaphor_type,
            "pos": pos,
        }
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        out_file.write(json.dumps(record, ensure_ascii=False) + "\n")

print("Saved processed VUAMC to", OUT_JSONL)

Saved processed VUAMC to /content/drive/MyDrive/Metaphor-Classification-NLP/data/processed/vuamc_token.jsonl
