In [11]:
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""IWSLT 2017 dataset """


import os

import datasets


_HOMEPAGE = "https://sites.google.com/site/iwsltevaluation2017/TED-tasks"

_DESCRIPTION = """\
The IWSLT 2017 Multilingual Task addresses text translation, including zero-shot translation, with a single MT system across all directions including English, German, Dutch, Italian and Romanian. As unofficial task, conventional bilingual text translation is offered between English and Arabic, French, Japanese, Chinese, German and Korean.
"""

_CITATION = """\
@inproceedings{cettolo-etal-2017-overview,
    title = "Overview of the {IWSLT} 2017 Evaluation Campaign",
    author = {Cettolo, Mauro  and
      Federico, Marcello  and
      Bentivogli, Luisa  and
      Niehues, Jan  and
      St{\\"u}ker, Sebastian  and
      Sudoh, Katsuhito  and
      Yoshino, Koichiro  and
      Federmann, Christian},
    booktitle = "Proceedings of the 14th International Conference on Spoken Language Translation",
    month = dec # " 14-15",
    year = "2017",
    address = "Tokyo, Japan",
    publisher = "International Workshop on Spoken Language Translation",
    url = "https://aclanthology.org/2017.iwslt-1.1",
    pages = "2--14",
}
"""

REPO_URL = "https://huggingface.co/datasets/iwslt2017/resolve/main/"
MULTI_URL = REPO_URL + "data/2017-01-trnmted/texts/DeEnItNlRo/DeEnItNlRo/DeEnItNlRo-DeEnItNlRo.zip"
BI_URL = REPO_URL + "data/2017-01-trnted/texts/{source}/{target}/{source}-{target}.zip"


class IWSLT2017Config(datasets.BuilderConfig):
    """BuilderConfig for NewDataset"""

    def __init__(self, pair, is_multilingual, **kwargs):
        """

        Args:
            pair: the language pair to consider
            is_multilingual: Is this pair in the multilingual dataset (download source is different)
            **kwargs: keyword arguments forwarded to super.
        """
        self.pair = pair
        self.is_multilingual = is_multilingual
        super().__init__(**kwargs)


# XXX: Artificially removed DE from here, as it also exists within bilingual data
MULTI_LANGUAGES = ["en", "it", "nl", "ro"]
BI_LANGUAGES = ["ar", "de", "en", "fr", "ja", "ko", "zh"]
MULTI_PAIRS = [f"{source}-{target}" for source in MULTI_LANGUAGES for target in MULTI_LANGUAGES if source != target]
BI_PAIRS = [
    f"{source}-{target}"
    for source in BI_LANGUAGES
    for target in BI_LANGUAGES
    if source != target and (source == "en" or target == "en")
]

PAIRS = MULTI_PAIRS + BI_PAIRS


class IWSLT217(datasets.GeneratorBasedBuilder):
    """The IWSLT 2017 Evaluation Campaign includes a multilingual TED Talks MT task."""

    VERSION = datasets.Version("1.0.0")

    # This is an example of a dataset with multiple configurations.
    # If you don't want/need to define several sub-sets in your dataset,
    # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes.
    BUILDER_CONFIG_CLASS = IWSLT2017Config
    BUILDER_CONFIGS = [
        IWSLT2017Config(
            name="iwslt2017-" + pair,
            description="A small dataset",
            version=datasets.Version("1.0.0"),
            pair=pair,
            is_multilingual=pair in MULTI_PAIRS,
        )
        for pair in PAIRS
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {"translation": datasets.features.Translation(languages=self.config.pair.split("-"))}
            ),
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        source, target = self.config.pair.split("-")
        if self.config.is_multilingual:
            dl_dir = dl_manager.download_and_extract(MULTI_URL)
            data_dir = os.path.join(dl_dir, "DeEnItNlRo-DeEnItNlRo")
            years = [2010]
        else:
            bi_url = BI_URL.format(source=source, target=target)
            dl_dir = dl_manager.download_and_extract(bi_url)
            data_dir = os.path.join(dl_dir, f"{source}-{target}")
            years = [2010, 2011, 2012, 2013, 2014, 2015]
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "source_files": [
                        os.path.join(
                            data_dir,
                            f"train.tags.{self.config.pair}.{source}",
                        )
                    ],
                    "target_files": [
                        os.path.join(
                            data_dir,
                            f"train.tags.{self.config.pair}.{target}",
                        )
                    ],
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "source_files": [
                        os.path.join(
                            data_dir,
                            f"IWSLT17.TED.tst{year}.{self.config.pair}.{source}.xml",
                        )
                        for year in years
                    ],
                    "target_files": [
                        os.path.join(
                            data_dir,
                            f"IWSLT17.TED.tst{year}.{self.config.pair}.{target}.xml",
                        )
                        for year in years
                    ],
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "source_files": [
                        os.path.join(
                            data_dir,
                            f"IWSLT17.TED.dev2010.{self.config.pair}.{source}.xml",
                        )
                    ],
                    "target_files": [
                        os.path.join(
                            data_dir,
                            f"IWSLT17.TED.dev2010.{self.config.pair}.{target}.xml",
                        )
                    ],
                },
            ),
        ]

    def _generate_examples(self, source_files, target_files):
        """Yields examples."""
        id_ = 0
        source, target = self.config.pair.split("-")
        for source_file, target_file in zip(source_files, target_files):
            with open(source_file, "r", encoding="utf-8") as sf:
                with open(target_file, "r", encoding="utf-8") as tf:
                    for source_row, target_row in zip(sf, tf):
                        source_row = source_row.strip()
                        target_row = target_row.strip()

                        if source_row.startswith("<"):
                            if source_row.startswith("<seg"):
                                # Remove <seg id="1">.....</seg>
                                # Very simple code instead of regex or xml parsing
                                part1 = source_row.split(">")[1]
                                source_row = part1.split("<")[0]
                                part1 = target_row.split(">")[1]
                                target_row = part1.split("<")[0]

                                source_row = source_row.strip()
                                target_row = target_row.strip()
                            else:
                                continue

                        yield id_, {"translation": {source: source_row, target: target_row}}
                        id_ += 1

ModuleNotFoundError: No module named 'datasets'

In [8]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("IWSLT/iwslt2017", "iwslt2017-en-zh", trust_remote_code=True)

num_examples = 100
for i in range(num_examples):
    example = dataset["train"][i]
    print(f"English: {example['translation']['en']}")
    print(f"Chinese: {example['translation']['zh']}")
    print("-" * 50)

csv_filename = "en-zh.csv"
df_train = pd.DataFrame(dataset["train"]["translation"])
df_train.to_csv(csv_filename, index=False, encoding="utf-8")

print(f"Saved dataset to {csv_filename}")

ModuleNotFoundError: No module named 'datasets'

In [9]:
import pandas as pd

# open txt file in ./WiC_dataset/train/train.data.txt
with open("./WiC_dataset/train/train.data.txt", "r") as f:
    content = f.read()

with open("./WiC_dataset/train/train.gold.txt", "r") as f:
    labels = [line.strip() for line in f.readlines()]

data = [line.split("\t") for line in content.split("\n") if line]

df = pd.DataFrame(data, columns=["word", "pos", "index", "sentence1", "sentence2"])

print(len(df), len(labels))
df = df.rename(columns=lambda x: x.strip()) 
df["label"] = labels

print(df.head())



5428 5428
      word pos index                                       sentence1  \
0    carry   V   2-1              You must carry your camping gear .   
1       go   V   2-6  Messages must go through diplomatic channels .   
2    break   V   0-2                                Break an alibi .   
3      cup   N   8-4         He wore a jock strap with a metal cup .   
4  academy   N   1-2                          The Academy of Music .   

                                           sentence2 label  
0                    Sound carries well over water .     F  
1   Do you think the sofa will go through the door ?     F  
2  The wholesaler broke the container loads into ...     F  
3            Bees filled the waxen cups with honey .     T  
4                               The French Academy .     F  


In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import spacy

# Load the data (you've already done this)
# ... existing code ...

# Convert labels from 'T'/'F' to 1/0
df['label'] = df['label'].map({'T': 1, 'F': 0})

# Feature engineering
# Let's create features based on the context of the target word

# Load spaCy model for NLP processing
nlp = spacy.load('en_core_web_sm')

def extract_features(row):
    word = row['word']
    pos = row['pos']
    
    # Parse the indices
    indices = row['index'].split('-')
    idx1 = int(indices[0])
    idx2 = int(indices[1])
    
    # Tokenize sentences (simple whitespace tokenization to match dataset)
    tokens1 = row['sentence1'].split()
    tokens2 = row['sentence2'].split()
    
    # Verify the target word is at the specified position
    target1 = tokens1[idx1].lower()
    target2 = tokens2[idx2].lower()
    
    # Get context (3 words before and after, respecting sentence boundaries)
    start1 = max(0, idx1 - 3)
    end1 = min(len(tokens1), idx1 + 4)
    context1 = ' '.join(tokens1[start1:idx1] + tokens1[idx1+1:end1])
    
    start2 = max(0, idx2 - 3)
    end2 = min(len(tokens2), idx2 + 4)
    context2 = ' '.join(tokens2[start2:idx2] + tokens2[idx2+1:end2])
    
    # Combine features
    features = {
        'word': word,
        'pos': pos,
        'target1': target1,
        'target2': target2,
        'context1': context1,
        'context2': context2,
        'context_combined': context1 + ' ' + context2
    }
    
    return features

# Apply feature extraction
features_list = df.apply(extract_features, axis=1).tolist()
features_df = pd.DataFrame(features_list)

# Vectorize the context features
vectorizer = TfidfVectorizer(max_features=1000)
context_vectors = vectorizer.fit_transform(features_df['context_combined'])

# Add word and POS as one-hot encoded features
word_vectorizer = CountVectorizer()
word_vectors = word_vectorizer.fit_transform(features_df['word'])

pos_vectorizer = CountVectorizer()
pos_vectors = pos_vectorizer.fit_transform(features_df['pos'])

# Combine all features
from scipy.sparse import hstack
X = hstack([context_vectors, word_vectors, pos_vectors])
y = df['label'].values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

# Example of making predictions on new data
def predict_word_sense(word, pos, sentence1, sentence2, idx1, idx2):
    row = pd.Series({
        'word': word,
        'pos': pos,
        'index': f"{idx1}-{idx2}",
        'sentence1': sentence1,
        'sentence2': sentence2
    })
    
    features = extract_features(row)
    features_df = pd.DataFrame([features])
    
    context_vec = vectorizer.transform([features['context_combined']])
    word_vec = word_vectorizer.transform([features['word']])
    pos_vec = pos_vectorizer.transform([features['pos']])
    
    X_new = hstack([context_vec, word_vec, pos_vec])
    
    prediction = clf.predict(X_new)[0]
    probability = clf.predict_proba(X_new)[0][1]
    
    return {
        'same_sense': bool(prediction),
        'confidence': probability
    }

# Example usage
result = predict_word_sense('bank', 'NOUN', 'I went to the bank to deposit money.', 
                           'The river bank was muddy after the rain.', 5, 2)
print(result)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [12]:
import csv

input_file = 'en-zh.csv'
english_output = 'src-train.txt'
chinese_output = 'tgt-train.txt'

with open(input_file, 'r', encoding='utf-8') as csv_file, \
     open(english_output, 'w', encoding='utf-8') as eng_file, \
     open(chinese_output, 'w', encoding='utf-8') as zh_file:
    
    csv_reader = csv.reader(csv_file)
    
    for row in csv_reader:
        if len(row) >= 2:
            english_text = row[0].strip('"')
            chinese_text = row[1].strip('"')
            
            # Write to respective files
            eng_file.write(english_text + '\n')
            zh_file.write(chinese_text + '\n')

print(f"Successfully split {input_file} into {english_output} and {chinese_output}")

Successfully split en-zh.csv into src-train.txt and tgt-train.txt
