In [1]:
import pandas as pd

## Load the Italian-Dutch parallel corpus

In [2]:
columns = ['it_id', 'italian', 'nl_id', 'dutch']
df = pd.read_csv('Datasets/Tatoeba-it-nl.tsv', sep='\t', header=None)
df.columns = columns
df.head()

Unnamed: 0,it_id,italian,nl_id,dutch
0,4369,Devo andare a dormire.,5966,Ik moet gaan slapen.
1,4371,Che cos'è?,5970,Wat is dat?
2,4373,"La parola d'accesso è ""Muiriel"".",5985,"Het wachtwoord is ""Muiriel""."
3,4375,Non cambierà niente.,379556,Dat zal niets aan de zaak veranderen.
4,4376,Costerà trenta euro.,378907,"Dat zal € 30,- kosten."


## Define function to count the phenomena frequencies in the dataset

In [3]:
import stanza

# Load the Italian language model in stanza
stanza.download('it')
nlp = stanza.Pipeline(lang='it', processors='tokenize,mwt,pos,lemma')

# Helper functions for phenomena detection
def has_subject_omission(sentence):
    """Check if the first word of the sentence is a verb."""
    if not isinstance(sentence, str):
        return False
    doc = nlp(sentence)
    first_word_pos = doc.sentences[0].words[0].upos if doc.sentences and doc.sentences[0].words else None
    return first_word_pos == "VERB" or first_word_pos == "AUX"

def has_reflexive_construction(sentence):
    """Check if the sentence contains reflexive pronouns."""
    reflexive_pronouns = ["mi", "ti", "si", "ci", "vi"]
    return any(pronoun in sentence.split() for pronoun in reflexive_pronouns)

def has_double_negation(sentence):
    """Check if the sentence contains double negation."""
    negative_words = ["niente", "nessuno", "nulla"]
    return "non" in sentence.split() and any(word in sentence.split() for word in negative_words)

def has_diminutives_or_augmentatives(sentence):
    """Check if the sentence contains diminutives or augmentatives."""
    suffixes = ["ino", "etto", "accio", "one"]
    return any(word.endswith(suffix) for suffix in suffixes for word in sentence.split())

def has_clitic_pronouns(sentence):
    """Check if the sentence contains clitic pronouns."""
    clitic_pronouns = ["lo", "la", "li", "gli", "le", "ne", "ci", "mi", "ti",
    "glielo", "gliela", "glieli", "gliele"]
    return any(pronoun in sentence.split() for pronoun in clitic_pronouns)

def phenomena_counter(df):
  phenomena_counts = {
    "Rows": len(df),
    "Sentences": len(df['it_id'].unique()),
    "Subject Omission": df['subject_omission'].sum(),
    "Reflexive Construction": df['reflexive_construction'].sum(),
    "Double Negation": df['double_negation'].sum(),
    "Diminutives and Augmentatives": df['diminutives_augmentatives'].sum(),
    "Clitic Pronouns": df['clitic_pronouns'].sum(),
    "Other": len(df[~(df[['subject_omission', 'reflexive_construction', 'double_negation',
                          'diminutives_augmentatives', 'clitic_pronouns']].any(axis=1))])
  }

  return phenomena_counts

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-20 14:15:58 INFO: Downloaded file to C:\Users\fabia\stanza_resources\resources.json
2024-11-20 14:15:58 INFO: Downloading default packages for language: it (Italian) ...
2024-11-20 14:15:59 INFO: File exists: C:\Users\fabia\stanza_resources\it\default.zip
2024-11-20 14:16:04 INFO: Finished downloading models and saved to C:\Users\fabia\stanza_resources
2024-11-20 14:16:04 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-20 14:16:04 INFO: Downloaded file to C:\Users\fabia\stanza_resources\resources.json
2024-11-20 14:16:05 INFO: Loading these models for language: it (Italian):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-11-20 14:16:05 INFO: Using device: cpu
2024-11-20 14:16:05 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-20 14:16:07 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-20 14:16:07 INFO: Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
2024-11-20 14:16:07 INFO: Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-20 14:16:07 INFO: Done 

In [4]:
from tqdm import tqdm

# Initialize tqdm progress bar
tqdm.pandas()

# Update functions to use tqdm for progress tracking
df['subject_omission'] = df['italian'].progress_apply(has_subject_omission)
df['reflexive_construction'] = df['italian'].progress_apply(has_reflexive_construction)
df['double_negation'] = df['italian'].progress_apply(has_double_negation)
df['diminutives_augmentatives'] = df['italian'].progress_apply(has_diminutives_or_augmentatives)
df['clitic_pronouns'] = df['italian'].progress_apply(has_clitic_pronouns)

100%|██████████| 17525/17525 [18:53<00:00, 15.46it/s]
100%|██████████| 17525/17525 [00:00<00:00, 282496.00it/s]
100%|██████████| 17525/17525 [00:00<00:00, 532653.93it/s]
100%|██████████| 17525/17525 [00:00<00:00, 155731.31it/s]
100%|██████████| 17525/17525 [00:00<00:00, 143178.33it/s]


In [5]:
phenomena_counter(df)

{'Rows': 17525,
 'Sentences': 16683,
 'Subject Omission': np.int64(5223),
 'Reflexive Construction': np.int64(852),
 'Double Negation': np.int64(37),
 'Diminutives and Augmentatives': np.int64(623),
 'Clitic Pronouns': np.int64(2412),
 'Other': 9770}

## Split the dataset into single and multiple translations

In [6]:
multiple_translation_df = df[df['nl_id'].isin(df['nl_id'].value_counts()[lambda x: x > 1].index)]
phenomena_counter(multiple_translation_df)

{'Rows': 9636,
 'Sentences': 9191,
 'Subject Omission': np.int64(3260),
 'Reflexive Construction': np.int64(361),
 'Double Negation': np.int64(27),
 'Diminutives and Augmentatives': np.int64(257),
 'Clitic Pronouns': np.int64(1216),
 'Other': 5211}

In [7]:
df = df[~df.index.isin(multiple_translation_df.index)]
phenomena_counter(df)

{'Rows': 7889,
 'Sentences': 7585,
 'Subject Omission': np.int64(1963),
 'Reflexive Construction': np.int64(491),
 'Double Negation': np.int64(10),
 'Diminutives and Augmentatives': np.int64(366),
 'Clitic Pronouns': np.int64(1196),
 'Other': 4559}

## Create a curated dataset with a balanced distribution of phenomena

In [8]:
def sample_and_remove(source_df, target_df, column_name, n):
    if len(source_df[source_df[column_name]]) < n:
        n = len(source_df[source_df[column_name]])
    sample = source_df[source_df[column_name]].sample(n=n, replace=False)
    source_df = source_df.drop(sample.index)
    target_df = pd.concat([target_df, sample], ignore_index=True)
    return source_df, target_df, n

curated_df = pd.DataFrame(columns=df.columns)

phenomena = [
    'subject_omission',
    'reflexive_construction',
    'double_negation',
    'diminutives_augmentatives',
    'clitic_pronouns'
]

for phenomenon in phenomena:
    multiple_translation_df, curated_df, n = sample_and_remove(multiple_translation_df, curated_df, phenomenon, 60)
    df, curated_df, m = sample_and_remove(df, curated_df, phenomenon, 120 - n)
    if m + n != 120:
      print(f"Missing {120 - m - n} sentences for {phenomenon}\n")

Missing 84 sentences for double_negation



In [9]:
phenomena_counter(curated_df)

{'Rows': 516,
 'Sentences': 511,
 'Subject Omission': 191,
 'Reflexive Construction': 149,
 'Double Negation': 37,
 'Diminutives and Augmentatives': 136,
 'Clitic Pronouns': 237,
 'Other': 0}

## Add random sentences for variety

In [10]:
curated_df = pd.concat([curated_df, df.sample(n=400, replace=False)], ignore_index=True)
phenomena_counter(curated_df)

{'Rows': 916,
 'Sentences': 911,
 'Subject Omission': 291,
 'Reflexive Construction': 170,
 'Double Negation': 37,
 'Diminutives and Augmentatives': 150,
 'Clitic Pronouns': 287,
 'Other': 245}

## Save the curated dataset

In [14]:
curated_df.drop(columns=[
    'subject_omission',
    'reflexive_construction',
    'double_negation',
    'diminutives_augmentatives',
    'clitic_pronouns'
], inplace=True, errors='ignore')
curated_df.to_csv('Datasets/Curated-IT-NL', sep='\t', index=False)