In [23]:
import pandas as pd

## Load the Italian-Dutch parallel corpus

In [24]:
columns = ['it_id', 'italian', 'nl_id', 'dutch']
df = pd.read_csv('Datasets/Tatoeba-it-nl.tsv', sep='\t', header=None)
df.columns = columns
df.head()

Unnamed: 0,it_id,italian,nl_id,dutch
0,4369,Devo andare a dormire.,5966,Ik moet gaan slapen.
1,4371,Che cos'è?,5970,Wat is dat?
2,4373,"La parola d'accesso è ""Muiriel"".",5985,"Het wachtwoord is ""Muiriel""."
3,4375,Non cambierà niente.,379556,Dat zal niets aan de zaak veranderen.
4,4376,Costerà trenta euro.,378907,"Dat zal € 30,- kosten."


## Define function to count the phenomena frequencies in the dataset

In [25]:
import stanza

# Load the Italian language model in stanza
stanza.download('it')
nlp = stanza.Pipeline(lang='it', processors='tokenize,mwt,pos,lemma')

# Helper functions for phenomena detection
def has_subject_omission(sentence):
    """Check if the first word of the sentence is a verb."""
    if not isinstance(sentence, str):
        return False
    doc = nlp(sentence)
    first_word_pos = doc.sentences[0].words[0].upos if doc.sentences and doc.sentences[0].words else None
    return first_word_pos == "VERB" or first_word_pos == "AUX"

def has_reflexive_construction(sentence):
    """Check if the sentence contains reflexive pronouns."""
    reflexive_pronouns = ["mi", "ti", "si", "ci", "vi"]
    return any(pronoun in sentence.split() for pronoun in reflexive_pronouns)

def has_double_negation(sentence):
    """Check if the sentence contains double negation."""
    negative_words = ["niente", "nessuno", "nessuna", "nessun", "nulla"]
    words = sentence.lower().split()
    return words.count("non") > 1 or "non" in words and any(word in words for word in negative_words)

def has_diminutives_or_augmentatives(sentence):
    """Check if the sentence contains diminutives or augmentatives."""
    suffixes = ["ino", "etto", "accio", "one"]
    return any(word.endswith(suffix) for suffix in suffixes for word in sentence.split())

def has_clitic_pronouns(sentence):
    """Check if the sentence contains clitic pronouns."""
    clitic_pronouns = ["lo", "la", "li", "gli", "le", "ne", "ci", "mi", "ti",
    "glielo", "gliela", "glieli", "gliele"]
    return any(pronoun in sentence.split() for pronoun in clitic_pronouns)

def phenomena_counter(df):
  phenomena_counts = {
    "Rows": len(df),
    "Sentences": len(df['it_id'].unique()),
    "Subject Omission": int(df['subject_omission'].sum()),
    "Reflexive Construction": int(df['reflexive_construction'].sum()),
    "Double Negation": int(df['double_negation'].sum()),
    "Diminutives and Augmentatives": int(df['diminutives_augmentatives'].sum()),
    "Clitic Pronouns": int(df['clitic_pronouns'].sum()),
    "Other": len(df[~(df[['subject_omission', 'reflexive_construction', 'double_negation',
                          'diminutives_augmentatives', 'clitic_pronouns']].any(axis=1))])
  }

  return phenomena_counts

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-24 23:38:55 INFO: Downloaded file to C:\Users\fabia\stanza_resources\resources.json
2024-11-24 23:38:55 INFO: Downloading default packages for language: it (Italian) ...
2024-11-24 23:38:56 INFO: File exists: C:\Users\fabia\stanza_resources\it\default.zip
2024-11-24 23:38:59 INFO: Finished downloading models and saved to C:\Users\fabia\stanza_resources
2024-11-24 23:38:59 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-24 23:38:59 INFO: Downloaded file to C:\Users\fabia\stanza_resources\resources.json
2024-11-24 23:38:59 INFO: Loading these models for language: it (Italian):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-11-24 23:38:59 INFO: Using device: cpu
2024-11-24 23:38:59 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-24 23:38:59 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-24 23:38:59 INFO: Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
2024-11-24 23:39:00 INFO: Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-24 23:39:00 INFO: Done 

In [26]:
from tqdm import tqdm

# Initialize tqdm progress bar
tqdm.pandas()

# Update functions to use tqdm for progress tracking
df['subject_omission'] = df['italian'].progress_apply(has_subject_omission)
df['reflexive_construction'] = df['italian'].progress_apply(has_reflexive_construction)
df['double_negation'] = df['italian'].progress_apply(has_double_negation)
df['diminutives_augmentatives'] = df['italian'].progress_apply(has_diminutives_or_augmentatives)
df['clitic_pronouns'] = df['italian'].progress_apply(has_clitic_pronouns)

df.to_csv('Datasets/Analysis-IT-NL.tsv', sep='\t', index=False)

100%|██████████| 17525/17525 [17:04<00:00, 17.11it/s] 
100%|██████████| 17525/17525 [00:00<00:00, 254776.03it/s]
100%|██████████| 17525/17525 [00:00<00:00, 316037.12it/s]
100%|██████████| 17525/17525 [00:00<00:00, 117639.72it/s]
100%|██████████| 17525/17525 [00:00<00:00, 164340.01it/s]


In [28]:
phenomena_counter(df)

{'Rows': 17525,
 'Sentences': 16683,
 'Subject Omission': 5223,
 'Reflexive Construction': 852,
 'Double Negation': 110,
 'Diminutives and Augmentatives': 623,
 'Clitic Pronouns': 2412,
 'Other': 9718}

## Check Diminutives and Augmentatives

In [29]:
diminutives_augmentatives = [
    word
    for words in df[df['diminutives_augmentatives']]['italian'].str.split()
    for word in words
    if word.endswith("ino") or word.endswith("etto") or word.endswith("accio") or word.endswith("one")
]

set(diminutives_augmentatives)

{'"copione',
 '"oggetto',
 'Accetto',
 'Aladino',
 'Ammetto',
 'Aspetto',
 'Attenzione',
 'Beniamino',
 'Berlino',
 'Buone',
 'Dublino',
 'Giappone',
 'Immagino',
 "L'amichetto",
 "L'architetto",
 "L'assassino",
 "L'espressione",
 "L'ottone",
 "L'ucraino",
 "L'unione",
 'L’Unione',
 'Napoleone',
 'Organizzazione',
 'Pechino',
 'Persino',
 'Platone',
 'Plutone',
 'Progetto',
 'Prometto',
 'Proviamone',
 'Rispetto',
 'Valentino',
 'ammetto',
 'applicazione',
 'architetto',
 'asino',
 'aspetto',
 'attenzione',
 'azione',
 'bambino',
 'biglietto',
 'braccio',
 'buone',
 'canzone',
 'carbone',
 'carnagione',
 'cassetto',
 'cittadino',
 'colazione',
 'collezione',
 'compassione',
 'compone',
 'concetto',
 'connessione',
 'conversazione',
 'costruzione',
 'cotone',
 'creazione',
 'cucino',
 'decisione',
 'delfino',
 "dell'Unione",
 "dell'effetto",
 "dell'osservazione",
 'destinazione',
 'destino',
 'detto',
 'direzione',
 'distinzione',
 'distrazione',
 'edizione',
 'effetto',
 'faccio',
 'fa

### Adjusting the flag based on the actual diminutives and augmentatives

In [30]:
actual_diminutives_augmentatives = ["L'amichetto", 'laghetto', 'modellino', 'motorino', 'orsetto', 'ragazzino']

df.loc[
    df['diminutives_augmentatives'] & ~df['italian'].str.contains('|'.join(actual_diminutives_augmentatives), na=False),
    'diminutives_augmentatives'
] = False
phenomena_counter(df)

{'Rows': 17525,
 'Sentences': 16683,
 'Subject Omission': 5223,
 'Reflexive Construction': 852,
 'Double Negation': 110,
 'Diminutives and Augmentatives': 7,
 'Clitic Pronouns': 2412,
 'Other': 10056}

## Split the dataset into single and multiple translations

In [31]:
multiple_translation_df = df[df['nl_id'].isin(df['nl_id'].value_counts()[lambda x: x > 1].index)]
phenomena_counter(multiple_translation_df)

{'Rows': 9636,
 'Sentences': 9191,
 'Subject Omission': 3260,
 'Reflexive Construction': 361,
 'Double Negation': 69,
 'Diminutives and Augmentatives': 0,
 'Clitic Pronouns': 1216,
 'Other': 5318}

In [32]:
df = df[~df.index.isin(multiple_translation_df.index)]
phenomena_counter(df)

{'Rows': 7889,
 'Sentences': 7585,
 'Subject Omission': 1963,
 'Reflexive Construction': 491,
 'Double Negation': 41,
 'Diminutives and Augmentatives': 7,
 'Clitic Pronouns': 1196,
 'Other': 4738}

## Create a curated dataset with a balanced distribution of phenomena

In [33]:
def sample_and_remove(source_df, target_df, column_name, n):
    if len(source_df[source_df[column_name]]) < n:
        n = len(source_df[source_df[column_name]])
    sample = source_df[source_df[column_name]].sample(n=n, replace=False)
    source_df = source_df.drop(sample.index)
    target_df = pd.concat([target_df, sample], ignore_index=True)
    return source_df, target_df, n

curated_df = pd.DataFrame(columns=df.columns)

phenomena = [
    'subject_omission',
    'reflexive_construction',
    'double_negation',
    'diminutives_augmentatives',
    'clitic_pronouns'
]

for phenomenon in phenomena:
    multiple_translation_df, curated_df, n = sample_and_remove(multiple_translation_df, curated_df, phenomenon, 60)
    df, curated_df, m = sample_and_remove(df, curated_df, phenomenon, 120 - n)
    if m + n != 120:
      print(f"Missing {120 - m - n} sentences for {phenomenon}\n")

Missing 19 sentences for double_negation

Missing 113 sentences for diminutives_augmentatives



## Add the generated sentences

In [40]:
generated_df = pd.read_csv('Datasets/Double-Negation-Generated-IT-NL.tsv', sep='\t')
tmp = pd.read_csv('Datasets/Diminutives-Augmentatives-Generated-IT-NL.tsv', sep='\t')
generated_df = pd.concat([generated_df, tmp], ignore_index=True)

for _, row in generated_df.iterrows():
    sentence = row['italian']
    translation = row['dutch']
    
    if sentence in curated_df['italian'].values:
        it_id = curated_df.loc[curated_df['italian'] == sentence, 'it_id'].values[0]
    else:
        it_id = curated_df['it_id'].max() + 1 if not curated_df.empty else 1  
    
    if translation in curated_df['dutch'].values:
        nl_id = curated_df.loc[curated_df['dutch'] == translation, 'nl_id'].values[0]
    else:
        nl_id = curated_df['nl_id'].max() + 1 if not curated_df.empty else 1

    new_row = pd.DataFrame({'it_id': [it_id], 'italian': [sentence], 'nl_id': [nl_id], 'dutch': [translation]})
    curated_df = pd.concat([curated_df, new_row], ignore_index=True)

{'Rows': 600,
 'Sentences': 591,
 'Subject Omission': 156,
 'Reflexive Construction': 154,
 'Double Negation': 103,
 'Diminutives and Augmentatives': 7,
 'Clitic Pronouns': 231,
 'Other': 132}

## Add random sentences for variety

In [41]:
curated_df = pd.concat([curated_df, df.sample(n=400, replace=False)], ignore_index=True)
len(curated_df)

1000

## Save the curated dataset

In [45]:
curated_df.drop(columns=[
    'subject_omission',
    'reflexive_construction',
    'double_negation',
    'diminutives_augmentatives',
    'clitic_pronouns'
], inplace=True, errors='ignore')
curated_df.to_csv('Datasets/Curated-IT-NL.tsv', sep='\t', index=False)