<a href="https://colab.research.google.com/github/CBaffelli/CAS-NLP_Machine-translation/blob/main/01_CAS_NLP_final_project_create_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Dataset pre-processing**

This script is used to create an initial dataset from the original corpus in JSON format.

In [None]:
#@title Imports and varia
import pandas as pd
import os
import numpy as np
import re
import string

In [None]:
#@title Mount GDrive
#Colab-only step: expose Google Drive in the notebook's file system under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#@title Load data
#Directory holding the corpus JSON files; it is joined with each file name and listed with os.listdir in the cells below.
#Left empty here - set it to the corpus folder before running the following cells.
path_to_data = ''

Since the data is provided in several JSON files, we iterate through all of them, we group the data when possible, and we create a combined dataframe.

In [None]:
#Function to process each JSON file
def process_json_file(filename):
    """Load one corpus JSON file and collapse duplicate translation pairs.

    The file is read from ``path_to_data``, the metadata columns that are not
    needed downstream are removed, and rows sharing the same source/target
    expression and language pair are merged into a single row whose
    ``pimCode`` is the comma-separated list of the distinct codes.

    Args:
        filename: Name of the JSON file, relative to ``path_to_data``.

    Returns:
        A DataFrame with the columns sourceExpression, sourceLanguage,
        targetExpression, targetLanguage and pimCode.
    """
    df = pd.read_json(os.path.join(path_to_data, filename), encoding='utf-8')
    columns_to_drop = ['translationId', 'createdAt', 'fileType', 'origin', 'translationVendor', 'reviewStatus']
    #errors='ignore' tolerates a file that lacks some of these metadata columns instead of raising KeyError
    df = df.drop(columns=columns_to_drop, errors='ignore')
    key_columns = ['sourceExpression', 'sourceLanguage', 'targetExpression', 'targetLanguage']
    #str() guards the join against non-string codes, matching the later re-aggregation which also stringifies first
    grouped = (
        df.groupby(key_columns)['pimCode']
        .unique()
        .apply(lambda codes: ', '.join(map(str, codes)))
        .reset_index()
    )
    return grouped

In [None]:
#Iterate through JSON files and create the dataframe
#os.listdir returns entries in arbitrary order, so the names are sorted to keep the row order reproducible between runs
json_filenames = sorted(filename for filename in os.listdir(path_to_data) if filename.endswith('.json'))
dataframes = [process_json_file(filename) for filename in json_filenames]
#Concatenate all the dataframes; ignore_index=True already produces a fresh 0..n-1 index, so no separate reset_index is needed
combined_df = pd.concat(dataframes, ignore_index=True)

Once we have a combined dataframe, we can do some cleanup on it:


* Harmonize the language codes to a standard mapping;
* Ensure that "en-US" is always the source language, swapping source and target where needed;
* Replace escaped entities (e.g. `&lt;`, `&quot;`) with the characters they encode;
* Remove all markup tags (from XML);
* Remove unneeded whitespace.




In [None]:
#Normalise every language code found in the corpus to one canonical "language-REGION" tag
language_mapping = {
    #Bare language codes
    'en': 'en-US',
    'de': 'de-DE',
    'da': 'da-DK',
    'cs': 'cs-CZ',
    'el': 'el-GR',
    'fr': 'fr-FR',
    'vi': 'vi-VN',
    'it': 'it-IT',
    'ja': 'ja-JP',
    'pt': 'pt-PT',
    'ru': 'ru-RU',
    'sv': 'sv-SE',
    'no': 'nb-NO',
    'pl': 'pl-PL',
    'es': 'es-ES',
    'sk': 'sk-SK',
    'ro': 'ro-RO',
    'hu': 'hu-HU',
    'nb': 'nb-NO',
    'lv': 'lv-LV',
    'fi': 'fi-FI',
    'et': 'et-EE',
    'zh': 'zh-CN',
    'lt': 'lt-LT',
    'ko': 'ko-KR',
    'sr': 'sr-RS',
    'nl': 'nl-NL',
    'tr': 'tr-TR',
    'bg': 'bg-BG',
    #Regional, script or casing variants folded into the canonical tag
    'en-GB': 'en-US',
    'bg-bg': 'bg-BG',
    'de-de': 'de-DE',
    'zh-Ha': 'zh-CN',
    'nn-NO': 'nb-NO',
    'es-x-int-SDL': 'es-ES',
    'es-419': 'es-ES',
    'sr-Latn-RS': 'sr-RS',
    'vi-VI': 'vi-VN',
}

#Codes that are not keys of the mapping are left unchanged by replace()
for language_column in ('sourceLanguage', 'targetLanguage'):
    combined_df[language_column] = combined_df[language_column].replace(language_mapping)

In [None]:
#Keep only the rows that have en-US on at least one side, then make en-US the source side everywhere.
has_english_side = (combined_df['sourceLanguage'] == 'en-US') | (combined_df['targetLanguage'] == 'en-US')
#Explicit copy: the assignments below must not write into a slice of combined_df
filtered_df = combined_df[has_english_side].copy()
#Rows whose English text currently sits on the target side
needs_swap = filtered_df['sourceLanguage'] != 'en-US'
#Swap the expressions together with their language codes; swapping the expressions alone
#would leave the row labelled e.g. de-DE -> en-US while its sourceExpression is English
swap_into = ['sourceExpression', 'targetExpression', 'sourceLanguage', 'targetLanguage']
swap_from = ['targetExpression', 'sourceExpression', 'targetLanguage', 'sourceLanguage']
filtered_df.loc[needs_swap, swap_into] = filtered_df.loc[needs_swap, swap_from].values
#Convert pimCode column to string
filtered_df['pimCode'] = filtered_df['pimCode'].astype(str)

In [None]:
#Replace escaped XML/HTML entities (and one placeholder string) with the text they stand for
character_mapping = {
    #'&amp;' encodes a literal ampersand; mapping it to '' deleted the character (e.g. "R&amp;D" became "RD")
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    'Non Breaking Hyphen Tag Text': ' ',
    '&apos;': "'",
}

#regex=True is what makes replace() substitute inside the strings rather than only on whole-cell matches
for expression_column in ('sourceExpression', 'targetExpression'):
    filtered_df[expression_column] = filtered_df[expression_column].replace(character_mapping, regex=True)

In [None]:
#Remove XML tags and markups
def remove_xml_tags(text):
    """Return *text* without its ``<...>`` markup; non-string values pass through untouched."""
    if not isinstance(text, str):
        return text
    #A tag is "<", at least one character that is not ">", then ">"
    return re.sub(r'<[^>]+>', '', text)

#Strip the markup from both expression columns
for expression_column in ('sourceExpression', 'targetExpression'):
    filtered_df[expression_column] = filtered_df[expression_column].apply(remove_xml_tags)

In [None]:
#Collapse every run of whitespace (newlines, tabs, repeated spaces) into one space and trim both ends
for expression_column in ('sourceExpression', 'targetExpression'):
    single_spaced = filtered_df[expression_column].str.replace(r'\s+', ' ', regex=True)
    filtered_df[expression_column] = single_spaced.str.strip()

In [None]:
#Strip leading and trailing spaces from every string cell in the dataframe; other values are kept as they are.
#Series.map is applied column by column because DataFrame.applymap is deprecated since pandas 2.1,
#while its replacement DataFrame.map does not exist in older releases.
filtered_df = filtered_df.apply(lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x))

In [None]:
#Keep a row only when neither its source nor its target expression is the empty string
source_not_empty = filtered_df['sourceExpression'].ne('')
target_not_empty = filtered_df['targetExpression'].ne('')
filtered_df = filtered_df[source_not_empty & target_not_empty]

In [None]:
#A sentence made up solely of non-word characters and/or digits carries no translatable text
pattern = r'^[\W\d]+$'

#Flag a row as soon as either side consists only of such characters
source_only_symbols = filtered_df['sourceExpression'].str.match(pattern)
target_only_symbols = filtered_df['targetExpression'].str.match(pattern)
mask = source_only_symbols | target_only_symbols

#Keep the rows that were not flagged
filtered_df = filtered_df[~mask]

In [None]:
#The cleanup may have made previously different rows identical, so they are merged once more.
#Everything is cast to string first so that the pimCode values can be joined.
filtered_df = filtered_df.astype(str)
pim_codes_per_pair = filtered_df.groupby(['sourceExpression', 'sourceLanguage', 'targetExpression', 'targetLanguage'])['pimCode'].unique()
aggregated_data = pim_codes_per_pair.apply(', '.join).reset_index()

In [None]:
#Check that at most 40% of the characters of each sentence are non-letters
def is_valid_sentence(row, max_non_letter_ratio=0.4):
    """Tell whether both sides of a translation pair consist mostly of letters.

    A character counts as a letter when ``str.isalpha()`` is true for it, so
    letters of any script (accented Latin, Cyrillic, Greek, CJK, ...) are
    recognised. Comparing against ``string.ascii_letters`` instead treats
    every non-ASCII letter as noise and rejects whole languages.

    Args:
        row: Mapping (e.g. a DataFrame row) with the string entries
            'sourceExpression' and 'targetExpression'.
        max_non_letter_ratio: Highest accepted share of non-letter
            characters (spaces, digits, punctuation, ...) in a sentence.

    Returns:
        True when both sentences are non-empty and stay within the ratio,
        False otherwise.
    """
    def _is_mostly_letters(sentence):
        #An empty sentence has no ratio to compute; reject it instead of dividing by zero
        if not sentence:
            return False
        non_letters = sum(1 for char in sentence if not char.isalpha())
        return non_letters / len(sentence) <= max_non_letter_ratio

    return _is_mostly_letters(row['sourceExpression']) and _is_mostly_letters(row['targetExpression'])

def clean_dataset(df):
    """Return the rows of *df* that is_valid_sentence accepts."""
    #Evaluate the validity check once per row, then use the result as a row filter
    rows_to_keep = df.apply(is_valid_sentence, axis=1)
    return df[rows_to_keep]

In [None]:
#Drop the translation pairs that is_valid_sentence rejects
aggregated_data = clean_dataset(aggregated_data)

We save the full dataset

In [None]:
#Write all language pairs together to MT_dataset.csv, without the dataframe index
aggregated_data.to_csv('MT_dataset.csv', index=False)

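For each language pair, we create a sub-dataset that keeps only sentences of more than 5 and fewer than 80 words and contains no duplicate source or target sentences.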

In [None]:
#Create sub-datasets for each language pair
#Sentence-length bounds in whitespace-separated words; both bounds are exclusive
min_words = 5
max_words = 80

groups = aggregated_data.groupby(['sourceLanguage', 'targetLanguage'])
sub_datasets = {}
for (source_lang, target_lang), group in groups:
    #The word counts are kept as standalone Series: adding helper columns to `group`
    #writes into a slice of aggregated_data and can trigger SettingWithCopyWarning
    source_length = group['sourceExpression'].str.split().str.len()
    target_length = group['targetExpression'].str.split().str.len()
    length_ok = (
        (source_length > min_words) & (source_length < max_words)
        & (target_length > min_words) & (target_length < max_words)
    )
    #The language columns are dropped because the language pair is already encoded in the dictionary key
    sub_dataset = group[length_ok].drop(columns=['sourceLanguage', 'targetLanguage'])
    #Remove duplicates: first on the source sentence, then on the target sentence (first occurrence wins)
    sub_dataset = sub_dataset.drop_duplicates(subset=['sourceExpression'], keep='first')
    sub_dataset = sub_dataset.drop_duplicates(subset=['targetExpression'], keep='first')
    sub_datasets[f'{source_lang}_{target_lang}'] = sub_dataset

#Save each sub-dataset to a CSV file named after its language pair
for language_pair, sub_dataset in sub_datasets.items():
    sub_dataset[['sourceExpression', 'targetExpression', 'pimCode']].to_csv(f'{language_pair}.csv', index=False)
