In [3]:
import pandas as pd
import re

def is_abbreviation(text):
    # Check if the text is in all caps and has no spaces
    return re.match(r'^[A-Z]+$', str(text)) is not None

def swap_if_abbreviation(row, changes):
    for lang in languages:
        col1 = f"{lang}.Term(1)"
        col2 = f"{lang}.Term(2)"
        if is_abbreviation(row[col1]):
            changes.append((lang, row[col1], row[col2]))
            row[col1], row[col2] = row[col2], row[col1]
    return row
    
# Load the CSV file into a DataFrame
file_path = "ED_Pub_Sachgebiet_multilingual2.csv"  # Replace with the path to your CSV file
df = pd.read_csv(file_path, delimiter=';')

# Define the languages and their column prefixes
languages = ['DES', 'ENG', 'FRS', 'ITS']

# List to store changes
changes = []

# Apply the function to each row
df = df.apply(lambda row: swap_if_abbreviation(row, changes), axis=1)

# Save the cleaned DataFrame back to a CSV file
output_file_path = "ED_Pub_Sachgebiet_multilingual2_cleaned3.csv"
df.to_csv(output_file_path, index=False, sep=';')

print(f"Cleaned CSV saved to {output_file_path}")

# Display the changes
print("\nChanges made:")
for change in changes:
    print(f"{change[0]}: Swapped '{change[1]}' with '{change[2]}'")


Cleaned CSV saved to ED_Pub_Sachgebiet_multilingual2_cleaned3.csv

Changes made:
DES: Swapped 'DIP' with 'Verein Diplomatinnen im EDA'
DES: Swapped 'CSPM' with 'konfliktsensitives Programmmanagement'
ITS: Swapped 'CSPM' with 'gestione dei programmi sensibile ai conflitti'
DES: Swapped 'EUA' with 'Europäische Umweltagentur'
ENG: Swapped 'EEA' with 'European Environment Agency'
FRS: Swapped 'AEE' with 'Agence européenne pour l'environnement'
ITS: Swapped 'AEA' with 'Agenzia europea dell'ambiente'
DES: Swapped 'IDA' with 'Internationale Entwicklungsorganisation'
ENG: Swapped 'IDA' with 'International Development Association'
FRS: Swapped 'AID' with 'Association internationale de développement'
ITS: Swapped 'AIS' with 'Associazione internazionale per lo sviluppo'
FRS: Swapped 'AIF' with 'avec incidences financières'
FRS: Swapped 'ZZZZZ' with 'nan'
DES: Swapped 'ZK' with 'Zuschlagskriterium'
FRS: Swapped 'CA' with 'critère d'adjudication'
ENG: Swapped 'OM' with 'operational management'
DES:

In [5]:
import pandas as pd
import re

def is_abbreviation(text):
    return re.match(r'^[A-Z]+$', str(text)) is not None

def swap_if_abbreviation(row, changes):
    for lang in languages:
        col1 = f"{lang}.Term(1)"
        col2 = f"{lang}.Term(2)"
        if is_abbreviation(row[col1]):
            changes.append(f"{lang}: Swapped '{row[col1]}' with '{row[col2]}'")
            row[col1], row[col2] = row[col2], row[col1]
    return row

# Load the CSV file into a DataFrame
file_path = "ED_Pub_Sachgebiet_multilingual2.csv"
df = pd.read_csv(file_path, delimiter=';')

# Define the languages
languages = ['DES', 'ENG', 'FRS', 'ITS']

# List to store changes
changes = []

# Apply the function to each row
df = df.apply(lambda row: swap_if_abbreviation(row, changes), axis=1)

# Save the cleaned DataFrame back to a CSV file
output_file_path = "ED_Pub_Sachgebiet_multilingual2_cleaned4.csv"
df.to_csv(output_file_path, index=False, sep=';')

# Write changes to a text file
changes_file_path = "changes.txt"
with open(changes_file_path, 'w') as f:
    f.write("Changes made:\n")
    for change in changes:
        f.write(change + "\n")

print(f"Cleaned CSV saved to {output_file_path}")
print(f"Changes saved to {changes_file_path}")


Cleaned CSV saved to ED_Pub_Sachgebiet_multilingual2_cleaned4.csv
Changes saved to changes.txt


In [6]:
import pandas as pd
import re
import unicodedata

def is_abbreviation(text):
    return re.match(r'^[A-Z]+$', str(text)) is not None

def fix_encoding(text):
    return unicodedata.normalize('NFKD', text.encode('utf-8').decode('utf-8', 'ignore'))

def swap_if_abbreviation(row, changes):
    for lang in languages:
        col1 = f"{lang}.Term(1)"
        col2 = f"{lang}.Term(2)"
        if is_abbreviation(row[col1]) and pd.notna(row[col2]) and row[col2] != "":
            changes.append(f"{lang}: Swapped '{row[col1]}' with '{row[col2]}'")
            row[col1], row[col2] = row[col2], row[col1]
    return row

# Load the CSV file into a DataFrame
file_path = "ED_Pub_Sachgebiet_multilingual2.csv"
df = pd.read_csv(file_path, delimiter=';', encoding='utf-8')

# Define the languages
languages = ['DES', 'ENG', 'FRS', 'ITS']

# Fix encoding for all columns
for col in df.columns:
    df[col] = df[col].apply(lambda x: fix_encoding(str(x)) if pd.notna(x) else x)

# List to store changes
changes = []

# Apply the function to each row
df = df.apply(lambda row: swap_if_abbreviation(row, changes), axis=1)

# Save the cleaned DataFrame back to a CSV file
output_file_path = "ED_Pub_Sachgebiet_multilingual2_cleaned5.csv"
df.to_csv(output_file_path, index=False, sep=';', encoding='utf-8')

# Write changes to a text file
changes_file_path = "changes2.txt"
with open(changes_file_path, 'w', encoding='utf-8') as f:
    f.write("Changes made:\n")
    for change in changes:
        f.write(change + "\n")

print(f"Cleaned CSV saved to {output_file_path}")
print(f"Changes saved to {changes_file_path}")


Cleaned CSV saved to ED_Pub_Sachgebiet_multilingual2_cleaned5.csv
Changes saved to changes2.txt
