In [1]:
import pandas as pd
from collections import defaultdict

# Load the tab-separated sentence pairs from a .txt file
file_path = "fra_processed.txt"
delimiter = "\t"

# Read every field as a literal string. With pandas' defaults, tokens such as
# "nan", "null" or "NA" (and empty fields) are converted to float NaN, so
# downstream code would receive floats instead of the original text.
data = pd.read_csv(
    file_path,
    sep=delimiter,
    header=None,
    names=["source", "translation"],
    dtype=str,
    keep_default_na=False,
)

# Create a simple word-to-word mapping based on frequency
def create_baseline_dict(data):
    """Build a word-to-word translation table from aligned sentence pairs.

    Each sentence is split on whitespace and the words of a pair are aligned
    purely by position (first with first, second with second, ...). When the
    two sentences differ in length, the surplus words of the longer one are
    ignored. For every source word, the target word it was aligned with most
    often is kept.

    Args:
        data: DataFrame with columns "source" and "translation" holding the
            sentence pairs. Rows where either cell is not a string (for
            example NaN/None for a missing field) are skipped.

    Returns:
        dict mapping each source word to its most frequent positional
        counterpart. On a tie, the target word that was seen first wins.
    """
    word_map = defaultdict(lambda: defaultdict(int))

    # Walk the two columns in lockstep; this avoids the per-row Series that
    # iterrows() would construct.
    for source_sentence, target_sentence in zip(data["source"], data["translation"]):
        # A missing field is not a str and has no .split(); skip such rows
        # instead of crashing on them.
        if not isinstance(source_sentence, str) or not isinstance(target_sentence, str):
            continue
        for source, target in zip(source_sentence.split(), target_sentence.split()):
            word_map[source][target] += 1

    # max() returns the first maximal key in insertion order, so ties resolve
    # to the earliest-seen target word.
    return {source: max(targets, key=targets.get) for source, targets in word_map.items()}

# Translate a sentence using the baseline dictionary
def translate_sentence(sentence, translation_dict):
    """Translate ``sentence`` word by word using ``translation_dict``.

    The sentence is split on whitespace; each token is replaced by its
    dictionary entry, tokens without an entry are passed through unchanged,
    and the result is re-joined with single spaces.
    """
    output_tokens = []
    for token in sentence.split():
        # Unknown words fall back to themselves.
        output_tokens.append(translation_dict.get(token, token))
    return " ".join(output_tokens)

In [2]:
# Main code: build the word-to-word translation table from the loaded sentence pairs
translation_dict = create_baseline_dict(data)

In [3]:
# Example: run the baseline translator on one source-language sentence
example_line = "this is an example line"
translated_line = translate_sentence(example_line, translation_dict)

# Show the input and its word-by-word translation on separate lines
print(f"Original: {example_line}", f"Translated: {translated_line}", sep="\n")

Original: this is an example line
Translated: ce est un example ligne
