In [1]:
import pandas as pd 
import numpy as np 

data_folder = "../data/semantic_alignment/"

def load_data(language: str): 
    # File paths for German (source) and English (target)
    lang_file = data_folder + f"en-{language}.txt/" + f"News-Commentary.en-{language}.{language}"
    en_file = data_folder + f"en-{language}.txt/" + f"News-Commentary.en-{language}.en"

    # Read files into lists
    with open(lang_file, "r", encoding="utf-8") as f_lang, open(en_file, "r", encoding="utf-8") as f_en:
        lang_sentences = f_lang.readlines()
        en_sentences = f_en.readlines()

    # Create a DataFrame
    df = pd.DataFrame({language: [s.strip() for s in lang_sentences], "en": [s.strip() for s in en_sentences]})

    return df

In [33]:
languages = ["de", "fr", "es", "zh"]

# load the data for each language and save the csv file under the data folder 
for lang in languages: 
    df = load_data(lang)
    df.to_csv(data_folder + f"{lang}.csv", index=False)
    print(f"Saved {lang}.csv")

Saved de.csv
Saved fr.csv
Saved es.csv
Saved zh.csv


In [50]:
import os
import random
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorForLanguageModeling,
    AdamW,
)
from peft.tuners.lora import LoraConfig
from peft.mapping import get_peft_model
from peft.utils.peft_types import TaskType


model_path = "distilbert/distilbert-base-cased"
model_path = "distilbert/distilbert-base-multilingual-cased"
# Initialize model and tokenizer based on selection
tokenizer = AutoTokenizer.from_pretrained(model_path)

device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
print(f"Using device: {device}")

# Initialize the model
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=3,
).to(device)

# Apply LoRA adapters with CLI arguments
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=2,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=(["q_lin", "v_lin"]),
)
model = get_peft_model(model, peft_config)
model.to("cuda" if torch.cuda.is_available() else "cpu")

Using device: mps


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(119547, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=7

In [51]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(119547, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=7

In [46]:
# Get trainable parameters
trainable_params = []
all_param = 0
trainable_param = 0

for name, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
        trainable_params.append(name)
        trainable_param += param.numel()

print(f"trainable params: {trainable_param} || all params: {all_param} || trainable%: {100 * trainable_param / all_param}")
model.print_trainable_parameters()

trainable params: 629763 || all params: 66413574 || trainable%: 0.9482444055788957
trainable params: 629,763 || all params: 66,413,574 || trainable%: 0.9482


In [47]:
classifier_params = sum(p.numel() for p in model.base_model.model.classifier.parameters())
print(f"Number of parameters in classification layer: {classifier_params}")

Number of parameters in classification layer: 4614


In [48]:
pre_classifier_params = sum(p.numel() for p in model.base_model.model.pre_classifier.parameters())
print(f"Number of parameters in classification layer: {pre_classifier_params}")

Number of parameters in classification layer: 1181184


In [49]:
# Calculate trainable params excluding classifier and pre_classifier
classifier_params = sum(p.numel() for p in model.base_model.model.classifier.parameters())
pre_classifier_params = sum(p.numel() for p in model.base_model.model.pre_classifier.parameters())

trainable_wo_classifiers = trainable_param - classifier_params - pre_classifier_params
print(f"Trainable params excluding classifiers: {trainable_wo_classifiers}")

Trainable params excluding classifiers: -556035


In [6]:
# Load German-English dataset
de_df = pd.read_csv(data_folder + "de.csv")

# Set display options to show full content
pd.set_option('display.max_colwidth', None)

# Display first few examples
print("Sample German-English pairs:\n")
de_df.head(10)

Sample German-English pairs:



Unnamed: 0,de,en
0,Steigt Gold auf 10.000 Dollar?,"$10,000 Gold?"
1,"SAN FRANCISCO – Es war noch nie leicht, ein rationales Gespräch über den Wert von Gold zu führen.",SAN FRANCISCO – It has never been easy to have a rational conversation about the value of gold.
2,"In letzter Zeit allerdings ist dies schwieriger denn je, ist doch der Goldpreis im letzten Jahrzehnt um über 300 Prozent angestiegen.","Lately, with gold prices up more than 300% over the last decade, it is harder than ever."
3,"Erst letzten Dezember verfassten meine Kollegen Martin Feldstein und Nouriel Roubini Kommentare, in denen sie mutig die vorherrschende optimistische Marktstimmung hinterfragten und sehr überlegt auf die Risiken des Goldes hinwiesen.","Just last December, fellow economists Martin Feldstein and Nouriel Roubini each penned op-eds bravely questioning bullish market sentiment, sensibly pointing out gold’s risks."
4,"Und es kam, wie es kommen musste.",Wouldn’t you know it?
5,Seit der Veröffentlichung ihrer Artikel ist der Goldpreis noch weiter gestiegen.,"Since their articles appeared, the price of gold has moved up still further."
6,Jüngst erreichte er sogar ein Rekordhoch von 1.300 Dollar.,"Gold prices even hit a record-high $1,300 recently."
7,"Im letzten Dezember argumentierten die Goldbugs, dass der Preis zweifellos in Richtung 2.000 Dollar gehen würde.","Last December, many gold bugs were arguing that the price was inevitably headed for $2,000."
8,"Beflügelt aufgrund des anhaltenden Aufwärtstrends, meint man nun mancherorts, dass Gold sogar noch höher steigen könnte.","Now, emboldened by continuing appreciation, some are suggesting that gold could be headed even higher than that."
9,"Ein erfolgreicher Gold-Investor erklärte mir vor kurzem, dass die Aktienkurse über ein Jahrzehnt dahingedümpelt waren, bevor der Dow Jones-Index in den frühen 1980er Jahren die Marke von 1.000 Punkten überschritt.","One successful gold investor recently explained to me that stock prices languished for a more than a decade before the Dow Jones index crossed the 1,000 mark in the early 1980’s."
