In [1]:
import fitz  # PyMuPDF

doc = fitz.open("dictionary.pdf")
for page in doc:
    text = page.get_text()
    print(text)

Swedish-English Vocabulary
297 
 
Jordbruksstatistisk sammanställning 2020 
 
Swedish-English vocabulary 
Letter Åå, Ää and Öö at the end of the list 
The English translations are not always word by word 
 
A 
Administrativa områden 
Administrative areas 
Agronom 
Agronomist; M. Sc. in Agriculture 
Aktiebolag 
Limited company 
Aktiv substans 
Active substance 
Aktuell 
Current; present 
Alkoholfria drycker 
Non-alcoholic beverages 
Alkoholhaltiga drycker 
Alcoholic beverages 
Allmän 
General; common 
Alvarbete 
Grazed alvar 
Am- och diko bidrag 
Suckler cow premium 
Amkor 
Suckler cows 
Ammoniak 
Ammonia 
Andel 
Proportion; share; percentage 
Andra 
Other 
Andra orsaker 
Other reasons 
Anhöriga 
Relatives; family members 
Animalier 
Animal products 
Animalisk 
Animal  
Anläggning 
Plant; works 
Anläggningsstöd 
Grant for afforestation and foundation of wetlands 
Anmälan 
Application 
Annan 
Other 
Annan djurskötsel 
Other kind of animal husbandry 
Annan mark 
Other ground 
Annan sallat

In [1]:
import gzip
import os
import pickle
from lxml import etree
from tqdm.notebook import tqdm

def parse_large_tmx(file_path, cache_path="en-sv.pkl"):
    if os.path.exists(cache_path):
        print(f"Loading from cache: {cache_path}")
        with open(cache_path, "rb") as f:
            return pickle.load(f)

    translations = []
    skipped = 0
    
    with gzip.open(file_path, 'rb') as f:
        context = etree.iterparse(f, events=('end',), tag='tu')
        
        for _, elem in tqdm(context, desc="Parsing TMX"):
            try:
                en_seg = elem.find('.//tuv[@language="en"]/seg') or \
                         elem.find('.//tuv[1]/seg')
                sv_seg = elem.find('.//tuv[@language="sv"]/seg') or \
                         elem.find('.//tuv[2]/seg')
                
                if en_seg is not None and sv_seg is not None:
                    translations.append((en_seg.text, sv_seg.text))
                else:
                    skipped += 1
                    
            except Exception:
                skipped += 1
            finally:
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]
    
    print(f"Skipped {skipped} malformed entries")
    print(f"Caching parsed data to: {cache_path}")
    with open(cache_path, "wb") as f:
        pickle.dump(translations, f)

    return translations

# Usage
file_path = "en-sv.tmx.gz"
translations = parse_large_tmx(file_path)
print(f"Successfully extracted {len(translations)} pairs")

Parsing TMX: 0it [00:00, ?it/s]

Skipped 0 malformed entries
Caching parsed data to: en-sv.pkl
Successfully extracted 43533711 pairs


In [1]:
import pickle

# Path to your cached file
cache_path = "en-sv.pkl"

# Load the file
with open(cache_path, "rb") as f:
    translations = pickle.load(f)

# Now `translations` is a list of (en_text, sv_text) tuples
print(f"Loaded {len(translations)} translation pairs")
print(translations[:5])  # print first 5 pairs

Loaded 43533711 translation pairs
[('Previously on The Hot Zone: Anthrax.', 'I tidigare avsnitt...'), ('Director Mueller just assigned us a major case number.', 'Byråchef Mueller gav oss just ett stort fall.'), ("Investigation''s  officially been dubbed Amerithrax.", 'Utredningen har fått namnet Amerithrax.'), ('Whoever sent these  letters got their Anthrax from an American lab.', 'Brevskickaren fick sin mjältbrand från ett amerikanskt labb.'), ("We wouldn''t be here if we didn''t have evidence leading us back to USAMRIID.", 'Vi hade inte varit här om inte bevisen pekat på USAMRIID.')]


In [1]:
import pandas as pd
import pickle

with open("en-sv.pkl", "rb") as f:
    data = pickle.load(f)

# Convert to a DataFrame
df = pd.DataFrame(data, columns=["en", "sv"])

# Now you can use .info()
df.info()

# Optional: Preview a few rows
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43533711 entries, 0 to 43533710
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   en      object
 1   sv      object
dtypes: object(2)
memory usage: 664.3+ MB
                                                  en  \
0               Previously on The Hot Zone: Anthrax.   
1  Director Mueller just assigned us a major case...   
2  Investigation''s  officially been dubbed Ameri...   
3  Whoever sent these  letters got their Anthrax ...   
4  We wouldn''t be here if we didn''t have eviden...   

                                                  sv  
0                              I tidigare avsnitt...  
1      Byråchef Mueller gav oss just ett stort fall.  
2            Utredningen har fått namnet Amerithrax.  
3  Brevskickaren fick sin mjältbrand från ett ame...  
4  Vi hade inte varit här om inte bevisen pekat p...  


In [2]:
df.head()

Unnamed: 0,en,sv
0,Previously on The Hot Zone: Anthrax.,I tidigare avsnitt...
1,Director Mueller just assigned us a major case...,Byråchef Mueller gav oss just ett stort fall.
2,Investigation''s officially been dubbed Ameri...,Utredningen har fått namnet Amerithrax.
3,Whoever sent these letters got their Anthrax ...,Brevskickaren fick sin mjältbrand från ett ame...
4,We wouldn''t be here if we didn''t have eviden...,Vi hade inte varit här om inte bevisen pekat p...


In [3]:
# Remove rows where 'sv' (Swedish sentences) or 'en' (English sentences) are NaN or None
df = df.dropna(subset=['sv', 'en'])

# Verify that no missing values are present
print(df.isnull().sum())

en    0
sv    0
dtype: int64


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Convert the Swedish sentences into TF-IDF vectors
sv_vectors = vectorizer.fit_transform(df['sv'])

# Check the shape of the TF-IDF matrix
print(f"Shape of the TF-IDF matrix: {sv_vectors.shape}")

Shape of the TF-IDF matrix: (43533694, 1505881)


In [5]:
from sklearn.cluster import KMeans

# Set the number of clusters (e.g., 5 clusters)
num_clusters = 5

# Initialize the KMeans model
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# Fit the model to the Swedish sentence vectors
df['cluster'] = kmeans.fit_predict(sv_vectors)

# Check the clusters assigned to each sentence
print(df[['sv', 'en', 'cluster']].head())

                                                  sv  \
0                              I tidigare avsnitt...   
1      Byråchef Mueller gav oss just ett stort fall.   
2            Utredningen har fått namnet Amerithrax.   
3  Brevskickaren fick sin mjältbrand från ett ame...   
4  Vi hade inte varit här om inte bevisen pekat p...   

                                                  en  cluster  
0               Previously on The Hot Zone: Anthrax.        0  
1  Director Mueller just assigned us a major case...        0  
2  Investigation''s  officially been dubbed Ameri...        0  
3  Whoever sent these  letters got their Anthrax ...        0  
4  We wouldn''t be here if we didn''t have eviden...        0  


In [6]:
from sklearn.neighbors import NearestNeighbors

# Fit a KNN model on the Swedish sentence vectors (just for the Swedish part)
knn = NearestNeighbors(n_neighbors=1, metric="cosine")
knn.fit(sv_vectors)

# Define a translation function that uses clustering information
def translate(sv_input):
    # Transform the input sentence into a TF-IDF vector
    sv_input_vector = vectorizer.transform([sv_input])
    
    # Find the nearest neighbor cluster
    dist, idx = knn.kneighbors(sv_input_vector)
    
    # Get the cluster of the closest sentence
    cluster = df['cluster'].iloc[idx[0][0]]
    
    # Find the English translation of a sentence in the same cluster
    cluster_sentences = df[df['cluster'] == cluster]
    
    # Optionally: You can return a random translation from the same cluster
    translation = cluster_sentences['en'].sample(n=1).values[0]
    
    return translation

# Example usage
input_sv = "Jag älskar dig"  # Swedish sentence
output_en = translate(input_sv)
print(f"Swedish: {input_sv}\nEnglish: {output_en}")

Swedish: Jag älskar dig
English: ! - I don't remember!


In [7]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Reduce dimensions to 2D for visualization using PCA
pca = PCA(n_components=2)
reduced = pca.fit_transform(sv_vectors.toarray())  # Convert to dense array for PCA

# Plot the clustering result
plt.figure(figsize=(10, 8))
plt.scatter(reduced[:, 0], reduced[:, 1], c=df['cluster'], cmap='viridis', alpha=0.6)
plt.title("Visualization of Swedish Sentence Clusters (PCA)")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(label="Cluster")
plt.tight_layout()
plt.show()

MemoryError: Unable to allocate 477. TiB for an array with shape (43533694, 1505881) and data type float64