In [72]:
import requests
from zipfile import ZipFile
import pandas as pd
import polars as pl
import numpy as np
import pyarrow as pa

In [73]:
"""
    Read the es-en.txt, this is a database English-Spanish from  https://europat.net
    the file has a huge size, so we will use polars to read it.    
"""

'\n    Read the es-en.txt, this is a database English-Spanish from  https://europat.net\n    the file has a huge size, so we will use polars to read it.    \n'

In [74]:
df_euro_pat = pl.read_csv('es-en.txt', sep='\t', n_rows=1_000_000)

In [75]:
df_euro_pat.columns = ['spa', 'eng']

In [76]:
# move eng->spa and spa->eng
df_euro_pat = df_euro_pat.with_columns([pl.col('spa').alias('eng'), pl.col('eng').alias('spa')])
df_euro_pat.head()

spa,eng
str,str
"""The styrene co...","""El contenido d..."
"""Adaptable supp...","""Dispositivo de..."
"""1, a communica...","""1, un sistema ..."
"""Immunoblotting...","""La inmunotrans..."
"""Statistical an...","""Análisis estad..."


In [77]:
df_euro_pat.columns = ["eng", "spa"]
df_euro_pat.head()

eng,spa
str,str
"""The styrene co...","""El contenido d..."
"""Adaptable supp...","""Dispositivo de..."
"""1, a communica...","""1, un sistema ..."
"""Immunoblotting...","""La inmunotrans..."
"""Statistical an...","""Análisis estad..."


In [78]:
df_euro_pat.describe()

describe,eng,spa
str,str,str
"""count""","""1000000""","""1000000"""
"""null_count""","""0""","""0"""
"""mean""",,
"""std""",,
"""min""",""""" Alkanyl "" by...","""! 20-80 % de a..."
"""max""","""〈Second Embodi...","""◦ calcular una..."
"""median""",,


In [79]:
df_euro_pat.write_parquet('es-en.parquet')

In [80]:
df_euro_pat.write_csv('es-en_1.txt', separator='\t')

In [81]:
file_name = "spa-eng.zip"

url = f'https://www.manythings.org/anki/{file_name}'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

r = requests.get(url, headers=headers)
# save the file:
with open(file_name, 'wb') as file:
    file.write(r.content)
    
with ZipFile(file_name) as zip_file:
    zip_file.extractall("./spa-eng")

df_many_things = pd.read_csv('spa-eng/spa.txt', sep='\t', header=None)
df_many_things.columns = ['eng', 'spa', 'ignore']
df_many_things = df_many_things[['eng', 'spa']]
df_many_things.to_csv('spa-eng/spa.txt', sep='\t', index=False)

In [82]:
df_many_things = pd.read_csv('spa-eng/spa.txt', sep='\t')

In [83]:
df_many_things.describe()

Unnamed: 0,eng,spa
count,139636,139636
unique,118570,131221
top,You can put it there.,Estoy quebrado.
freq,68,12


In [84]:
# merge  datasets 
df_many_things_polars = pl.from_pandas(df_many_things)
df_many_things_polars.head()

eng,spa
str,str
"""Go.""","""Ve."""
"""Go.""","""Vete."""
"""Go.""","""Vaya."""
"""Go.""","""Váyase."""
"""Hi.""","""Hola."""


In [85]:
## Concat datasets
# Note that one columns has eng->spa and the other spa->eng
df_general = pl.concat([df_euro_pat, df_many_things_polars], )

In [86]:
df_general.describe()

describe,eng,spa
str,str,str
"""count""","""1139636""","""1139636"""
"""null_count""","""0""","""0"""
"""mean""",,
"""std""",,
"""min""",""""" Alkanyl "" by...","""! 20-80 % de a..."
"""max""","""〈Second Embodi...","""◦ calcular una..."
"""median""",,
