In [23]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '20'

import numpy as np
import pandas as pd
import re
import csv
from parsivar import Normalizer
import unicodedata

In [24]:
def is_english(text: str) -> bool:
    # Define a regular expression pattern for English characters
    pattern = re.compile(r'^[A-Za-z0-9\s&,.\'\"!?;:()\-<>//+=*%$#@!\[\]€]+$')
    
    # Check if the text matches the pattern
    if pattern.match(text):
        return True
    return False

In [25]:
def count_english_words(row):
    if row is None:
        return 10
    words = row.split(' ')
    cnt = 0
    for word in words:
        if re.match('[a-zA-z]+', word):
            cnt += 1
            
    return cnt

In [26]:
def read_data_from_dir(directory='./'):
    files = os.listdir(directory)
    data = []
    for file in files:
        if 'done' not in file: 
            continue
        with open(file, 'r') as txtfile:
            data.append(txtfile.read())

    data = '\n'.join(data).split('\n')
    data = [x.split('\t')[:2] for x in data]
    df = pd.DataFrame(data, columns=['Source', 'Target'])
    return df

In [27]:
MAX_SOURCE_LEN = 4000

fa_normalizer = Normalizer()

data = read_data_from_dir()
data['Source'] = data['Source'].str.strip().replace('...', ' ')
data['Target'] = data['Target'].str.strip().replace('...', ' ')
data['Target'] = data['Target'].map(lambda x: fa_normalizer.normalize(x) if x is not None else None)
data = data.dropna()
data['Target'] = data['Target'].map(lambda x: re.sub('\u200c', ' ', x))
data = data[data['Source'].map(lambda x: len(x.split(' ')) <= MAX_SOURCE_LEN)]
data = data.drop_duplicates(keep='first')
print(len(data))
data

3997465


Unnamed: 0,Source,Target
0,flash fire .,فلاش آتش .
1,superheats the air . burns the lungs like rice...,هوا را فوق العاده گرم می کند . ریه ها را مثل ک...
2,"hey , guys . down here . down here .",سلام بچه ها . این پایین . این پایین .
3,what do you got down this corridor is the bow ...,چه چیزی در این راهرو پایین آمده است ، درست است .
4,theres an access hatch right there that puts u...,یک دریچه دسترسی درست در آنجا وجود دارد که ما ر...
...,...,...
5157124,The devices are connected in a daisy chain tal...,دستگاه ها به صورت زنجیره ای به هم متصل شده اند...
5157125,Learn more about our SC Net,درباره SC Net ما بیشتر بیاموزید
5157130,Premium quality third party products.,محصولات شخص ثالث با کیفیت برتر .
5157132,Compact. Lightweight. Protected.,فشرده . سبک وزن . محافظت شده است .


In [28]:
def remove_diacritics(text):
    # Normalize the text to decompose characters with diacritics into base characters and combining characters
    normalized_text = unicodedata.normalize('NFKD', text)
    
    # Remove the combining characters
    without_diacritics = ''.join(c for c in normalized_text if not unicodedata.combining(c))
    
    return without_diacritics

data['Source'] = data['Source'].map(remove_diacritics)
data['Source'] = data['Source'].map(lambda x: re.sub('—|–', '-', x))
data['Source'] = data['Source'].map(lambda x: re.sub('»', ' ', x))
data['Source'] = data['Source'].map(lambda x: re.sub('”', '"', x))
data['Source'] = data['Source'].map(lambda x: re.sub('“', '"', x))
data['Source'] = data['Source'].map(lambda x: re.sub('‘', "'", x))
data['Source'] = data['Source'].map(lambda x: re.sub('’', "'", x))
data['Source'] = data['Source'].map(lambda x: re.sub('•|§', " ", x))

In [29]:
to_remove = []
for i in range(data.shape[0]):
    text = data.iloc[i].Source
    index = int(data.iloc[i].name)
    if not is_english(text):
        to_remove.append(index)
    print(f'\rChecked {i+1:>8}', end='')

Checked  3997465

In [30]:
int(data.iloc[20000].name)

20172

In [31]:
len(to_remove)

37292

In [32]:
data = data[~data.index.isin(to_remove)]

In [33]:
data.to_csv('combined.tsv', sep='\t', header=False, index=False, encoding='utf-8')