In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '20'

import numpy as np
import pandas as pd
import re
import csv
from parsivar import Normalizer
import tensorflow_text as tf_text

In [2]:
def count_english_words(row):
    if row is None:
        return 10
    words = row.split(' ')
    cnt = 0
    for word in words:
        if re.match('[a-zA-z]+', word):
            cnt += 1
            
    return cnt

In [3]:
def read_data_from_dir(directory='./'):
    files = os.listdir(directory)
    data = []
    for file in files:
        if 'done' not in file: 
            continue
        with open(file, 'r') as txtfile:
            data.append(txtfile.read())

    data = '\n'.join(data).split('\n')
    data = [x.split('\t')[:2] for x in data]
    df = pd.DataFrame(data, columns=['Source', 'Target'])
    return df

In [4]:
MAX_SOURCE_LEN = 30

fa_normalizer = Normalizer()

data = read_data_from_dir()
data['Source'] = data['Source'].str.strip().replace('...', ' ')
data['Target'] = data['Target'].str.strip().replace('...', ' ')
data['Target'] = data['Target'].map(lambda x: fa_normalizer.normalize(x) if x is not None else None)
data = data[data['Target'].map(lambda x: count_english_words(x) == 0)]
data['Target'] = data['Target'].map(lambda x: re.sub('\u200c', ' ', x))
data = data[data['Source'].map(lambda x: len(x.split(' ')) <= MAX_SOURCE_LEN)]
data = data.drop_duplicates(keep='first')
print(len(data))
data = data.dropna()
data

297378


Unnamed: 0,Source,Target
0,the current king is only kim myung 's puppet .,پادشاه فعلی فقط عروسک خیمه شب بازی کیم میونگ ا...
1,"how could he ask for consul jang 's help ,",چگونه می توانست از کنسول جانگ کمک بخواهد ،
2,when he must know what happened between kim my...,وقتی او باید بداند بین کیم میونگ و کنسول جانگ ...
3,but he said in the letter that his life was in...,اما در نامه گفته است که جانش در خطر است .
4,he would seek help anywhere if his life were i...,اگر جانش در خطر بود از هر جا کمک می گرفت .
...,...,...
553230,they heard the cause of the disturbance.,علت اغتشاش را شنیدند .
553502,Madame de Saint Meran two months since;,مادام دو سنت مران دو ماه از آن زمان ;
553581,"Have you, then, power against death?",پس آیا قدرتی در برابر مرگ دارید ؟
553626,Have you something to tell me? asked he.,چیزی دارید که به من بگویید ؟ از او پرسید .


In [5]:
data.to_csv('combined.txt', sep='\t', header=False, index=False, encoding='utf-8')