In [3]:
import re
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
from pandarallel import pandarallel


# Start pandarallel with six workers and its progress bars switched on.
pandarallel.initialize(nb_workers=6, progress_bar=True)
# Let pandas render up to 200 rows before truncating a displayed frame.
pd.set_option("display.max_rows", 200)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
# Read the raw CSV; the preview below shows one text line per row together
# with its chapter, strophe and line identifiers.
df = pd.read_csv(filepath_or_buffer="../raw/vef.csv")

# Show the first three rows as a quick sanity check.
df.head(n=3)

Unnamed: 0,id,line,chapter,chapter_id,strophe_id,line_id
0,1.1.1,"რომელმან შექმნა სამყარო ძალითა მით ძლიერითა,",დასაწყისი,1,1,1
1,1.1.2,"ზეგარდმო არსნი სულითა ყვნა ზეცით მონაბერითა,",დასაწყისი,1,1,2
2,1.1.3,"ჩვენ, კაცთა, მოგვცა ქვეყანა, გვაქვს უთვალავი ფ...",დასაწყისი,1,1,3


# Minimal 

In [None]:
# Dump every value of the `line` column, separated by single spaces, into one file.
# The encoding is pinned to UTF-8 because the text is Georgian: without it,
# open() falls back to the platform's locale encoding, which can raise
# UnicodeEncodeError or yield a differently-encoded file on another machine.
with open("vef_full.txt", "w", encoding="utf-8") as fout:
    fout.write(' '.join(df.line.values))

In [None]:
# Build one token list per chapter: the chapter title followed by all of its lines.
chapters = df.groupby('chapter_id')
tokenCorpus = []

for chapter_key in tqdm(df.chapter_id.unique()):
    group = chapters.get_group(chapter_key)
    # The title comes from the group's first row and is prepended to the joined lines.
    text = (group.chapter.iloc[0] + ' ' + ' '.join(group.line)).lower()
    # Blank out every character outside the listed Georgian ranges,
    # then squeeze each run of whitespace down to a single space.
    text = re.sub(r"[^Ⴀ-ჿⴀ-ⴥᲐ-Ჿ]", ' ', text)
    text = re.sub(r"\s+", ' ', text)
    # Splitting on a single space can leave empty strings at the edges; drop them.
    tokenCorpus.append([piece for piece in text.split(' ') if piece != ''])

# Peek at the first ten tokens of the first chapter.
tokenCorpus[0][:10]

100%|██████████| 65/65 [00:00<00:00, 1446.66it/s]


['დასაწყისი',
 'რომელმან',
 'შექმნა',
 'სამყარო',
 'ძალითა',
 'მით',
 'ძლიერითა',
 'ზეგარდმო',
 'არსნი',
 'სულითა']

In [None]:
# Tally how often each token occurs across the corpus (before any filtering).
wordfreq = defaultdict(int)

for tokens in tqdm(tokenCorpus):
    for word in tokens:
        wordfreq[word] = wordfreq[word] + 1

# Remove single-character tokens from every document.
tokenCorpus = [
    list(filter(lambda word: len(word) > 1, tokens))
    for tokens in tqdm(tokenCorpus)
]

# Keep only the documents that still contain at least two tokens.
tokenCorpus = list(filter(lambda tokens: len(tokens) > 1, tqdm(tokenCorpus)))

100%|██████████| 65/65 [00:00<00:00, 4757.94it/s]
100%|██████████| 65/65 [00:00<00:00, 14929.62it/s]
100%|██████████| 65/65 [00:00<00:00, 717446.74it/s]


In [None]:
# Persist the chapter-level corpus: one document per output line, tokens
# separated by single spaces.
# encoding="utf-8" is explicit because the tokens are Georgian; relying on the
# locale default makes the write fail or change encoding on some platforms.
with open("vef_corpus_chapter.txt", "w", encoding="utf-8") as file:
    for doc in tqdm(tokenCorpus):
        file.write(' '.join(doc) + '\n')

100%|██████████| 65/65 [00:00<00:00, 19196.58it/s]


# Groupby Chapter

In [3]:
# One document per chapter: chapter title plus every line that belongs to it.
chapters = df.groupby('chapter_id')
tokenCorpus = []

for chapter_key in tqdm(df.chapter_id.unique()):
    group = chapters.get_group(chapter_key)
    title = group.chapter.iloc[0]  # title as stored on the group's first row
    text = (title + ' ' + ' '.join(group.line)).lower()
    # Characters outside the listed Georgian ranges become spaces; whitespace
    # runs are then collapsed so that tokens are separated by one space.
    text = re.sub(r"\s+", ' ', re.sub(r"[^Ⴀ-ჿⴀ-ⴥᲐ-Ჿ]", ' ', text))
    # Discard the empty strings that a leading/trailing space produces on split.
    tokenCorpus.append([piece for piece in text.split(' ') if piece != ''])

# Inspect the first ten tokens of the first chapter.
tokenCorpus[0][:10]

100%|██████████| 65/65 [00:00<00:00, 1446.66it/s]


['დასაწყისი',
 'რომელმან',
 'შექმნა',
 'სამყარო',
 'ძალითა',
 'მით',
 'ძლიერითა',
 'ზეგარდმო',
 'არსნი',
 'სულითა']

In [4]:
# Corpus-wide token counts, taken before the length filters below are applied.
wordfreq = defaultdict(int)

for tokens in tqdm(tokenCorpus):
    for word in tokens:
        wordfreq[word] = wordfreq[word] + 1

# Strip tokens that are only one character long.
tokenCorpus = [
    list(filter(lambda word: len(word) > 1, tokens))
    for tokens in tqdm(tokenCorpus)
]

# Discard documents left with fewer than two tokens.
tokenCorpus = list(filter(lambda tokens: len(tokens) > 1, tqdm(tokenCorpus)))

100%|██████████| 65/65 [00:00<00:00, 4757.94it/s]
100%|██████████| 65/65 [00:00<00:00, 14929.62it/s]
100%|██████████| 65/65 [00:00<00:00, 717446.74it/s]


In [5]:
# Save the chapter-level corpus, one space-joined document per line.
# The file is opened as UTF-8 explicitly: the tokens are Georgian, and the
# locale-dependent default encoding of open() is not guaranteed to handle them.
with open("vef_corpus_chapter.txt", "w", encoding="utf-8") as file:
    for doc in tqdm(tokenCorpus):
        file.write(' '.join(doc) + '\n')

100%|██████████| 65/65 [00:00<00:00, 19196.58it/s]


# Groupby Strophe

In [6]:
# One document per strophe: all lines sharing a strophe_id, joined by spaces.
strophes = df.groupby('strophe_id')
tokenCorpus = []

for strophe_key in tqdm(df.strophe_id.unique()):
    text = ' '.join(strophes.get_group(strophe_key).line).lower()
    # Replace anything outside the listed Georgian ranges with a space,
    # then reduce each whitespace run to a single space.
    text = re.sub(r"[^Ⴀ-ჿⴀ-ⴥᲐ-Ჿ]", ' ', text)
    text = re.sub(r"\s+", ' ', text)
    # Empty strings can appear at the edges after splitting; filter them out.
    tokenCorpus.append([piece for piece in text.split(' ') if piece != ''])

# Show the full token list of the first strophe.
tokenCorpus[0]

100%|██████████| 1669/1669 [00:00<00:00, 6057.42it/s]


['რომელმან',
 'შექმნა',
 'სამყარო',
 'ძალითა',
 'მით',
 'ძლიერითა',
 'ზეგარდმო',
 'არსნი',
 'სულითა',
 'ყვნა',
 'ზეცით',
 'მონაბერითა',
 'ჩვენ',
 'კაცთა',
 'მოგვცა',
 'ქვეყანა',
 'გვაქვს',
 'უთვალავი',
 'ფერითა',
 'და',
 'მისგან',
 'არს',
 'ყოვლი',
 'ხელმწიფე',
 'სახითა',
 'მის',
 'მიერითა']

In [7]:
# Count every token over all strophe documents (pre-filtering).
wordfreq = defaultdict(int)

for tokens in tqdm(tokenCorpus):
    for word in tokens:
        wordfreq[word] = wordfreq[word] + 1

# Drop one-character tokens inside each document.
tokenCorpus = [
    list(filter(lambda word: len(word) > 1, tokens))
    for tokens in tqdm(tokenCorpus)
]

# Retain only documents that have at least two tokens remaining.
tokenCorpus = list(filter(lambda tokens: len(tokens) > 1, tqdm(tokenCorpus)))

100%|██████████| 1669/1669 [00:00<00:00, 181925.03it/s]
100%|██████████| 1669/1669 [00:00<00:00, 319662.70it/s]
100%|██████████| 1669/1669 [00:00<00:00, 4037078.07it/s]


In [8]:
# Save the strophe-level corpus, one space-joined document per line.
# UTF-8 is requested explicitly so that writing the Georgian tokens does not
# depend on the locale encoding open() would otherwise pick.
with open("vef_corpus_strophe.txt", "w", encoding="utf-8") as file:
    for doc in tqdm(tokenCorpus):
        file.write(' '.join(doc) + '\n')

100%|██████████| 1669/1669 [00:00<00:00, 421882.32it/s]


# Groupby Line

In [9]:
# One document per text line.
lines = df  # not used in this cell; kept in case a later cell refers to `lines`
tokenCorpus = []

# Iterate over the `line` column directly rather than df.iterrows(): only that
# column is needed, iterrows() builds a full Series for every row, and a Series
# has a length, so tqdm can show a real progress bar with a total.
for raw_line in tqdm(df.line):
    text = raw_line.lower()
    # Replace anything outside the listed Georgian ranges with a space,
    # then collapse each whitespace run to a single space.
    text = re.sub(r"[^Ⴀ-ჿⴀ-ⴥᲐ-Ჿ]", ' ', text)
    text = re.sub(r"\s+", ' ', text)
    # Splitting on a single space can leave empty strings at the edges; drop them.
    tokenCorpus.append([piece for piece in text.split(' ') if piece != ''])

# Inspect up to the first ten tokens of the first line.
tokenCorpus[0][:10]

6676it [00:00, 22994.77it/s]


['რომელმან', 'შექმნა', 'სამყარო', 'ძალითა', 'მით', 'ძლიერითა']

In [10]:
# Token frequencies over all line documents, computed before filtering.
wordfreq = defaultdict(int)

for tokens in tqdm(tokenCorpus):
    for word in tokens:
        wordfreq[word] = wordfreq[word] + 1

# Filter out tokens consisting of a single character.
tokenCorpus = [
    list(filter(lambda word: len(word) > 1, tokens))
    for tokens in tqdm(tokenCorpus)
]

# Only documents with two or more tokens are kept.
tokenCorpus = list(filter(lambda tokens: len(tokens) > 1, tqdm(tokenCorpus)))

100%|██████████| 6676/6676 [00:00<00:00, 605745.11it/s]
100%|██████████| 6676/6676 [00:00<00:00, 894520.45it/s]
100%|██████████| 6676/6676 [00:00<00:00, 4299934.51it/s]


In [11]:
# Save the line-level corpus, one space-joined document per line.
# The encoding is fixed to UTF-8 because the tokens are Georgian and open()
# would otherwise use whatever encoding the current locale dictates.
with open("vef_corpus_line.txt", "w", encoding="utf-8") as file:
    for doc in tqdm(tokenCorpus):
        file.write(' '.join(doc) + '\n')

100%|██████████| 6676/6676 [00:00<00:00, 1135581.70it/s]
