In [1]:
import os
import re

import pandas as pd

In [6]:
def contains_number(inputString):
    """Tell whether ``inputString`` contains at least one digit character.

    Args:
        inputString: The string to scan.

    Returns:
        bool: True if the ``\\d`` pattern matches anywhere in the string,
        False otherwise (including for the empty string).
    """
    digit_match = re.search(r'\d', inputString)
    return digit_match is not None

def readFile(file: str, encoding: str = "utf-8") -> list:
    """Read a text file and return its non-blank lines, normalised.

    Every line is stripped of leading/trailing whitespace and lower-cased.
    Lines that are empty once stripped (blank or whitespace-only) are dropped,
    so the result never contains empty strings.

    Args:
        file: Path of the text file to read.
        encoding: Encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.

    Returns:
        list: The normalised, non-empty lines in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file, encoding=encoding) as f:
        # Strip first, then filter: a line holding only spaces/tabs is blank too.
        stripped_lines = (line.strip() for line in f)
        return [line.lower() for line in stripped_lines if line]

def getWordCounts(lines):
    """Count word frequencies over an iterable of text lines.

    Each line is split on runs of whitespace; every token is lower-cased
    before counting. Tokens for which ``contains_number`` is true are ignored
    and do not contribute to the total.

    Args:
        lines: Iterable of strings, one per line of text.

    Returns:
        dict: A mapping with two keys:
            ``"word_counts"``: list of ``(word, count)`` tuples ordered by
            descending count.
            ``"total_words"``: number of tokens that were counted.
    """
    total_words = 0
    word_count = {}
    for line in lines:
        # split() with no argument breaks on any whitespace run (spaces, tabs,
        # non-breaking spaces, ...) and never yields empty tokens, so words
        # separated by something other than a single space are not glued together.
        for word in line.split():
            if contains_number(word):
                continue
            word = word.lower()
            word_count[word] = word_count.get(word, 0) + 1
            total_words += 1

    # Ascending stable sort followed by a reversal: highest counts first, with
    # equal counts appearing in reverse first-seen order.
    ranked = sorted(word_count.items(), key=lambda item: item[1])
    ranked.reverse()
    return {
        "word_counts": ranked,
        "total_words": total_words
    }

In [3]:
# Registry of the corpora to process, keyed by corpus label. Each entry holds:
#   "dir"          - directory of the corpus
#   "corpora_path" - text file that is read for word counting
#   "name"         - short name used to build the per-corpus output filename
dataset = {
    "NEWS": {"dir": "./news", "corpora_path": "./news/news.txt", "name": "news"},
    "BIBLE": {"dir": "./bible", "corpora_path": "./bible/preprocessed/preprocessed_text.txt", "name": "bible"},
    "WIKI_TL": {"dir": "./wiki_tl", "corpora_path": "./wiki_tl/preprocessed_wiki_tl.txt", "name": "wiki_tagalog"},
    "HISTORICAL": {"dir": "./historical", "corpora_path": "./historical/hist-preprocessed.txt", "name": "historical"},
    "SONGS": {"dir": "./songs", "corpora_path": "./songs/preprocessed.txt", "name": "songs"},
}

In [4]:
# Corpus level: count words for each corpus on its own, write one CSV per
# corpus, and accumulate every line read so a later cell can count across
# all corpora at once.
output_dir = "wordcounts"
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write (no-op if it already does).
os.makedirs(output_dir, exist_ok=True)

corpora_lines = []  # reset here so re-running the cell does not duplicate lines
for key, corpus in dataset.items():
    lines = readFile(corpus["corpora_path"])
    corpora_lines.extend(lines)
    wordcount_data = getWordCounts(lines)
    wordcounts = wordcount_data["word_counts"]
    total_words = wordcount_data["total_words"]
    print(f"{key} = total:{total_words} unique:{len(wordcounts)}")

    df = pd.DataFrame(wordcounts, columns=['words', 'count'])
    # index=False: the row index carries no information, keep it out of the CSV.
    df.to_csv(os.path.join(output_dir, corpus["name"] + "_wordcounts.csv"), index=False)

NEWS = total:50022 unique:6869
BIBLE = total:842952 unique:24599
WIKI_TL = total:51717709 unique:240130
HISTORICAL = total:21629 unique:3976
SONGS = total:77786 unique:3550


In [5]:
# Corpora level: count words over the lines accumulated in `corpora_lines`,
# report the totals, and save the combined table as a single CSV.
wordcount_data = getWordCounts(corpora_lines)
wordcounts, total_words = wordcount_data["word_counts"], wordcount_data["total_words"]

print(f"CORPORA = total:{total_words} unique:{len(wordcounts)}")
master_df = pd.DataFrame(data=wordcounts, columns=['words', 'count'])
# The row index is left out of the file.
master_df.to_csv("master_wordcounts.csv", index=False)

CORPORA = total:52710098 unique:256119
