In [17]:
import re
import pandas as pd

In [18]:
# remove special characters
def preprocess(token: str):
    # remove html tags
    token = re.sub(r'<.*?>', '', token)
    token = re.sub(r'&[a-z]+;', '', token)
    # remove new lines
    token = re.sub(r"\n", " ", token)
    token = re.sub(r'\s{2,}', ' ', token)
    # remove special characters
    token = re.sub(r'[^a-zA-Z0-9\s-]', '', token)
    # double spaces
    token = re.sub(' +', ' ', token)
    # lower
    token = token.lower()
    return token

def contains_number(inputString):
    return bool(re.search(r'\d', inputString))

def readFile(file:str):
    lines = []
    with open(file, encoding="cp1252") as f:
        for line in f:
            if line != "\n":
                lines.append(line.strip().lower())
    return lines

def getWordCounts(lines):
    total_words = 0
    word_count = {}
    for line in lines:
        for word in line.split(" "):
            if contains_number(word) or len(word) == 0:
                continue
            word = preprocess(word)
            total_words += 1
            if word not in word_count:
                word_count[word] = 1
            elif word in word_count:
                word_count[word] += 1
    return {
        "word_counts": sorted(word_count.items(), key=lambda x:x[1])[::-1],
        "total_words": total_words
    }

In [19]:
dataset = {
    "NEWS": {
        "dir": "./news",
        "corpora_path": "./news/news.txt",
        "name": "news"
    },
    "BIBLE": {
        "dir": "./bible",
        "corpora_path": "./bible/preprocessed/preprocessed_text.txt",
        "name": "bible"
    },
    "WIKI_TL": {
        "dir": "./wiki_tl",
        "corpora_path": "./wiki_tl/preprocessed_wiki_tl.txt",
        "name": "wiki_tagalog"
    },
    "HISTORICAL": {
        "dir": "./historical",
        "corpora_path": "./historical/hist-preprocessed.txt",
        "name": "historical"
    },
    "SONGS": {
        "dir": "./songs",
        "corpora_path": "./songs/preprocessed.txt",
        "name": "songs"
    }
}

In [20]:
corpora_lines = []
for key in dataset.keys():
    # corpus level
    corpus = dataset[key]
    lines = readFile(corpus["corpora_path"])
    corpora_lines.extend(lines)
    wordcount_data = getWordCounts(lines)
    wordcounts = wordcount_data["word_counts"]
    total_words = wordcount_data["total_words"]
    print(f"{key} = total:{total_words} unique:{len(wordcounts)}")

    df = pd.DataFrame(wordcounts, columns =['words', 'count'])
    df.to_csv("wordcounts" + "/" +corpus["name"] + "_wordcounts.csv", index=0)

NEWS = total:49816 unique:6868
BIBLE = total:842952 unique:24526
WIKI_TL = total:51717703 unique:239304
HISTORICAL = total:21629 unique:3973
SONGS = total:74968 unique:3175


In [21]:
# corpora level
wordcount_data = getWordCounts(corpora_lines)
wordcounts = wordcount_data["word_counts"]
total_words = wordcount_data["total_words"]

print(f"CORPORA = total:{total_words} unique:{len(wordcounts)}")
master_df = pd.DataFrame(wordcounts, columns =['words', 'count'])
master_df.to_csv("master_wordcounts.csv", index=0)

CORPORA = total:52707068 unique:254678
