In [None]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from stemming.porter2 import stem
import pandas as pd
import numpy as np
import csv
import json
import re

In [None]:
def read_stop_words(file_name):
    """Load a stop-word list from a UTF-8 text file, one entry per line.

    Only the newline character is removed from each line; any other
    whitespace is kept, and a blank line yields an empty-string entry.

    Args:
        file_name: Path of the text file to read.

    Returns:
        list[str]: The lines of the file, in file order, without '\\n'.
    """
    with open(file_name, 'r', encoding='UTF-8') as handle:
        return [raw_line.replace('\n', '') for raw_line in handle]

In [None]:
def preprocess_text(text, stop_words):
    """Tokenise, lower-case, stop-word filter and stem a piece of text.

    Args:
        text: The raw string to process.
        stop_words: Collection of lower-case words to drop. A set or
            frozenset is used as-is; any other iterable is converted to a
            set once per call so that filtering does not rescan it for
            every token.

    Returns:
        list[str]: Stemmed tokens in their original order.
    """
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)
    # Pattern and flag are kept exactly as before so the token set is unchanged.
    raw_tokens = re.compile(r'[a-zA-Z0-9]+', re.I).findall(text)
    lowered = (token.lower() for token in raw_tokens)
    # Stop words are matched against the lower-cased, *unstemmed* token.
    return [stem(token) for token in lowered if token not in stop_words]

In [None]:
def building_token_corpora_dict(file_name, stop_words):
    """Read a two-column TSV of (corpus name, text) and tokenise every row.

    Args:
        file_name: Path of a header-less, tab-separated file whose first
            column is the corpus name and whose second column is the text.
        stop_words: Stop-word collection forwarded to ``preprocess_text``.

    Returns:
        tuple(dict, dict):
            * ``token_corpora_dict`` maps each corpus name to a list of
              documents, each document being the token list returned by
              ``preprocess_text`` (rows kept in file order).
            * ``corpora_tokens`` maps each corpus name to the sorted list of
              distinct tokens seen in that corpus.
    """
    # NOTE(review): read_csv applies its default quote handling here; if the
    # texts can contain unbalanced '"' characters, confirm rows are not merged.
    file = pd.read_csv(file_name, sep='\t', header=None)

    token_corpora_dict = {}
    # Iterate the two columns directly instead of building a row Series
    # (twice) for every line via ``file.iloc[index][...]``.
    for corpora_name, text in zip(file[0], file[1]):
        tokens = preprocess_text(text, stop_words)
        token_corpora_dict.setdefault(corpora_name, []).append(tokens)

    # Sorted vocabulary: plain ``list(set(...))`` ordering changes between
    # interpreter runs (string hash randomisation), which would make any
    # downstream tie-breaking non-reproducible.
    corpora_tokens = {
        corpora_name: sorted({token for doc in docs for token in doc})
        for corpora_name, docs in token_corpora_dict.items()
    }
    return token_corpora_dict, corpora_tokens

In [None]:
def compute_term_mutual_chi_components(term, target_corpus, other_corpus):
    """Build the document-level contingency counts for one term.

    Args:
        term: The token being scored.
        target_corpus: Sequence of documents (containers supporting ``in``)
            belonging to the corpus of interest.
        other_corpus: Sequence of documents from every other corpus.

    Returns:
        tuple: ``(N, N11, N01, N10, N00, N1x, Nx1, N0x, Nx0)`` where
            N   - total number of documents,
            N11 - target documents containing the term,
            N01 - target documents not containing the term,
            N10 - other documents containing the term,
            N00 - other documents not containing the term,
            N1x = N11 + N10, Nx1 = N11 + N01,
            N0x = N00 + N01, Nx0 = N00 + N10.
    """
    def _docs_with_term(corpus):
        # Number of documents in ``corpus`` that contain ``term``.
        return sum(1 for document in corpus if term in document)

    target_size = len(target_corpus)
    other_size = len(other_corpus)

    in_target = _docs_with_term(target_corpus)
    missing_in_target = target_size - in_target
    in_other = _docs_with_term(other_corpus)
    missing_in_other = other_size - in_other

    return (
        target_size + other_size,
        in_target,
        missing_in_target,
        in_other,
        missing_in_other,
        in_target + in_other,
        in_target + missing_in_target,
        missing_in_other + missing_in_target,
        missing_in_other + in_other,
    )

In [None]:
def compute_term_mutual_infomation(term, components):
    """Compute the mutual information of a term from its contingency counts.

    Args:
        term: Unused; kept so the signature matches the other scorers.
        components: The 9-tuple ``(N, N11, N01, N10, N00, N1x, Nx1, N0x, Nx0)``
            as produced by ``compute_term_mutual_chi_components``.

    Returns:
        float: Sum over the four cells of ``(Nij / N) * log2(N * Nij /
        (row_margin * col_margin))``. Cells whose count or margin product is
        zero contribute nothing. Returns ``0.0`` when ``N`` is zero (no
        documents at all) instead of raising ``ZeroDivisionError``.
    """
    N, N11, N01, N10, N00, N1x, Nx1, N0x, Nx0 = components
    if N == 0:
        return 0.0

    # (cell count, term margin, corpus margin) for each contingency cell,
    # listed in the same order the terms were originally summed.
    cells = (
        (N11, N1x, Nx1),
        (N01, N0x, Nx1),
        (N10, N1x, Nx0),
        (N00, N0x, Nx0),
    )
    mutual_info = 0.0
    for joint, term_margin, corpus_margin in cells:
        margin_product = term_margin * corpus_margin
        # log2 is undefined for an empty cell or an empty margin; skip it.
        if N * joint != 0 and margin_product != 0:
            mutual_info += (joint / N) * np.log2(N * joint / margin_product)
    return mutual_info

In [None]:
def compute_term_chi_square_score(term, components):
    """Compute the chi-square statistic of a term from its contingency counts.

    Args:
        term: Unused by the computation.
        components: The 9-tuple ``(N, N11, N01, N10, N00, N1x, Nx1, N0x, Nx0)``.

    Returns:
        ``N * (N11*N00 - N10*N01)**2 / (Nx1 * N1x * Nx0 * N0x)``, or ``0``
        when the product of the four margins is zero.
    """
    N, N11, N01, N10, N00, N1x, Nx1, N0x, Nx0 = components

    margin_product = Nx1 * N1x * Nx0 * N0x
    if margin_product == 0:
        # Degenerate table: at least one margin is empty.
        return 0

    cross_difference = N11 * N00 - N10 * N01
    return N * cross_difference ** 2 / margin_product

In [None]:
def token_analysis_by_mutual_chi(token_corpora_dict, corpora_tokens,
                                 corpus_order=('Quran', 'OT', 'NT'), top_n=10):
    """Rank each corpus' vocabulary by mutual information and chi-square.

    Every corpus in ``corpus_order`` is treated in turn as the target and
    compared against the documents of all the other listed corpora.

    Args:
        token_corpora_dict: Maps corpus name -> list of tokenised documents.
        corpora_tokens: Maps corpus name -> list of terms to score for it.
        corpus_order: Corpus names to analyse, in output order. The default
            reproduces the previously hard-coded three corpora.
        top_n: How many of the highest-scoring terms to keep per measure.

    Returns:
        dict: ``{corpus: {'mutual_info': [[term, score], ...],
        'chi_square': [[term, score], ...]}}`` with each list sorted by
        descending score and truncated to ``top_n`` entries.

    Raises:
        KeyError: If a name in ``corpus_order`` is missing from either input.
    """
    # Convert each document to a set once, so the per-term "term in doc"
    # checks done by the component counter are constant-time instead of a
    # scan of the token list. Counts are unaffected.
    doc_sets = {
        name: [set(doc) for doc in token_corpora_dict[name]]
        for name in corpus_order
    }

    corpus_mutual_chi = {name: {} for name in corpus_order}
    for target_name in corpus_order:
        target_corpus = doc_sets[target_name]
        other_corpus = [
            doc
            for name in corpus_order if name != target_name
            for doc in doc_sets[name]
        ]

        term_mutual_info, term_chi_square = [], []
        for term in corpora_tokens[target_name]:
            components = compute_term_mutual_chi_components(term, target_corpus, other_corpus)
            term_mutual_info.append([term, compute_term_mutual_infomation(term, components)])
            term_chi_square.append([term, compute_term_chi_square_score(term, components)])

        # sorted() is stable, so equal scores keep the vocabulary order.
        term_mutual_info.sort(key=lambda pair: pair[1], reverse=True)
        term_chi_square.sort(key=lambda pair: pair[1], reverse=True)
        corpus_mutual_chi[target_name]['mutual_info'] = term_mutual_info[:top_n]
        corpus_mutual_chi[target_name]['chi_square'] = term_chi_square[:top_n]
    return corpus_mutual_chi

In [None]:
def write_top_10_mutual_chi(corpus_mutual_chi, file_name='mutual_chi.txt'):
    """Write the ranked terms as CSV, one column per corpus/measure pair.

    The header row holds ``"<corpus>:<measure>"`` labels; each following row
    holds the string form of ``[term, score rounded to 4 decimals]`` for the
    term at that rank in every column.

    Args:
        corpus_mutual_chi: ``{corpus: {measure: [[term, score], ...]}}``.
        file_name: Destination path (defaults to the original fixed name).

    Returns:
        None
    """
    from itertools import zip_longest

    def _plain(score):
        # round() on a NumPy scalar stays a NumPy scalar, whose repr inside
        # a list becomes "np.float64(...)" on NumPy >= 2; unwrap it first.
        return score.item() if isinstance(score, np.generic) else score

    headers, columns = [], []
    for corpus_name, measures in corpus_mutual_chi.items():
        for measure_name, ranking in measures.items():
            headers.append(corpus_name + ':' + measure_name)
            columns.append(
                [str([row[0], round(_plain(row[1]), 4)]) for row in ranking])

    # Transpose columns into rows; shorter columns are padded with '' so a
    # corpus with fewer ranked terms does not break the layout.
    rows = list(zip_longest(*columns, fillvalue=''))

    # Open the file only once everything is ready, so a formatting error
    # cannot leave a truncated file behind.
    with open(file_name, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(rows)

In [None]:
# Load the stop words, tokenise the corpora, then rank every corpus'
# vocabulary by mutual information and chi-square.
# NOTE(review): a later cell reloads from './englishST.txt' and
# './train_and_dev.tsv' (no '1' suffix) — confirm both file sets exist and
# that the different inputs are intentional.
stop_words = read_stop_words('./englishST1.txt')
token_corpora_dict, corpora_tokens = building_token_corpora_dict('./train_and_dev1.tsv', stop_words)
corpus_mutual_chi = token_analysis_by_mutual_chi(token_corpora_dict, corpora_tokens)

In [None]:
# Persist the per-corpus rankings; the writer targets 'mutual_chi.txt' in the
# current working directory.
write_top_10_mutual_chi(corpus_mutual_chi)

In [None]:
def compute_top3_topic_and_top10_score(corpora_texts, dictionary, lda):
    """Find the three topics with the highest average score for a corpus.

    Args:
        corpora_texts: Iterable of tokenised documents of one corpus.
        dictionary: Object exposing ``doc2bow(tokens)``.
        lda: Model exposing ``get_document_topics(bow=...)`` and
            ``show_topic(topic_id)``.

    Returns:
        tuple(list, list):
            * ``[[topic_id, avg_score], ...]`` for the three best topics as
              ranked by ``compute_avg_score_topic_id``.
            * For each of those topics, whatever ``lda.show_topic`` returns
              with its default arguments (presumably the top 10 terms —
              confirm against the gensim version in use).
    """
    doc_topic_distributions = [
        lda.get_document_topics(bow=dictionary.doc2bow(tokens))
        for tokens in corpora_texts
    ]

    ranked_ids, ranked_scores = compute_avg_score_topic_id(doc_topic_distributions)
    top3_topic_scores = [
        [ranked_id, ranked_score]
        for ranked_id, ranked_score in zip(ranked_ids[:3], ranked_scores[:3])
    ]
    top_10_tokens = [lda.show_topic(entry[0]) for entry in top3_topic_scores]
    return top3_topic_scores, top_10_tokens

In [None]:
def compute_avg_score_topic_id(all_topic_score_list, num_topics=20):
    """Average each topic's score over all documents and rank the topics.

    Args:
        all_topic_score_list: One entry per document, each an iterable of
            ``(topic_id, score)`` pairs. Topics absent from a document's
            list count as zero for that document.
        num_topics: Size of the topic id space. Defaults to 20, the value
            that used to be hard-coded; pass the model's topic count when it
            differs.

    Returns:
        tuple(list[int], list[float]): Topic ids ordered by descending
        average score, and the averages in that same order. With no
        documents all averages are ``0.0`` (rather than NaN from a division
        by zero).
    """
    # float64 accumulator: keeps the running sums from taking on a narrower
    # dtype if the incoming scores are e.g. NumPy float32 scalars.
    topic_totals = np.zeros(num_topics, dtype=float)
    for doc_topic_scores in all_topic_score_list:
        for (topic_id, topic_score) in doc_topic_scores:
            topic_totals[topic_id] += topic_score

    num_docs = len(all_topic_score_list)
    avg_scores = topic_totals / num_docs if num_docs else topic_totals

    topic_ids = np.argsort(avg_scores)[::-1].tolist()
    avg_scores = avg_scores[topic_ids].tolist()
    return topic_ids, avg_scores

In [None]:
def lda_topic_analysis(token_corpora_dict):
    """Train one LDA model on all three corpora and summarise each corpus.

    A single 20-topic ``LdaModel`` (``random_state=1000``) is fitted on the
    concatenation of the 'Quran', 'OT' and 'NT' documents; each corpus is
    then summarised with ``compute_top3_topic_and_top10_score``.

    Args:
        token_corpora_dict: Maps 'Quran', 'OT' and 'NT' to lists of
            tokenised documents.

    Returns:
        dict: Keys ``quran_topic_scores``, ``quran_tokens``,
        ``ot_topic_scores``, ``ot_tokens``, ``nt_topic_scores`` and
        ``nt_tokens``, in that order.
    """
    quran_texts, ot_texts, nt_texts = (
        token_corpora_dict[name] for name in ('Quran', 'OT', 'NT'))

    all_texts = quran_texts + ot_texts + nt_texts
    dictionary = Dictionary(all_texts)
    bow_corpus = [dictionary.doc2bow(document) for document in all_texts]
    lda = LdaModel(bow_corpus, id2word=dictionary, num_topics=20, random_state=1000)

    per_corpus = (
        ('quran_topic_scores', 'quran_tokens', quran_texts),
        ('ot_topic_scores', 'ot_tokens', ot_texts),
        ('nt_topic_scores', 'nt_tokens', nt_texts),
    )
    results = {}
    for scores_key, tokens_key, corpus_texts in per_corpus:
        topic_scores, topic_tokens = compute_top3_topic_and_top10_score(
            corpus_texts, dictionary, lda)
        results[scores_key] = topic_scores
        results[tokens_key] = topic_tokens
    return results

In [None]:
def write_top3_topics_and_top10_tokens(quran_topic_scores, quran_tokens, ot_topic_scores,
                                       ot_tokens, nt_topic_scores, nt_tokens):
    """Dump the per-corpus topic summaries to 'lda_results.txt'.

    For each corpus (Quran, OT, NT, in that order) a heading line is
    written, followed by two lines per topic: the string form of the
    ``topic_scores`` entry and the string form of the matching ``tokens``
    entry at the same index.

    Args:
        quran_topic_scores, ot_topic_scores, nt_topic_scores: Sequences of
            topic score entries.
        quran_tokens, ot_tokens, nt_tokens: Sequences indexed in parallel
            with the corresponding scores.

    Returns:
        None
    """
    sections = (
        ("Quran top 3 topic scores and top 10 tokens for each topic\n",
         quran_topic_scores, quran_tokens),
        ("OT top 3 topic scores and top 10 tokens for each topic\n",
         ot_topic_scores, ot_tokens),
        ("NT top 3 topic scores and top 10 tokens for each topic\n",
         nt_topic_scores, nt_tokens),
    )
    with open('lda_results.txt', 'w', encoding='utf-8') as out_file:
        for heading, topic_scores, topic_tokens in sections:
            out_file.write(heading)
            for position, topic_score in enumerate(topic_scores):
                out_file.write(str(topic_score) + "\n")
                # Index (rather than zip) so a missing tokens entry still
                # raises IndexError.
                out_file.write(str(topic_tokens[position]) + "\n")
    return

In [None]:
# Reload stop words and corpora for the LDA section. This rebinds
# `stop_words`, `token_corpora_dict` and `corpora_tokens` that the earlier
# mutual-information cell also assigns.
# NOTE(review): the earlier cell reads './englishST1.txt' and
# './train_and_dev1.tsv'; confirm which file names are the intended inputs.
stop_words = read_stop_words('./englishST.txt')
token_corpora_dict, corpora_tokens = building_token_corpora_dict('./train_and_dev.tsv', stop_words)

In [None]:
# Fit the LDA model on all corpora and collect the per-corpus top topics.
results = lda_topic_analysis(token_corpora_dict)

In [None]:
# The keys of `results` match the writer's parameter names, so the dict is
# unpacked as keyword arguments; output goes to 'lda_results.txt'.
write_top3_topics_and_top10_tokens(**results)