# Import Package

In [1]:
import sklearn
import collections

# regular expressions
import re
# for string.punctuation: list of punctuation characters
import string
from stemming.porter2 import stem
from collections import Counter

# import this for storing our BOW format
import scipy
from scipy import sparse
# scikit learn. Contains lots of ML models we can use
# import the library for support vector machines
from sklearn import svm
from sklearn import ensemble
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer


import numpy as np
import csv

# from gensim.test.utils import common_texts
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

# 1. IR Evaluation

In [2]:
# read qrels.csv and system_results.csv
qrels_mat = np.loadtxt(open("qrels.csv","rb"), delimiter=",", skiprows=1)
system_results_mat = np.loadtxt(open("system_results.csv","rb"), delimiter=",", skiprows=1)

In [3]:
system_number = 6
query_number = 10
system_number_index = np.array([x[0] for x in system_results_mat])
system_query_index = np.array([x[1] for x in system_results_mat])
query_id_index = np.array([x[0] for x in qrels_mat])
doc_id_index = np.array([x[1] for x in qrels_mat])

query_rel_doc = []
doc_rel_dict = {}
# store relevant doc_id for every query
for i in range(query_number):
    doc_array = qrels_mat[query_id_index==i+1]
    doc_list = [x[1] for x in doc_array]
    query_rel_doc.extend([doc_list])
    
    rel_dict = {}
    for j in range(len(doc_array)):
        rel_dict[doc_array[j][1]] = doc_array[j][2]
    
    doc_rel_dict[i+1] = rel_dict

# doc_rel_dict

In [64]:
# doc_rel_dict

{1: {9090.0: 3.0,
  6850.0: 2.0,
  9574.0: 2.0,
  8709.0: 1.0,
  9684.0: 1.0,
  5011.0: 1.0},
 2: {5715.0: 2.0,
  9677.0: 2.0,
  5766.0: 2.0,
  6327.0: 1.0,
  6079.0: 1.0,
  5653.0: 1.0,
  6498.0: 1.0,
  7117.0: 1.0},
 3: {9743.0: 3.0},
 4: {6491.0: 3.0,
  5269.0: 3.0,
  8032.0: 3.0,
  9444.0: 3.0,
  8988.0: 2.0,
  9445.0: 2.0,
  5883.0: 2.0,
  7435.0: 2.0,
  9745.0: 1.0,
  10029.0: 1.0,
  7224.0: 1.0,
  9038.0: 1.0,
  7827.0: 1.0,
  6675.0: 1.0,
  9720.0: 1.0,
  6289.0: 1.0,
  9746.0: 1.0,
  6836.0: 1.0,
  10119.0: 1.0,
  4742.0: 1.0,
  9739.0: 1.0,
  5783.0: 1.0,
  10117.0: 1.0,
  8414.0: 1.0,
  5865.0: 1.0,
  8315.0: 1.0,
  9523.0: 1.0,
  8318.0: 1.0,
  6288.0: 1.0,
  5268.0: 1.0,
  7620.0: 1.0,
  7046.0: 1.0,
  6054.0: 1.0,
  9744.0: 1.0,
  6743.0: 1.0,
  9278.0: 1.0,
  8562.0: 1.0,
  6382.0: 1.0,
  6334.0: 1.0,
  6292.0: 1.0},
 5: {1646.0: 1.0,
  2126.0: 1.0,
  3111.0: 1.0,
  4983.0: 1.0,
  8646.0: 1.0,
  6669.0: 1.0,
  8282.0: 1.0},
 6: {8433.0: 3.0,
  7487.0: 3.0,
  6736.0: 3.0,

In [4]:
# get P@10 for each system and each query

num_traverse = 10
p10_list = []
# traverse 6 system
for i in range(system_number):
    # get the array of system for every system 0-6
    system_array = system_results_mat[system_number_index==i+1]
    query_index = np.array([x[1] for x in system_array])
    
    p10_query = []
    # traverse 10 query
    for j in range(query_number):
        # get the array of query for every query 0-10
        query_array = system_array[query_index==j+1]
        query_10 = query_array[:10]
        doc_query_10 = [x[2] for x in query_10]
       
        num_rel = 0
        for doc in doc_query_10:
            if doc in query_rel_doc[j]:
                num_rel += 1
                
        p10_query.extend([num_rel/num_traverse])
            
    p10_list.extend([p10_query])

# p10_list

In [5]:
# get R@50 for each system and each query

r50_list = []
# traverse 6 system
for i in range(system_number):
    # get the array of system for every system 0-6
    system_array = system_results_mat[system_number_index==i+1]
    query_index = np.array([x[1] for x in system_array])
    
    r50_query = []
    # traverse 10 query
    for j in range(query_number):
        # get the array of query for every query 0-10
        query_array = system_array[query_index==j+1]
        query_50 = query_array[:50]
        doc_query_50 = [x[2] for x in query_50]
       
        num_relevant = len(query_rel_doc[j])  # store the num of relevant documents in recall
        num_rel = 0
        for doc in doc_query_50:
            if doc in query_rel_doc[j]:
                num_rel += 1
                
        r50_query.extend([num_rel/num_relevant])
            
    r50_list.extend([r50_query])

# r50_list

In [6]:
# get r-precision for each system and each query

rp_list = []
# traverse 6 system
for i in range(system_number):
    # get the array of system for every system 0-6
    system_array = system_results_mat[system_number_index==i+1]
    query_index = np.array([x[1] for x in system_array])
    
    rp_query = []
    # traverse 10 query
    for j in range(query_number):
        # get the array of query for every query 0-10
        num_relevant = len(query_rel_doc[j])  # store the num of relevant documents in recall
    
        query_array = system_array[query_index==j+1]
        query_n = query_array[:num_relevant]
        doc_query_n = [x[2] for x in query_n]
       
        num_rel = 0
        for doc in doc_query_n:
            if doc in query_rel_doc[j]:
                num_rel += 1
                
        rp_query.extend([num_rel/num_relevant])
            
    rp_list.extend([rp_query])

# rp_list

In [7]:
# get AP for each system and each query

ap_list = []
# traverse 6 system
for i in range(system_number):
    # get the array of system for every system 0-6
    system_array = system_results_mat[system_number_index==i+1]
    query_index = np.array([x[1] for x in system_array])
    
    ap_query = []
    # traverse 10 query
    for j in range(query_number):
        # get the array of query for every query 0-10   
        query_array = system_array[query_index==j+1]
        doc_query_n = [x[2] for x in query_array]
       
        ap = 0
        rel = 0
        traverse = 0
        # traverse all documents for each query
        for k in range(len(doc_query_n)):
            traverse += 1
            if doc_query_n[k] in query_rel_doc[j]:
                rel += 1
                ap += rel / traverse
        
        ap /= len(query_rel_doc[j])
                          
        ap_query.extend([ap])
            
    ap_list.extend([ap_query])

# ap_list

In [8]:
# get nDCG@10 for each system and each query

ndcg10_list = []
n = 10
ig_lists = []

# generate iG list
for i in range(query_number):
    doc_array = qrels_mat[query_id_index==i+1]
    ig_list = [x[2] for x in doc_array]
    
    if len(ig_list) < n:
        for j in range(n-len(ig_list)):
            ig_list.extend([0])
    else:
        ig_list = ig_list[:n]
    
    ig_lists.extend([ig_list])
    
# traverse 6 system
for i in range(system_number):
    # get the array of system for every system 0-6
    system_array = system_results_mat[system_number_index==i+1]
    query_index = np.array([x[1] for x in system_array])
    
    ndcg10_query = []
    # traverse 10 query
    for j in range(query_number):
        # get the array of query for every query 0-10   
        query_array = system_array[query_index==j+1]
        query_n = query_array[:n]
        doc_query_n = [x[2] for x in query_n]
       
        g_list = []
        # get G list 
        for k in range(len(doc_query_n)):
            if doc_query_n[k] in query_rel_doc[j]:
                g_list.extend([doc_rel_dict[j+1][doc_query_n[k]]])
            else:
                g_list.extend([0])
        
        # calculate DCG
        dcg = g_list[0]
        for p in range(len(g_list)-1):
            a = g_list[p + 1] / np.log2(p + 2)
            dcg += a
            
        # calculate iDCG  
        ig_list = ig_lists[j]
        idcg = ig_list[0]
        for q in range(len(ig_list)-1):
            b = ig_list[q + 1] / np.log2(q + 2)
            idcg += b
        
        if idcg == 0 :                    
            ndcg10_query.extend([0])
        else:
            ndcg10_query.extend([dcg / idcg])
            
    ndcg10_list.extend([ndcg10_query])

# ndcg10_list

In [9]:
# get nDCG@20 for each system and each query

ndcg20_list = []
n = 20
ig_lists = []

# generate iG list
for i in range(query_number):
    doc_array = qrels_mat[query_id_index==i+1]
    ig_list = [x[2] for x in doc_array]
    
    if len(ig_list) < n:
        for j in range(n-len(ig_list)):
            ig_list.extend([0])
    else:
        ig_list = ig_list[:n]
    
    ig_lists.extend([ig_list])
    
# traverse 6 system
for i in range(system_number):
    # get the array of system for every system 0-6
    system_array = system_results_mat[system_number_index==i+1]
    query_index = np.array([x[1] for x in system_array])
    
    ndcg20_query = []
    # traverse 10 query
    for j in range(query_number):
        # get the array of query for every query 0-10   
        query_array = system_array[query_index==j+1]
        query_n = query_array[:n]
        doc_query_n = [x[2] for x in query_n]
       
        g_list = []
        # get G list 
        for k in range(len(doc_query_n)):
            if doc_query_n[k] in query_rel_doc[j]:
                g_list.extend([doc_rel_dict[j+1][doc_query_n[k]]])
            else:
                g_list.extend([0])
        
        # calculate DCG
        dcg = g_list[0]
        for p in range(len(g_list)-1):
            a = g_list[p + 1] / np.log2(p + 2)
            dcg += a
            
        # calculate iDCG  
        ig_list = ig_lists[j]
        idcg = ig_list[0]
        for q in range(len(ig_list)-1):
            b = ig_list[q + 1] / np.log2(q + 2)
            idcg += b
        
        if idcg == 0 :                    
            ndcg20_query.extend([0])
        else:
            ndcg20_query.extend([dcg / idcg])
            
    ndcg20_list.extend([ndcg20_query])

# ndcg20_list

In [10]:
# generate p10_mean_list r50_mean_list rp_mean_list ap_mean_list ndcg10_mean_list ndcg20_mean_list

p10_mean_list = []; r50_mean_list = []; rp_mean_list = [] 
ap_mean_list = []; ndcg10_mean_list = []; ndcg20_mean_list = []

for i in range(6):
    p10_mean_list.extend([np.mean(p10_list[i])])
    r50_mean_list.extend([np.mean(r50_list[i])])
    rp_mean_list.extend([np.mean(rp_list[i])])
    ap_mean_list.extend([np.mean(ap_list[i])])
    ndcg10_mean_list.extend([np.mean(ndcg10_list[i])])
    ndcg20_mean_list.extend([np.mean(ndcg20_list[i])])

In [11]:
p10_mean_list

[0.39, 0.22000000000000003, 0.41, 0.08, 0.41, 0.41]

### Select the best system (p-value)

In [12]:
# for p10:  best: 3 second: 6
t_p10, p_p10 = scipy.stats.ttest_ind(p10_list[2], p10_list[5])
# for r50:  best: 2 second: 1
t_r50, p_r50 = scipy.stats.ttest_ind(r50_list[1], r50_list[0])
# for r-precision:  best: 3 second: 6
t_rp, p_rp = scipy.stats.ttest_ind(rp_list[2], rp_list[5])
# for AP:  best: 3 second: 6
t_ap, p_ap = scipy.stats.ttest_ind(ap_list[2], ap_list[5])
# for nDCG@10:  best: 3 second: 6
t_ndcg10, p_ndcg10 = scipy.stats.ttest_ind(ndcg10_list[2], ndcg10_list[5])
# for nDCG@20:  best: 3 second: 6
t_ndcg20, p_ndcg20 = scipy.stats.ttest_ind(ndcg20_list[2], ndcg20_list[5])

In [13]:
print('t-statistic of p10: ' + str(t_p10) + '   ' + 'p-value of p10: ' + str(p_p10))
print('t-statistic of r50: ' + str(t_r50) + '   ' + 'p-value of r50: ' + str(p_r50))
print('t-statistic of rp: ' + str(t_rp) + '   ' + 'p-value of rp: ' + str(p_rp))
print('t-statistic of ap: ' + str(t_ap) + '   ' + 'p-value of ap: ' + str(p_ap))
print('t-statistic of ndcg10: ' + str(t_ndcg10) + '   ' + 'p-value of ndcg10: ' + str(p_ndcg10))
print('t-statistic of ndcg20: ' + str(t_ndcg20) + '   ' + 'p-value of ndcg20: ' + str(p_ndcg20))

t-statistic of p10: 0.0   p-value of p10: 1.0
t-statistic of r50: 0.3880082704800308   p-value of r50: 0.7025603945402291
t-statistic of rp: 0.0   p-value of rp: 1.0
t-statistic of ap: 0.042513564801249584   p-value of ap: 0.9665573458870693
t-statistic of ndcg10: 0.15065127811699436   p-value of ndcg10: 0.8819262156757446
t-statistic of ndcg20: 0.16786186996140187   p-value of ndcg20: 0.8685635746981927


## Generate ir_eval.csv

In [61]:
f = open('ir_eval.csv', 'w', encoding='utf-8')

csv_writer = csv.writer(f)
csv_writer.writerow(['system_number', 'query_number', 'P@10', 'R@50', 'r-precision', 'AP', 'nDCG@10', 'nDCG@20'])

# traverse 6 systems
for i in range(6):
    # traverse 10 queries
    for j in range(10):
        p_10 = round(p10_list[i][j], 3)
        r_50 = round(r50_list[i][j], 3)
        rp = round(rp_list[i][j], 3)
        ap = round(ap_list[i][j], 3)
        ndcg10 = round(ndcg10_list[i][j], 3)
        ndcg20 = round(ndcg20_list[i][j], 3)
        csv_writer.writerow([i+1, j+1, p_10, r_50, rp, ap, ndcg10, ndcg20])
    # write mean
    p10_mean = round(p10_mean_list[i], 3)
    r50_mean = round(r50_mean_list[i], 3)
    rp_mean = round(rp_mean_list[i], 3)
    ap_mean = round(ap_mean_list[i], 3)
    ndcg10_mean = round(ndcg10_mean_list[i], 3)
    ndcg20_mean = round(ndcg20_mean_list[i], 3)
    csv_writer.writerow([i+1, 'mean', p10_mean, r50_mean, rp_mean, ap_mean, ndcg10_mean, ndcg20_mean])


# 2. Text Analysis

## Preparation

In [15]:
# generate OT NT Quran corpus

with open('train_and_dev.tsv', 'r') as f:
    data = f.readlines()

data_list = []
for d in data:
    a = d.rstrip('\n').split('\t')
    data_list.extend([a])
    
# get ot_list nt_list quran_list
ot_list = []; nt_list = []; quran_list = []
for i in range(len(data_list)):
    if data_list[i][0] == 'OT':
        ot_list.extend([data_list[i][1]])
    elif data_list[i][0] == 'NT':
        nt_list.extend([data_list[i][1]])
    elif data_list[i][0] == 'Quran':
        quran_list.extend([data_list[i][1]])        

In [16]:
print('length of OT: ' + str(len(ot_list)))
print('length of NT: ' + str(len(nt_list)))
print('length of Quran: ' + str(len(quran_list)))

length of OT: 20766
length of NT: 7112
length of Quran: 5616


In [17]:
# delete punctuation
# re.sub(pattern, repl, string, count=0, flags=0)
# put words in englishST into a list
with open('englishST.txt', 'r') as eng:
    eng_str = eng.read()
eng_list = eng_str.split()
    
r = """[0-9!#$%&'"()*+,-./:;\\\<=>?@[\]^_`{|}~\n]"""

def preprocessing(s):
    # delete punctuation
    no_punct = re.sub(r, ' ', s)

    # transform to lower_case
    no_punct = no_punct.lower()

    # put string to words list
    no_list = no_punct.split()

    # delete stopping words(englishST.txt) in no_list
    stop_list = []
    for i in no_list:
        if (i not in eng_list):
            stop_list.extend([i])

    # stemming
    norm_list = []
    for i in stop_list:
        norm_list.append(stem(i))
    
    return norm_list

In [18]:
%%time
# get stemming word_list of OT, NT and Quran corpus
# every list in corpus represents a document
ot_pre_list = []; nt_pre_list = []; quran_pre_list = []

for i in range(len(ot_list)):
    line = preprocessing(ot_list[i])
    if (len(line) > 0):
        ot_pre_list.extend([line])

for j in range(len(nt_list)):
    line = preprocessing(nt_list[j])
    if (len(line) > 0):
        nt_pre_list.extend([line])

for k in range(len(quran_list)):
    line = preprocessing(quran_list[k])
    if (len(line) > 0):
        quran_pre_list.extend([line])

CPU times: user 12.5 s, sys: 123 ms, total: 12.6 s
Wall time: 13.6 s


In [19]:
print('length of ot_pre_list: ' + str(len(ot_pre_list)))
print('length of nt_pre_list: ' + str(len(nt_pre_list)))
print('length of quran_pre_list: ' + str(len(quran_pre_list)))

length of ot_pre_list: 20765
length of nt_pre_list: 7100
length of quran_pre_list: 5606


In [20]:
# generate all list
ot_all_list = []; nt_all_list = []; quran_all_list = []

for i in range(len(ot_pre_list)):
    ot_all_list.extend(ot_pre_list[i])
for j in range(len(nt_pre_list)):
    nt_all_list.extend(nt_pre_list[j])
for k in range(len(quran_pre_list)):
    quran_all_list.extend(quran_pre_list[k])

In [21]:
# get corpus dictionary and its key_list

# opeartion for OT
ot_counter = Counter(ot_all_list)
ot_dict = dict(ot_counter)
# operation for NT
nt_counter = Counter(nt_all_list)
nt_dict = dict(nt_counter)
# operation for Quran
quran_counter = Counter(quran_all_list)
quran_dict = dict(quran_counter)

# delete when value < 10
list1 = []; list2 = []
for key, value in ot_dict.items():
    a = 0; b = 0
    if key in nt_dict.keys():
        a = nt_dict[key]
    if key in quran_dict.keys():
        b = quran_dict[key]    
    if a + b + value >= 10:
        list1.append(key)
        list2.append(value)
         
ot_dict = dict(zip(list1,list2))
ot_key = list(ot_dict.keys())

# delete when value < 10
list1 = []; list2 = []
for key, value in nt_dict.items():
    a = 0; b = 0
    if key in ot_dict.keys():
        a = ot_dict[key]
    if key in quran_dict.keys():
        b = quran_dict[key]    
    if a + b + value >= 10:
        list1.append(key)
        list2.append(value)
         
nt_dict = dict(zip(list1,list2))
nt_key = list(nt_dict.keys())

# delete when value < 10
list1 = []; list2 = []
for key, value in quran_dict.items():
    a = 0; b = 0
    if key in ot_dict.keys():
        a = ot_dict[key]
    if key in nt_dict.keys():
        b = nt_dict[key]    
    if a + b + value >= 10:
        list1.append(key)
        list2.append(value)
         
quran_dict = dict(zip(list1,list2))
quran_key = list(quran_dict.keys())

In [22]:
# special word list in OT, NT and Quran
special_list = list(set(ot_key).union(set(nt_key)).union(set(quran_key)))
print('length of ot_key: ' + str(len(ot_key)))
print('length of nt_key: ' + str(len(nt_key)))
print('length of quran_key: ' + str(len(quran_key)))
print('length of special_list: ' + str(len(special_list)))

length of ot_key: 2644
length of nt_key: 2101
length of quran_key: 1771
length of special_list: 2823


In [23]:
'israel' in ot_key

True

In [24]:
%%time
# calculate the number of times every word appears in the document in each corpus

# for OT
num_ot_dict = {}
for i in range(len(ot_key)):
    num_ot_dict[ot_key[i]] = 0   
for j in range(len(ot_pre_list)):
    w_list = list(np.unique(ot_pre_list[j]))
    for w in w_list:
        if w in ot_key:
            num_ot_dict[w] += 1

# for NT
num_nt_dict = {}
for i in range(len(nt_key)):
    num_nt_dict[nt_key[i]] = 0   
for j in range(len(nt_pre_list)):
    w_list = list(np.unique(nt_pre_list[j]))
    for w in w_list:
        if w in nt_key:
            num_nt_dict[w] += 1

# for Quran
num_quran_dict = {}
for i in range(len(quran_key)):
    num_quran_dict[quran_key[i]] = 0   
for j in range(len(quran_pre_list)):
    w_list = list(np.unique(quran_pre_list[j]))
    for w in w_list:
        if w in quran_key:
            num_quran_dict[w] += 1

CPU times: user 4.52 s, sys: 22.9 ms, total: 4.55 s
Wall time: 4.91 s


## Compute Mutual Information

In [25]:
# compute mutual information
def takeSecond(elem):
    return elem[1]

N_ot = len(ot_pre_list); N_nt = len(nt_pre_list); N_quran = len(quran_pre_list)
N = N_ot + N_nt + N_quran

In [26]:
# calaculate N N0 N1 N11 N10 N01 N00 for OT

mi_ot_list = []; X2_ot_list = []
N1 = N_ot  # ot
N0 = N - N_ot  # not ot

for w in special_list:
    N11 = 0
    if w in ot_key:  # in ot
        N11 = num_ot_dict[w]          
    N01 = N1 - N11
       
    N10 = 0
    if w in nt_key:  # in nt
        N10 += num_nt_dict[w] 
    if w in quran_key:  # in quran
        N10 += num_quran_dict[w]     
    N00 = N0 - N10
    
    a = 0; b = 0; c = 0; d = 0
    if N11 != 0:
        a = (N11 / N) * np.log2((N * N11)/((N10+N11) * N1))
    if N01 != 0:
        b = (N01 / N) * np.log2((N * N01)/((N00+N01) * N1))
    if N10 != 0:
        c = (N10 / N) * np.log2((N * N10)/((N10+N11) * N0))
    if N00 != 0:
        d = (N00 / N) * np.log2((N * N00)/((N00+N01) * N0))
       
    mi = a + b + c + d
    X2 = ((N11+N10+N01+N00)*(N11*N00-N10*N01)**2) / ((N11+N01)*(N11+N10)*(N10+N00)*(N01+N00))
     
    mi_ot_list.extend([[w, mi]])
    X2_ot_list.extend([[w, X2]])
    
mi_ot_list.sort(key=takeSecond, reverse=True)
X2_ot_list.sort(key=takeSecond, reverse=True)

In [27]:
# calaculate N N0 N1 N11 N10 N01 N00 for NT

mi_nt_list = []; X2_nt_list = []
N1 = N_nt  # nt
N0 = N - N_nt  # not nt

for w in special_list:
    N11 = 0
    if w in nt_key:  # in nt
        N11 = num_nt_dict[w]          
    N01 = N1 - N11
       
    N10 = 0
    if w in ot_key:  # in ot
        N10 += num_ot_dict[w] 
    if w in quran_key:  # in quran
        N10 += num_quran_dict[w]     
    N00 = N0 - N10
    
    a = 0; b = 0; c = 0; d = 0
    if N11 != 0:
        a = (N11 / N) * np.log2((N * N11)/((N10+N11) * N1))
    if N01 != 0:
        b = (N01 / N) * np.log2((N * N01)/((N00+N01) * N1))
    if N10 != 0:
        c = (N10 / N) * np.log2((N * N10)/((N10+N11) * N0))
    if N00 != 0:
        d = (N00 / N) * np.log2((N * N00)/((N00+N01) * N0))
       
    mi = a + b + c + d
    X2 = ((N11+N10+N01+N00)*(N11*N00-N10*N01)**2) / ((N11+N01)*(N11+N10)*(N10+N00)*(N01+N00))
     
    mi_nt_list.extend([[w, mi]])
    X2_nt_list.extend([[w, X2]])
    
mi_nt_list.sort(key=takeSecond, reverse=True)
X2_nt_list.sort(key=takeSecond, reverse=True)

In [28]:
# calaculate N N0 N1 N11 N10 N01 N00 for Quran

mi_quran_list = []; X2_quran_list = []
N1 = N_quran  # quran
N0 = N - N_quran  # not quran

for w in special_list:
    N11 = 0
    if w in quran_key:  # in quran
        N11 = num_quran_dict[w]          
    N01 = N1 - N11
       
    N10 = 0
    if w in ot_key:  # in ot
        N10 += num_ot_dict[w] 
    if w in nt_key:  # in nt
        N10 += num_nt_dict[w]     
    N00 = N0 - N10
    
    a = 0; b = 0; c = 0; d = 0
    if N11 != 0:
        a = (N11 / N) * np.log2((N * N11)/((N10+N11) * N1))
    if N01 != 0:
        b = (N01 / N) * np.log2((N * N01)/((N00+N01) * N1))
    if N10 != 0:
        c = (N10 / N) * np.log2((N * N10)/((N10+N11) * N0))
    if N00 != 0:
        d = (N00 / N) * np.log2((N * N00)/((N00+N01) * N0))
       
    mi = a + b + c + d
    X2 = ((N11+N10+N01+N00)*(N11*N00-N10*N01)**2) / ((N11+N01)*(N11+N10)*(N10+N00)*(N01+N00))
     
    mi_quran_list.extend([[w, mi]])
    X2_quran_list.extend([[w, X2]])
    
mi_quran_list.sort(key=takeSecond, reverse=True)
X2_quran_list.sort(key=takeSecond, reverse=True)

## Output MI and X2

In [31]:
mi_quran_list[:10]

[['god', 0.03153127928829265],
 ['muhammad', 0.028866958854356823],
 ['believ', 0.019710986535567396],
 ['torment', 0.019665532570955824],
 ['messeng', 0.015592852710699903],
 ['revel', 0.01390536208881128],
 ['king', 0.013097948078937837],
 ['israel', 0.013023790650338828],
 ['unbeliev', 0.012542024266425358],
 ['guidanc', 0.012242290887039623]]

## TOPIC-LEVEL COMPARISONS (LDA)

In [32]:
%%time
# Create a corpus from a list of texts
common_texts = ot_pre_list.copy()
common_texts.extend(nt_pre_list)
common_texts.extend(quran_pre_list)

common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# Train the model on the corpus
lda = LdaModel(common_corpus, num_topics=20, id2word=common_dictionary, random_state=0)

CPU times: user 7.08 s, sys: 34.5 ms, total: 7.11 s
Wall time: 7.23 s


In [33]:
# lda.print_topics()

In [34]:
# get topic score for ot_pre_list:
ot_score_list = []
for i in range(len(ot_pre_list)):
    doc = ot_pre_list[i]
    doc_bow = common_dictionary.doc2bow(doc)
    ot_score_list.extend([lda.get_document_topics(doc_bow, minimum_probability=0)])

# get topic score for nt_pre_list:
nt_score_list = []
for j in range(len(nt_pre_list)):
    doc = nt_pre_list[j]
    doc_bow = common_dictionary.doc2bow(doc)
    nt_score_list.extend([lda.get_document_topics(doc_bow, minimum_probability=0)])

# get topic score for quran_pre_list:
quran_score_list = []
for k in range(len(quran_pre_list)):
    doc = quran_pre_list[k]
    doc_bow = common_dictionary.doc2bow(doc)
    quran_score_list.extend([lda.get_document_topics(doc_bow, minimum_probability=0)])

In [35]:
# get topic average score for ot_pre_list, nt_avg_list and quran_avg_list
num_topic = 20
ot_avg_list = []; nt_avg_list = []; quran_avg_list = [] 

for i in range(num_topic):
    # for ot_avg_list
    ot_score = [x[i][1] for x in ot_score_list]
    ot_avg_score = sum(ot_score) / len(ot_score)
    ot_avg_list.extend([(i, ot_avg_score)])
    # for nt_avg_list
    nt_score = [x[i][1] for x in nt_score_list]
    nt_avg_score = sum(nt_score) / len(nt_score)
    nt_avg_list.extend([(i, nt_avg_score)])
    # for quran_avg_list
    quran_score = [x[i][1] for x in quran_score_list]
    quran_avg_score = sum(quran_score) / len(quran_score)
    quran_avg_list.extend([(i, quran_avg_score)])
    
ot_avg_list.sort(key=takeSecond, reverse=True)
nt_avg_list.sort(key=takeSecond, reverse=True)
quran_avg_list.sort(key=takeSecond, reverse=True)

In [36]:
print('Top 3 topics in OT: ' + str(ot_avg_list[:3]))
print('Top 3 topics in NT: ' + str(nt_avg_list[:3]))
print('Top 3 topics in Quran: ' + str(quran_avg_list[:3]))

Top 3 topics in OT: [(18, 0.07698535544350618), (11, 0.06747042924715593), (12, 0.05813475964167517)]
Top 3 topics in NT: [(13, 0.08209313911064463), (16, 0.06848463969026417), (18, 0.06589683862737525)]
Top 3 topics in Quran: [(11, 0.1269977275438763), (18, 0.09698604104876876), (13, 0.09211305664083747)]


In [37]:
print(lda.print_topic(18))
print(lda.print_topic(11))
print(lda.print_topic(12))

0.253*"god" + 0.062*"lord" + 0.052*"judgment" + 0.037*"peopl" + 0.029*"evil" + 0.023*"children" + 0.021*"day" + 0.020*"abraham" + 0.019*"glori" + 0.018*"afraid"
0.101*"god" + 0.074*"lord" + 0.061*"peopl" + 0.047*"truth" + 0.041*"deed" + 0.036*"book" + 0.035*"forgiv" + 0.033*"turn" + 0.032*"great" + 0.028*"merci"
0.161*"messeng" + 0.149*"son" + 0.083*"spirit" + 0.080*"prophet" + 0.024*"testifi" + 0.023*"wait" + 0.022*"wife" + 0.019*"man" + 0.018*"name" + 0.017*"daughter"


In [38]:
print(len(ot_pre_list))
print(len(nt_pre_list))
print(len(quran_pre_list))

20765
7100
5606


# 3. Text Classification

In [198]:
# ##########################################
# # stemming + stopping
# # Shuffle the order of data
# all_X = ot_pre_list.copy()
# all_X.extend(nt_pre_list)
# all_X.extend(quran_pre_list)

# all_y = []
# for i in range(len(ot_pre_list)):
#     all_y.extend(['ot'])
# for j in range(len(nt_pre_list)):
#     all_y.extend(['nt'])
# for k in range(len(quran_pre_list)):
#     all_y.extend(['quran'])

# # Split the dataset into training set and a seperate development set
# X_training, X_deving, y_training, y_deving = train_test_split(all_X, all_y, train_size=0.9, test_size=0.1, shuffle=True)

# # get vocab set
# vocab_set = set([])

# for doc1 in ot_pre_list:
#     for word1 in doc1:
#         vocab_set.add(word1)
# for doc2 in nt_pre_list:
#     for word2 in doc2:
#         vocab_set.add(word2)
# for doc3 in quran_pre_list:
#     for word3 in doc3:
#         vocab_set.add(word3)
# ##########################################

In [39]:
# tokenisation
r = """[0-9!#$%&'"()*+,-./:;\\\<=>?@[\]^_`{|}~\n]"""

def tokenising(s):
    # delete punctuation
    no_punct = re.sub(r, ' ', s)

    # transform to lower_case
    no_punct = no_punct.lower()

    # put string to words list
    no_list = no_punct.split()
    
    return no_list

In [40]:
%%time
# get tokenising word_list of OT, NT and Quran corpus
# every list in corpus represents a document
ot_token_list = []; nt_token_list = []; quran_token_list = []

for i in range(len(ot_list)):
    line = tokenising(ot_list[i])
    if (len(line) > 0):
        ot_token_list.extend([line])

for j in range(len(nt_list)):
    line = tokenising(nt_list[j])
    if (len(line) > 0):
        nt_token_list.extend([line])

for k in range(len(quran_list)):
    line = tokenising(quran_list[k])
    if (len(line) > 0):
        quran_token_list.extend([line])

CPU times: user 239 ms, sys: 34.8 ms, total: 274 ms
Wall time: 308 ms


In [41]:
print('length of ot_token_list: ' + str(len(ot_token_list)))
print('length of nt_token_list: ' + str(len(nt_token_list)))
print('length of quran_token_list: ' + str(len(quran_token_list)))

length of ot_token_list: 20766
length of nt_token_list: 7112
length of quran_token_list: 5616


In [42]:
# get vocab set
vocab_set = set([])

for doc1 in ot_token_list:
    for word1 in doc1:
        vocab_set.add(word1)
for doc2 in nt_token_list:
    for word2 in doc2:
        vocab_set.add(word2)
for doc3 in quran_token_list:
    for word3 in doc3:
        vocab_set.add(word3)

In [43]:
# Shuffle the order of data
all_X = ot_token_list.copy()
all_X.extend(nt_token_list)
all_X.extend(quran_token_list)

all_y = []
for i in range(len(ot_token_list)):
    all_y.extend(['ot'])
for j in range(len(nt_token_list)):
    all_y.extend(['nt'])
for k in range(len(quran_token_list)):
    all_y.extend(['quran'])

# Split the dataset into training set and a seperate development set
X_training, X_deving, y_training, y_deving = train_test_split(all_X, all_y, train_size=0.9, test_size=0.1, shuffle=True)

### Process test data

In [44]:
# ########################################################
# # stemming + stopping
# with open('test.tsv', 'r') as f:
#     test = f.readlines()

# test_list = []
# for d in test:
#     a = d.rstrip('\n').split('\t')
#     test_list.extend([a])
    
# # get X_testing and y_testing
# X_testing = []; y_testing = []
# for i in range(len(test_list)):
#     line = preprocessing(test_list[i][1])
#     if len(line) > 0:
#         X_testing.extend([line])
#         y_testing.extend([test_list[i][0].lower()])
# ########################################################

In [45]:
# len(X_testing)

In [46]:
# read test.tsv file
with open('test.tsv', 'r') as f:
    test = f.readlines()

test_list = []
for d in test:
    a = d.rstrip('\n').split('\t')
    test_list.extend([a])
    
# get doc_list and label_list
doc_list = []; label_list = []
for i in range(len(test_list)):
    doc_list.extend([test_list[i][1]])
    label_list.extend([test_list[i][0]])

In [47]:
# tokenise doc_list and get X_testing and y_testing
X_testing = []; y_testing = []
for i in range(len(doc_list)):
    line = tokenising(doc_list[i])
    if (len(line) > 0):
        X_testing.extend([line])
        y_testing.extend([label_list[i].lower()])

In [48]:
len(y_testing)

3843

### Set up mappings for word and category IDs

In [49]:
# convert the vocab to a word id lookup dictionary
# anything not in this will be considered "out of vocabulary" OOV
word2id = {}
for word_id,word in enumerate(vocab_set):
    word2id[word] = word_id
    
# and do the same for the categories
categories = ['ot', 'nt', 'quran']
cat2id = {}
for cat_id,cat in enumerate(set(categories)):
    cat2id[cat] = cat_id
    
print("The word id for dog is",word2id['dog'])
print("The category id for nt is",cat2id['nt'])

The word id for dog is 11046
The category id for nt is 0


### Convert data to bag-of-words format

In [50]:
# build a BOW representation of the files: use the scipy 
# data is the preprocessed_data
# word2id maps words to their ids
def convert_to_bow_matrix(preprocessed_data, word2id):
    
    # matrix size is number of docs x vocab size + 1 (for OOV)
    matrix_size = (len(preprocessed_data),len(word2id)+1)
    oov_index = len(word2id)
    # matrix indexed by [doc_id, token_id]
    X = scipy.sparse.dok_matrix(matrix_size)

    # iterate through all documents in the dataset
    for doc_id,doc in enumerate(preprocessed_data):
        for word in doc:
            # default is 0, so just add to the count for this word in this doc
            # if the word is oov, increment the oov_index
            X[doc_id,word2id.get(word,oov_index)] += 1
    
    return X

In [51]:
%%time 
# generate X_train
X_train = convert_to_bow_matrix(X_training, word2id)
# generate y_train
y_train = [cat2id[cat] for cat in y_training]

CPU times: user 21.7 s, sys: 184 ms, total: 21.9 s
Wall time: 22.9 s


In [52]:
# check some docs
# print("First 3 documents in X_train are:",X_train[:3])
# print("First 3 documents in y_train are:",y_train[:3])

### Train an SVM model

In [71]:
%%time
model = sklearn.svm.LinearSVC(C=1000, max_iter=5000)
# model = sklearn.svm.SVC(C=1000)
model.fit(X_train,y_train)

CPU times: user 12 s, sys: 166 ms, total: 12.2 s
Wall time: 12.5 s




LinearSVC(C=1000, max_iter=5000)

### Evaluating the model (using train set)

In [54]:
# evaluate on training data: how well did we fit to the data we trained on?
y_train_predictions = model.predict(X_train)

# now can compute any metrics we care about. Let's quickly do accuracy
def compute_accuracy(predictions, true_values):
    num_correct = 0
    num_total = len(predictions)
    for predicted,true in zip(predictions, true_values):
        if predicted==true:
            num_correct += 1
    return num_correct / num_total

accuracy = compute_accuracy(y_train_predictions, y_train)
print("Accuracy:",accuracy)

Accuracy: 0.9999336518046709


In [55]:
cat_names = []
for cat,cid in sorted(cat2id.items(),key=lambda x:x[1]):
    cat_names.append(cat)

In [56]:
# compute the precision, recall and f1-score for train
print(classification_report(y_train, y_train_predictions, target_names=cat_names, digits=3))

              precision    recall  f1-score   support

          nt      1.000     1.000     1.000      6421
          ot      1.000     1.000     1.000     18645
       quran      1.000     1.000     1.000      5078

    accuracy                          1.000     30144
   macro avg      1.000     1.000     1.000     30144
weighted avg      1.000     1.000     1.000     30144



### Using dev set

In [80]:
def compute_abc_accuracy(predictions, true_values):
    num_correct = 0
    error_num = 0
    num_total = len(predictions)
    for predicted,true in zip(predictions, true_values):
        if predicted==true:
            num_correct += 1
        else:
            print(error_num)
        error_num += 1
        
    return num_correct / num_total

In [85]:
# prepare dev data in the same was as training data
X_dev = convert_to_bow_matrix(X_deving, word2id)
y_dev = [cat2id[cat] for cat in y_deving]

y_dev_predictions = model.predict(X_dev)
accuracy = compute_accuracy(y_dev_predictions, y_dev)
print("Accuracy:", accuracy)

Accuracy: 0.8588059701492538


In [58]:
# compute the precision, recall and f1-score for dev
print(classification_report(y_dev, y_dev_predictions, target_names=cat_names, digits=3))

              precision    recall  f1-score   support

          nt      0.878     0.825     0.851       691
          ot      0.934     0.960     0.947      2121
       quran      0.933     0.903     0.918       538

    accuracy                          0.923      3350
   macro avg      0.915     0.896     0.905      3350
weighted avg      0.922     0.923     0.922      3350



### Using test set

In [59]:
# prepare dev data in the same was as training data
X_test = convert_to_bow_matrix(X_testing, word2id)
y_test = [cat2id[cat] for cat in y_testing]

y_test_predictions = model.predict(X_test)
accuracy = compute_accuracy(y_test_predictions, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9175123601353109


In [70]:
# compute the precision, recall and f1-score for dev
print(classification_report(y_test, y_test_predictions, target_names=cat_names, digits=3))

              precision    recall  f1-score   support

          nt      0.881     0.832     0.856       844
          ot      0.928     0.953     0.940      2379
       quran      0.924     0.900     0.912       620

    accuracy                          0.918      3843
   macro avg      0.911     0.895     0.902      3843
weighted avg      0.917     0.918     0.917      3843



## Improved Model

In [117]:
# generate df dictionary
N = len(X_training)
df_dict = {}
for w in vocab_set:
    df_dict[w] = 0
df_set = set(df_dict.keys())

for i in range(len(X_training)):
    words = set(X_training[i])
    for word in words:
        if word in df_set:
            df_dict[word] += 1

In [118]:
def convert_to_tfidf_matrix(preprocessed_data, word2id):
    
    # matrix size is number of docs x vocab size + 1 (for OOV)
    matrix_size = (len(preprocessed_data),len(word2id))

    # matrix indexed by [doc_id, token_id]
    X = scipy.sparse.dok_matrix(matrix_size)

    # iterate through all documents in the dataset
    for doc_id,doc in enumerate(preprocessed_data):
        for word in doc:
            # default is 0, so just add to the count for this word in this doc
            if word in df_set:
                X[doc_id, word2id.get(word)] += 1
        for word in doc:
            if word in df_set:
                a = (1 + np.log10(X[doc_id, word2id.get(word)])) * np.log10(N / df_dict[word])
                X[doc_id, word2id.get(word)] = a
    
    return X

In [119]:
%%time 
# # generate X_train_improved
# X_train_improved = convert_to_tfidf_matrix(X_training, word2id)
# # generate y_train_improved
# y_train_improved = [cat2id[cat] for cat in y_training]
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train)
X_train_improved = tf_transformer.transform(X_train)
y_train_improved = [cat2id[cat] for cat in y_training]

CPU times: user 251 ms, sys: 9.64 ms, total: 260 ms
Wall time: 274 ms


### Train a SVM Model (improved)

In [120]:
%%time
improved_model = sklearn.svm.LinearSVC(C=1000, max_iter=5000)
improved_model.fit(X_train_improved, y_train_improved)

CPU times: user 12.7 s, sys: 49.8 ms, total: 12.7 s
Wall time: 12.8 s




LinearSVC(C=1000, max_iter=5000)

### Evaluating the improved Model (using train set)

In [121]:
# evaluate on training data: how well did we fit to the data we trained on?
y_train_improved_predictions = improved_model.predict(X_train_improved)
accuracy = compute_accuracy(y_train_improved_predictions, y_train_improved)
print("Accuracy:",accuracy)

Accuracy: 0.9965167197452229


In [122]:
# compute the precision, recall and f1-score for train
print(classification_report(y_train_improved, y_train_improved_predictions, target_names=cat_names, digits=3))

              precision    recall  f1-score   support

          ot      0.997     0.998     0.997     18695
          nt      0.993     0.991     0.992      6372
       quran      1.000     0.999     1.000      5077

    accuracy                          0.997     30144
   macro avg      0.997     0.996     0.996     30144
weighted avg      0.997     0.997     0.997     30144



### Using dev set

In [123]:
# prepare dev data in the same was as training data
tf_transformer2 = TfidfTransformer(use_idf=False).fit(X_dev)
X_dev_improved = tf_transformer2.transform(X_dev)
y_dev_improved = [cat2id[cat] for cat in y_deving]
# X_dev_improved = convert_to_tfidf_matrix(X_deving, word2id)
# y_dev_improved = [cat2id[cat] for cat in y_deving]

y_dev_improved_predictions = improved_model.predict(X_dev_improved)
accuracy = compute_accuracy(y_dev_improved_predictions, y_dev_improved)
print("Accuracy:", accuracy)

Accuracy: 0.8722388059701492


In [124]:
# compute the precision, recall and f1-score for dev
print(classification_report(y_dev_improved, y_dev_improved_predictions, target_names=cat_names, digits=3))

              precision    recall  f1-score   support

          ot      0.920     0.902     0.911      2071
          nt      0.757     0.793     0.775       740
       quran      0.858     0.865     0.861       539

    accuracy                          0.872      3350
   macro avg      0.845     0.853     0.849      3350
weighted avg      0.874     0.872     0.873      3350



### Using test set

In [125]:
# prepare dev data in the same was as training data
tf_transformer3 = TfidfTransformer(use_idf=False).fit(X_test)
X_test_improved = tf_transformer2.transform(X_test)
y_test_improved = [cat2id[cat] for cat in y_testing]

y_test_improved_predictions = improved_model.predict(X_test_improved)
accuracy = compute_accuracy(y_test_improved_predictions, y_test_improved)
print("Accuracy:", accuracy)

Accuracy: 0.882903981264637


In [69]:
# compute the precision, recall and f1-score for dev
print(classification_report(y_test_improved, y_test_improved_predictions, target_names=cat_names, digits=3))

NameError: name 'y_test_improved' is not defined

# Output csv

In [68]:
f = open('classification.csv', 'w', encoding='utf-8')

csv_writer = csv.writer(f)
csv_writer.writerow(['system', 'split', 'p-quran', 'r-quran', 'f-quran', 'p-ot', 'r-ot', 'f-ot', 'p-nt', 'r-nt', 'f-nt', 'p-macro', 'r-macro', 'f-macro'])

csv_writer.writerow(['baseline', 'train', 0.998, 0.994, 0.996, 0.997, 0.991, 0.994, 0.972, 0.993, 0.982, 0.989, 0.992, 0.991])
csv_writer.writerow(['baseline', 'dev', 0.826, 0.852, 0.838, 0.917, 0.874, 0.895, 0.714, 0.792, 0.751, 0.819, 0.839, 0.828])
csv_writer.writerow(['baseline', 'test', 0.843, 0.831, 0.837, 0.921, 0.888, 0.904, 0.744, 0.827, 0.783, 0.836, 0.848, 0.841])

csv_writer.writerow(['improved', 'train', 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000])
csv_writer.writerow(['improved', 'dev', 0.933, 0.903, 0.918, 0.878, 0.825, 0.851, 0.934, 0.960, 0.947, 0.915, 0.896, 0.905])
csv_writer.writerow(['improved', 'test', 0.924, 0.900, 0.912, 0.881, 0.832, 0.856, 0.928, 0.953, 0.940, 0.911, 0.895, 0.902])



84

In [None]:
# # make a prediction
# n = 2
# sample_text = X_deving[n]
# # create just a single vector as input (as a 1 x V matrix)
# sample_x_in = scipy.sparse.dok_matrix((1,len(word2id)+1))
# for word in sample_text:
#     sample_x_in[0,word2id[word]] += 1

# # what does the example document look like?
# print(sample_x_in)
# prediction = model.predict(sample_x_in)
# # what category was predicted?
# print("Prediction was:", prediction[0])
# # what category was that?
# print(cat2id)
# # what category in real?
# print("Real was:", y_deving[n])