In [3]:
import re
import matplotlib.pyplot as plt
import numpy as np
import math
from nltk.stem import PorterStemmer
from collections import Counter, defaultdict
import time
from itertools import chain
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import sklearn
from sklearn import ensemble
from sklearn.metrics import classification_report
import scipy
from scipy import sparse
import csv
from statistics import mean
from sklearn.model_selection import train_test_split

## Evaluation of systems

In [41]:
class EVAL:
    
    def __init__(self,results,rels):
        f = open(results)
        data = csv.reader(f)
        self.data_results = [line for line in data][1:]

        f = open(rels)
        data = csv.reader(f)
        relevance = [line for line in data][1:]
        self.rel = defaultdict(dict)
        for query,doc,relev in relevance:
            self.rel[query][doc] = relev
   
        self.sys_1 = defaultdict(dict)
        self.sys_2 = defaultdict(dict)
        self.sys_3 = defaultdict(dict)
        self.sys_4 = defaultdict(dict)
        self.sys_5 = defaultdict(dict)
        self.sys_6 = defaultdict(dict)
            
        for sys,query,docnum,rank,score in self.data_results:
            if sys == '1':
                self.sys_1[query][docnum] = (self.rel[query].get(docnum,0),rank,float(score))
            elif sys == '2':
                self.sys_2[query][docnum] = (self.rel[query].get(docnum,0),rank,float(score))
            elif sys == '3':
                self.sys_3[query][docnum] = (self.rel[query].get(docnum,0),rank,float(score))
            elif sys == '4':
                self.sys_4[query][docnum] = (self.rel[query].get(docnum,0),rank,float(score))
            elif sys == '5':
                self.sys_5[query][docnum] = (self.rel[query].get(docnum,0),rank,float(score))
            elif sys == '6':
                self.sys_6[query][docnum] = (self.rel[query].get(docnum,0),rank,float(score))
        
        self.nDCG1 = self.nDCG(self.sys_1,self.rel,20)
        self.nDCG2 = self.nDCG(self.sys_2,self.rel,20)
        self.nDCG3 = self.nDCG(self.sys_3,self.rel,20)
        self.nDCG4 = self.nDCG(self.sys_4,self.rel,20)
        self.nDCG5 = self.nDCG(self.sys_5,self.rel,20)
        self.nDCG6 = self.nDCG(self.sys_6,self.rel,20)

    def prec(self,system,cutoff):
        sys = []
        for i in range(1,11):
            results = []
            for elem in list(system[str(i)].items())[:cutoff]:
                results.append(elem)

            r = 0
            for doc, scores in results:
                if scores[0] != 0:
                    r += 1

            sys.append((i,(round(r/cutoff,3))))

        return sys
    
    def recall(self,system,cutoff,rel):
        num_reldocs = {query:len(docs) for query,docs in rel.items()}
        sys = []
        for i in range(1,11):
            results = []
            for elem in list(system[str(i)].items())[:cutoff]:
                results.append(elem)

            r = 0
            for doc, scores in results:
                if scores[0] != 0:
                    r += 1

            sys.append(((i,round(r/num_reldocs[str(i)],3))))

        return sys
        
    def rprec(self,system,rel):
        num_reldocs = {query:len(docs) for query,docs in rel.items()}
        sys = []
        for i in range(1,11):
            cutoff = num_reldocs[str(i)]
            results = []
            for elem in list(system[str(i)].items())[:cutoff]:
                results.append(elem)

            r = 0
            for doc, scores in results:
                if scores[0] != 0:
                    r += 1

            sys.append(((i,round(r/cutoff,3))))

        return sys

    def AP(self,system,rel):
        num_reldocs = {query:len(docs) for query,docs in rel.items()}
        sys = []
        for i in range(1,11):
            results = []
            r_count = 0
            for idx, elem in enumerate(list(system[str(i)].items()),1):
                if elem[1][0] != 0:
                    r_count += 1 
                    results.append(r_count/idx)

            sys.append(((i,round(sum(results)/num_reldocs[str(i)],3))))

        return sys
    
    def nDCG(self,system,rel,cutoff):
        sys = []
        for i in range(1,11):
            iG = list(rel[str(i)].values()) # Get rel docs for query i
            iG += [str(0)] * (500 - len(iG)) # Assign 0 for docs without relevance
            DG = int(list(system[str(i)].values())[0][0]) # rel(1)
            iDG = int(iG[0]) # rel(1) iDG
            DG_list = []
            iDG_list = []
            nDCG_list = []
            DG_list.append((0,DG))
            iDG_list.append((0,iDG))
            for idx, elem in enumerate(list(system[str(i)].items())[1:cutoff],2):
                DG += int(elem[1][0])/math.log(idx,2)
                iDG += int(iG[idx-1])/math.log(idx,2)
                nDCG_list.append(round(DG/iDG,3))

            sys.append(((i,nDCG_list[-1])))
    
        return sys

    def write_res(self,filepath):
        sys = [self.sys_1,self.sys_2,self.sys_3,self.sys_4,self.sys_5,self.sys_6]
        f = open(filepath, 'w')
        f.write("system_number,query_number,P@10,R@50,r-precision,AP,nDCG@10,nDCG@20\n")
        for i in range(0,6):
            m_prec, m_rec, m_rprec, m_AP, m_nDCG10, m_nDCG20 = [], [], [], [], [], []
            for query in range(0,10):
                m_prec.append(self.prec(sys[i],10)[query][1])
                m_rec.append(self.recall(sys[i],50,self.rel)[query][1])
                m_rprec.append(self.rprec(sys[i],self.rel)[query][1])
                m_AP.append(self.AP(sys[i],self.rel)[query][1])
                m_nDCG10.append(self.nDCG(sys[i],self.rel,10)[query][1])
                m_nDCG20.append(self.nDCG(sys[i],self.rel,20)[query][1])
                
                f.write(str(i+1)+','+str(query+1)+','+str(m_prec[query])+','+
                        str(m_rec[query])+','+str(m_rprec[query])+
                        ','+str(m_AP[query])+','+str(m_nDCG10[query])+
                        ','+str(m_nDCG20[query])+'\n')
            
            f.write(str(i+1)+','+'mean'+','+str(round(mean(m_prec),3))+','+str(round(mean(m_rec),3))+
                    ','+str(round(mean(m_rprec),3))+','+str(round(mean(m_AP),3))+','+str(round(mean(m_nDCG10),3))+
            ','+str(round(mean(m_nDCG20),3))+'\n')

In [42]:
test = EVAL('system_results.csv', 'qrels.csv')
test.write_res('test_results.csv')

In [43]:
from scipy import stats

In [44]:
P1.mean()

0.36279999999999996

In [45]:
P1 = [i[1] for i in test.nDCG1]
P2 = [i[1] for i in test.nDCG2]
P3 = [i[1] for i in test.nDCG3]
P4 = [i[1] for i in test.nDCG4]
P5 = [i[1] for i in test.nDCG5]
P6 = [i[1] for i in test.nDCG6]

P1 = np.array(P1)
P2 = np.array(P2)
P3 = np.array(P3)
P4 = np.array(P4)
P5 = np.array(P5)
P6 = np.array(P6)

stats.ttest_ind(P3, P6)

Ttest_indResult(statistic=0.16879803537044438, pvalue=0.8678378460308467)

## Text analysis

In [14]:
!cat train_and_dev.tsv | head -10

OT	In the beginning God created the heavens and the earth.
OT	The earth was without form, and void; and darkness was on the face of the deep. And the Spirit of God was hovering over the face of the waters.
OT	Then God said, "Let there be light"; and there was light.
OT	And God saw the light, that it was good; and God divided the light from the darkness.
OT	God called the light Day, and the darkness He called Night. So the evening and the morning were the first day.
OT	Thus God made the firmament, and divided the waters which were under the firmament from the waters which were above the firmament; and it was so.
OT	And God called the firmament Heaven. So the evening and the morning were the second day.
OT	Then God said, "Let the waters under the heavens be gathered together into one place, and let the dry land appear"; and it was so.
OT	And God called the dry land Earth, and the gathering together of the waters He called Seas. And God saw that it was good.
OT	Then God said, "Le

In [15]:
f = open('train_and_dev.tsv', 'r')
data = f.read()

f = open('englishST.txt', 'r')
STwords = [word.rstrip() for word in f.readlines()]

In [16]:
ps = PorterStemmer()

In [17]:
OT = re.findall(r'OT\t(.+)', data)
OT = [re.findall(r'\w+', verse) for verse in OT]
OT_pro = {ID:[ps.stem(word) for word in verse if word.lower() not in STwords] for ID,verse in enumerate(OT)}

NT = re.findall(r'NT\t(.+)', data)
NT = [re.findall(r'\w+', verse) for verse in NT]
NT_pro = {ID+len(OT):[ps.stem(word) for word in verse if word.lower() not in STwords] for ID,verse in enumerate(NT)}

quran = re.findall(r'Quran\t(.+)', data)
quran = [re.findall(r'\w+', verse) for verse in quran]
quran_pro = {ID+len(OT)+len(NT):[ps.stem(word) for word in verse if word.lower() not in STwords] for ID,verse in enumerate(quran)}

vocab = set(list(chain.from_iterable(list(OT_pro.values()) + list(NT_pro.values()) + list(quran_pro.values()))))
labels = re.findall(r'(OT|NT|Quran)\t', data)

In [18]:
def inv_index(OT_pro, NT_pro, quran_pro):
    inv_index = defaultdict(list)
    combined = {docid:verse for docid,verse in enumerate(list(OT_pro.values())+list(NT_pro.values())+list(quran_pro.values()))}
    for word in vocab:
        for docid,verse in combined.items():
            if word in verse:
                if word in inv_index:     
                    if docid in OT_pro.keys():
                        inv_index[word][0][1] += 1

                    if docid in NT_pro.keys():
                        inv_index[word][0][2] += 1

                    if docid in quran_pro.keys():
                        inv_index[word][0][3] += 1
                else:
                    inv_index[word].append({})
                    inv_index[word][0][1] = 0
                    inv_index[word][0][2] = 0
                    inv_index[word][0][3] = 0
                    if docid in OT_pro.keys():
                        inv_index[word][0][1] = 1
                    if docid in NT_pro.keys():
                        inv_index[word][0][2] = 1
                    if docid in quran_pro.keys():
                        inv_index[word][0][3] = 1
    
    return inv_index

In [19]:
start = time.time()
inv_index = inv_index(OT_pro,NT_pro,quran_pro)
end = time.time()
print(end-start)

51.04818820953369


In [20]:
def calculate_nterms(inv_index):
    
    N11_1, N11_2, N11_3 = defaultdict(int),defaultdict(int),defaultdict(int)
    N10_1, N10_2, N10_3 = defaultdict(int),defaultdict(int),defaultdict(int)
    N01_1, N01_2, N01_3 = defaultdict(int),defaultdict(int),defaultdict(int)
    N00_1, N00_2, N00_3 = defaultdict(int),defaultdict(int),defaultdict(int)
    
    for word,val in inv_index.items():
        N11_1[word] = val[0][1]
        N01_1[word] = len(OT_pro) - val[0][1]
        N10_1[word] = val[0][2] + val[0][3]
        N00_1[word] = len(NT_pro)+len(quran_pro) - (val[0][2] + val[0][3])
        
        N11_2[word] = val[0][2]
        N01_2[word] = len(NT) - val[0][2]
        N10_2[word] = val[0][1] + val[0][3]
        N00_2[word] = len(OT_pro)+len(quran_pro) - (val[0][1] + val[0][3])
        
        N11_3[word] = val[0][3]
        N01_3[word] = len(quran) - val[0][3]
        N10_3[word] = val[0][1] + val[0][2]
        N00_3[word] = len(OT_pro)+len(NT_pro) - (val[0][1] + val[0][2])
    
    return N11_1,N01_1,N10_1,N00_1, N11_2,N01_2,N10_2,N00_2, N11_3,N01_3,N10_3,N00_3

In [21]:
def MI(N,N11,N01,N10,N00,vocab):
    
        N1X = {key:N11.get(key,0)+N10.get(key,0) for key in set(N11)|set(N10)}
        NX1 = {key:N11.get(key,0)+N01.get(key,0) for key in set(N11)|set(N01)}
        N0X = {key:N01.get(key,0)+N00.get(key,0) for key in set(N01)|set(N00)}
        NX0 = {key:N10.get(key,0)+N00.get(key,0) for key in set(N10)|set(N00)}
        
        MI = {}
    
        for key in vocab:
            try:
                first = (N11.get(key,0)/N)*math.log((N*N11.get(key,0))/(N1X.get(key,0)*NX1.get(key,0)),2)
            except (ValueError,ZeroDivisionError):
                first = 0
            try:
                second = (N01.get(key,0)/N)*math.log((N*N01.get(key,0))/(N0X.get(key,0)*NX1.get(key,0)),2)
            except (ValueError,ZeroDivisionError):
                second = 0
            try:
                third = (N10.get(key,0)/N)*math.log((N*N10.get(key,0))/(N1X.get(key,0)*NX0.get(key,0)),2)
            except (ValueError,ZeroDivisionError):
                third = 0
            try:
                fourth = (N00.get(key,0)/N)*math.log((N*N00.get(key,0))/(N0X.get(key,0)*NX0.get(key,0)),2)
            except (ValueError,ZeroDivisionError):
                fourth = 0

            MI[key] = first+second+third+fourth
    
        return {k: v for k, v in sorted(MI.items(), key=lambda item: item[1], reverse=True)}

In [22]:
def Chi(N11,N01,N10,N00,vocab):
    
    Chi = {}
    
    for key in vocab:
        try:
            Chi[key] = (((N11.get(key,0)+N10.get(key,0)+N01.get(key,0)+N00.get(key,0)) * 
                                     (N11.get(key,0)*N00.get(key,0)-N10.get(key,0)*N01.get(key,0))**2) / 
                                ((N11.get(key,0)+N01.get(key,0)) * (N11.get(key,0)+N10.get(key,0)) * 
                                 (N10.get(key,0)+N00.get(key,0)) * (N01.get(key,0)+N00.get(key,0))))
        except ZeroDivisionError:
            Chi[key] = 0
    
    return {k: v for k, v in sorted(Chi.items(), key=lambda item: item[1], reverse=True)}

In [23]:
start = time.time()
N = len(OT+NT+quran)
N11_1,N01_1,N10_1,N00_1, N11_2,N01_2,N10_2,N00_2, N11_3,N01_3,N10_3,N00_3 = calculate_nterms(inv_index)
MI_OT = MI(N,N11_1,N01_1,N10_1,N00_1,vocab)
Chi_OT = Chi(N11_1,N01_1,N10_1,N00_1,vocab)

MI_NT = MI(N,N11_2,N01_2,N10_2,N00_2,vocab)
Chi_NT = Chi(N11_2,N01_2,N10_2,N00_2,vocab)

MI_quran = MI(N,N11_3,N01_3,N10_3,N00_3,vocab)
Chi_quran = Chi(N11_3,N01_3,N10_3,N00_3,vocab)

end = time.time()
print(end-start)

0.24220919609069824


In [None]:
MI_OT

In [None]:
MI_NT

In [None]:
MI_quran

In [None]:
Chi_OT

In [None]:
Chi_NT

In [None]:
Chi_quran

## Topic analysis

In [30]:
def topic_analysis(corpus1,corpus2,corpus3):
    
    combined_corpus = list(list(corpus1.values())+list(corpus2.values()) + list(corpus3.values()))
    common_texts = [list(x) for x in combined_corpus]
    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    
    c1 = [common_dictionary.doc2bow(text) for text in corpus1.values()]
    c2 = [common_dictionary.doc2bow(text) for text in corpus2.values()]
    c3 = [common_dictionary.doc2bow(text) for text in corpus3.values()]

    num_topics = 20

    lda = LdaModel(common_corpus, num_topics=num_topics, id2word=common_dictionary, random_state=25, iterations=50000)
    
    topics_scores1 = defaultdict(float)
    topics_scores2 = defaultdict(float)
    topics_scores3 = defaultdict(float)

    for doc in c1:
        for docid, prob in lda.get_document_topics(doc):
            topics_scores1[docid] += prob
        
    for doc in c2:
        for docid, prob in lda.get_document_topics(doc):
            topics_scores2[docid] += prob
            
    for doc in c3:
        for docid, prob in lda.get_document_topics(doc):
            topics_scores3[docid] += prob
    
    topics_scores1 = {key:val/len(c1) for key,val in topics_scores1.items()}
    max_c1 = {key:val for key,val in topics_scores1.items() if val == max(topics_scores1.values())}
    top_c1 = lda.print_topic(list(max_c1.keys())[0])
    print(f'Top topic for corpus 1: Topic {list(max_c1.keys())[0]}')

    topics_scores2 = {key:val/len(c2) for key,val in topics_scores2.items()}
    max_c2 = {key:val for key,val in topics_scores2.items() if val == max(topics_scores2.values())}
    top_c2 = lda.print_topic(list(max_c2.keys())[0])
    print(f'Top topic for corpus 2: Topic {list(max_c2.keys())[0]}')
    
    topics_scores3 = {key:val/len(c3) for key,val in topics_scores3.items()}
    max_c3 = {key:val for key,val in topics_scores3.items() if val == max(topics_scores3.values())}
    top_c3 = lda.print_topic(list(max_c3.keys())[0])
    print(f'Top topic for corpus 3: Topic {list(max_c3.keys())[0]}')
    
    print(lda.print_topic(17))
    
    
    lda_display = gensimvis.prepare(lda, common_corpus, common_dictionary)

    return lda_display, top_c1, top_c2, top_c3

In [None]:
disp,t1,t2,t3 = topic_analysis(OT_pro,NT_pro,quran_pro)

In [32]:
t1

'0.129*"god" + 0.103*"believ" + 0.052*"lord" + 0.047*"king" + 0.044*"command" + 0.041*"judgment" + 0.039*"peopl" + 0.034*"law" + 0.033*"mose" + 0.031*"day"'

In [33]:
t2

'0.122*"god" + 0.058*"lord" + 0.037*"heart" + 0.034*"peopl" + 0.033*"forgiv" + 0.032*"word" + 0.030*"thing" + 0.029*"reward" + 0.024*"power" + 0.021*"hear"'

In [34]:
t3

'0.122*"god" + 0.058*"lord" + 0.037*"heart" + 0.034*"peopl" + 0.033*"forgiv" + 0.032*"word" + 0.030*"thing" + 0.029*"reward" + 0.024*"power" + 0.021*"hear"'

In [35]:
pyLDAvis.display(disp)

# Text classification

In [36]:
np.random.seed(50)

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
combined = OT + NT + quran
labels = re.findall(r'(OT|NT|Quran)\t', data)

In [39]:
combined = OT + NT + quran
labels = re.findall(r'(OT|NT|Quran)\t', data)
vocab = set(list(chain.from_iterable(combined)))

### Create BOW features

In [41]:
def map2id(vocab, classes):
    class2id = {c:ID for ID,c in enumerate(set(classes))}
    word2id = {word:ID for ID,word in enumerate(vocab)}
    
    return class2id, word2id

In [42]:
def BOW_matrix(data, word2id):
    
    matrix_size = (len(data),len(word2id)+1)
    # OOV = out of vocabulary, for words out of vocabulary.
    oov_index = len(word2id)
    
    X = scipy.sparse.dok_matrix(matrix_size)
    print(f'X matrix is of shape:{X.shape}')
    
    for docid, doc in enumerate(data):
        for word in doc:
            X[docid, word2id.get(word,oov_index)] +=1
    
    return X

In [43]:
def train_model(X_t,y_t,w2id,c2id):
    X_t = BOW_matrix(X_t, w2id)
    y_t = [c2id[c] for c in y_t]
    model = sklearn.svm.LinearSVC(C=1000, random_state=25)
    model.fit(X_t,y_t)
    return model

In [44]:
class2id, word2id = map2id(vocab,labels)
print(f'Number of features: {len(vocab)}')

Number of features: 15441


### Shuffle and split

In [45]:
X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.1, random_state=25)
f = open('test.tsv')
test = f.read()
X_test = re.findall(r'(?:OT\t|NT\t|Quran\t)(.+)', test)
X_test = [re.findall(r'\w+', verse) for verse in X_test]
y_test = re.findall(r'(OT|NT|Quran)\t', test)

In [46]:
start = time.time()
model = train_model(X_train, y_train, word2id, class2id)
end = time.time()
print(end-start)

X matrix is of shape:(30144, 15442)
14.000240802764893




### Evaluate on validation

In [47]:
def evaluate_model(X_t,y_t,X_v,y_v,X_test,y_test,w2id,c2id,model):
    X_t = BOW_matrix(X_t, w2id)
    y_t = [c2id[c] for c in y_t]
    y_t_predictions = model.predict(X_t)
    
    X_v = BOW_matrix(X_v, w2id)
    y_v = [c2id[c] for c in y_v]
    y_v_predictions = model.predict(X_v)
    
    X_test = BOW_matrix(X_test, w2id)
    y_test = [c2id[c] for c in y_test]
    y_test_predictions = model.predict(X_test)
    
    class_names = []
    for c,cid in sorted(c2id.items(),key=lambda x:x[1]):
        class_names.append(c)
    return classification_report(y_t, y_t_predictions, target_names=class_names, output_dict=False), classification_report(y_v, y_v_predictions, target_names=class_names, output_dict=False), classification_report(y_test, y_test_predictions, target_names=class_names, output_dict=False)


In [48]:
report_train, report_val, report_test = evaluate_model(X_train,y_train,X_val,y_val,X_test,y_test,word2id,class2id,model)

X matrix is of shape:(30144, 15442)
X matrix is of shape:(3350, 15442)
X matrix is of shape:(3843, 15442)


In [49]:
print(report_train)

              precision    recall  f1-score   support

       Quran       1.00      1.00      1.00      5034
          OT       1.00      1.00      1.00     18676
          NT       1.00      1.00      1.00      6434

    accuracy                           1.00     30144
   macro avg       1.00      1.00      1.00     30144
weighted avg       1.00      1.00      1.00     30144



In [50]:
print(report_val)

              precision    recall  f1-score   support

       Quran       0.90      0.90      0.90       582
          OT       0.94      0.91      0.92      2090
          NT       0.75      0.82      0.79       678

    accuracy                           0.89      3350
   macro avg       0.86      0.88      0.87      3350
weighted avg       0.89      0.89      0.89      3350



In [51]:
print(report_test)

              precision    recall  f1-score   support

       Quran       0.88      0.90      0.89       620
          OT       0.94      0.91      0.93      2379
          NT       0.79      0.85      0.82       844

    accuracy                           0.90      3843
   macro avg       0.87      0.89      0.88      3843
weighted avg       0.90      0.90      0.90      3843



## Change classifier parameters

### Change to linear classifier

In [99]:
def train_model(X_t,y_t,w2id,c2id):
    X_t = BOW_matrix(X_t, w2id)
    y_t = [c2id[c] for c in y_t]
    model = sklearn.svm.LinearSVC(C=1000, random_state=25)
    model.fit(X_t,y_t)
    return model

In [100]:
X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.1, random_state=25)
f = open('test.tsv')
test = f.read()
X_test = re.findall(r'(?:OT\t|NT\t|Quran\t)(.+)', test)
X_test = [re.findall(r'\w+', verse) for verse in X_test]
y_test = re.findall(r'(OT|NT|Quran)\t', test)

In [101]:
model = train_model(X_train, y_train, word2id, class2id)
report_train,report_val, report_test = evaluate_model(X_train,y_train,X_val,y_val,X_test,y_test,word2id,class2id,model)
print(report_train)

X matrix is of shape:(30144, 15442)




X matrix is of shape:(30144, 15442)
X matrix is of shape:(3350, 15442)
X matrix is of shape:(3843, 15442)
              precision    recall  f1-score   support

          OT       0.99      1.00      1.00     18673
       Quran       1.00      1.00      1.00      5042
          NT       0.99      0.98      0.99      6429

    accuracy                           0.99     30144
   macro avg       0.99      0.99      0.99     30144
weighted avg       0.99      0.99      0.99     30144



In [102]:
print(report_val)

              precision    recall  f1-score   support

          OT       0.95      0.95      0.95      2093
       Quran       0.92      0.94      0.93       574
          NT       0.84      0.83      0.84       683

    accuracy                           0.92      3350
   macro avg       0.90      0.91      0.90      3350
weighted avg       0.92      0.92      0.92      3350



In [103]:
print(report_test)

              precision    recall  f1-score   support

          OT       0.95      0.95      0.95      2379
       Quran       0.94      0.95      0.94       620
          NT       0.86      0.86      0.86       844

    accuracy                           0.93      3843
   macro avg       0.92      0.92      0.92      3843
weighted avg       0.93      0.93      0.93      3843



### Change regularization value

In [232]:
def train_model(X_t,y_t,w2id,c2id):
    X_t = BOW_matrix(X_t, w2id)
    y_t = [c2id[c] for c in y_t]
    model = sklearn.svm.LinearSVC(C=0.05, random_state=50, max_iter=300)
    model.fit(X_t,y_t)
    return model

In [233]:
X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.1, random_state=50)
f = open('test.tsv')
test = f.read()
X_test = re.findall(r'(?:OT\t|NT\t|Quran\t)(.+)', test)
X_test = [re.findall(r'\w+', verse) for verse in X_test]
y_test = re.findall(r'(OT|NT|Quran)\t', test)

In [234]:
model = train_model(X_train, y_train, word2id, class2id)
report_val, report_test = evaluate_model(X_val,y_val,X_test,y_test,word2id,class2id,model)
print(report_val)

X matrix is of shape:(30144, 15442)
X matrix is of shape:(3350, 15442)
X matrix is of shape:(3843, 15442)
              precision    recall  f1-score   support

       Quran       0.94      0.93      0.94       574
          OT       0.94      0.97      0.95      2093
          NT       0.90      0.81      0.85       683

    accuracy                           0.93      3350
   macro avg       0.92      0.90      0.91      3350
weighted avg       0.93      0.93      0.93      3350



In [235]:
print(report_test)

              precision    recall  f1-score   support

       Quran       0.94      0.93      0.94       620
          OT       0.94      0.96      0.95      2379
          NT       0.90      0.84      0.87       844

    accuracy                           0.93      3843
   macro avg       0.93      0.91      0.92      3843
weighted avg       0.93      0.93      0.93      3843



### Change classifier 

In [274]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [275]:
def train_model(X_t,y_t,w2id,c2id):
    X_t = BOW_matrix(X_t, w2id)
    y_t = [c2id[c] for c in y_t]
    model = LogisticRegression(random_state=50, max_iter=100).fit(X_t, y_t)
    model.fit(X_t,y_t)
    return model

In [278]:
X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.1, random_state=50)
f = open('test.tsv')
test = f.read()
X_test = re.findall(r'(?:OT\t|NT\t|Quran\t)(.+)', test)
X_test = [re.findall(r'\w+', verse) for verse in X_test]
y_test = re.findall(r'(OT|NT|Quran)\t', test)

In [None]:
model = train_model(X_train, y_train, word2id, class2id)
report_val, report_test = evaluate_model(X_val,y_val,X_test,y_test,word2id,class2id,model)
print(report_val)

In [249]:
print(report_test)

              precision    recall  f1-score   support

       Quran       0.93      0.94      0.94       620
          OT       0.94      0.95      0.95      2379
          NT       0.88      0.85      0.86       844

    accuracy                           0.93      3843
   macro avg       0.92      0.91      0.92      3843
weighted avg       0.93      0.93      0.93      3843



## tfidf

In [52]:
def synon(word):
    list_syn = []
    try:
        syn = wordnet.synsets(word)[0]
    except:
        return word
    try:
        return re.sub(r'_','',syn.lemma_names()[1])
    except:
        return word

In [53]:
def replace_syn(sentence):
    new_sentence = []
    for i in sentence:
        new_sentence.append(synon(i))
    return new_sentence

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from nltk.stem import WordNetLemmatizer
lm = WordNetLemmatizer()

In [55]:
NT_syn = list(map(replace_syn,NT))

In [56]:
combined = OT + NT + quran + NT_syn
labels = re.findall(r'(OT|NT|Quran)\t', data) + ['NT' for i in range(len(NT_syn))]
labels = labels 
vocab = set(list(chain.from_iterable(combined)))

In [57]:
class2id = {c:ID for ID,c in enumerate(set(labels))}

In [58]:
X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.1, random_state=25)
f = open('test.tsv')
test = f.read()
X_test = re.findall(r'(?:OT\t|NT\t|Quran\t)(.+)', test)
X_test = [re.findall(r'\w+', verse) for verse in X_test]
y_test = re.findall(r'(OT|NT|Quran)\t', test)

In [59]:
X_train = [' '.join(verse) for verse in X_train]
X_val = [' '.join(verse) for verse in X_val]
X_test = [' '.join(verse) for verse in X_test]

y_train = [class2id[c] for c in y_train]
y_val = [class2id[c] for c in y_val]
y_test = [class2id[c] for c in y_test]

In [60]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X_train = vectorizer.fit_transform(X_train)
dic_vocabulary = vectorizer.vocabulary_

X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)


X_train = normalize(X_train, norm='l2', axis=1)
X_val = normalize(X_val, norm='l2', axis=1)
X_test = normalize(X_test, norm='l2', axis=1)

In [61]:
start = time.time()
model = sklearn.svm.LinearSVC(C=1, random_state=25)
model.fit(X_train,y_train)
end = time.time()
print(end-start)

0.7484133243560791


In [62]:
X_train

<36545x171569 sparse matrix of type '<class 'numpy.float64'>'
	with 1506346 stored elements in Compressed Sparse Row format>

In [63]:
class_names = []
for c,cid in sorted(class2id.items(),key=lambda x:x[1]):
    class_names.append(c)

y_train_predictions = model.predict(X_train)
y_val_predictions = model.predict(X_val)
y_test_predictions = model.predict(X_test)

report_train = classification_report(y_train, y_train_predictions, target_names=class_names, output_dict=False)
report_val = classification_report(y_val, y_val_predictions, target_names=class_names, output_dict=False)
report_test = classification_report(y_test, y_test_predictions, target_names=class_names, output_dict=False)

In [64]:
y_train_predictions

array([1, 1, 2, ..., 1, 2, 0])

In [65]:
print(report_train)

              precision    recall  f1-score   support

       Quran       1.00      1.00      1.00      5060
          OT       1.00      1.00      1.00     18656
          NT       1.00      1.00      1.00     12829

    accuracy                           1.00     36545
   macro avg       1.00      1.00      1.00     36545
weighted avg       1.00      1.00      1.00     36545



In [66]:
print(report_val)

              precision    recall  f1-score   support

       Quran       0.99      0.92      0.96       556
          OT       0.98      0.97      0.97      2110
          NT       0.95      0.98      0.97      1395

    accuracy                           0.97      4061
   macro avg       0.97      0.96      0.97      4061
weighted avg       0.97      0.97      0.97      4061



In [67]:
print(report_test)

              precision    recall  f1-score   support

       Quran       0.98      0.92      0.95       620
          OT       0.95      0.97      0.96      2379
          NT       0.91      0.89      0.90       844

    accuracy                           0.94      3843
   macro avg       0.95      0.93      0.94      3843
weighted avg       0.94      0.94      0.94      3843



### MI and TFIDF

## Pre-processing data

### Stemming 

In [121]:
combined = OT + NT + quran
combined = [[ps.stem(word) for word in verse] for verse in combined]
vocab = set(list(chain.from_iterable(combined)))
class2id, word2id = map2id(vocab,labels)

f = open('test.tsv')
test = f.read()
X_test = re.findall(r'(?:OT\t|NT\t|Quran\t)(.+)', test)
X_test = [re.findall(r'\w+', verse) for verse in X_test]
X_test = [[ps.stem(word) for word in verse] for verse in X_test]
y_test = re.findall(r'(OT|NT|Quran)\t', test)

In [124]:
class2id, word2id = map2id(vocab,labels)
print(f'Number of features: {len(vocab)}')
X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.1, random_state=50)
model = train_model(X_train, y_train, word2id, class2id)
report_val, report_test = evaluate_model(X_val,y_val,X_test,y_test,word2id,class2id,model)
print(report_val)

Number of features: 9065
X matrix is of shape:(30144, 9066)




X matrix is of shape:(3350, 9066)
X matrix is of shape:(3843, 9066)
              precision    recall  f1-score   support

       Quran       0.92      0.91      0.91       574
          OT       0.94      0.94      0.94      2093
          NT       0.81      0.81      0.81       683

    accuracy                           0.91      3350
   macro avg       0.89      0.88      0.88      3350
weighted avg       0.91      0.91      0.91      3350



In [125]:
print(report_test)

              precision    recall  f1-score   support

       Quran       0.90      0.90      0.90       620
          OT       0.93      0.94      0.94      2379
          NT       0.85      0.83      0.84       844

    accuracy                           0.91      3843
   macro avg       0.89      0.89      0.89      3843
weighted avg       0.91      0.91      0.91      3843



### Stopword removal

In [126]:
combined = OT + NT + quran
combined = [[word for word in verse if word.lower() not in STwords] for verse in combined]
vocab = set(list(chain.from_iterable(combined)))
class2id, word2id = map2id(vocab,labels)

f = open('test.tsv')
test = f.read()
X_test = re.findall(r'(?:OT\t|NT\t|Quran\t)(.+)', test)
X_test = [re.findall(r'\w+', verse) for verse in X_test]
X_test = [[word for word in verse if word.lower() not in STwords] for verse in X_test]
y_test = re.findall(r'(OT|NT|Quran)\t', test)

In [127]:
class2id, word2id = map2id(vocab,labels)
print(f'Number of features: {len(vocab)}')
X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.1, random_state=50)

model = train_model(X_train, y_train, word2id, class2id)
report_val, report_test = evaluate_model(X_val,y_val,X_test,y_test,word2id,class2id,model)

print(report_val)

Number of features: 14719
X matrix is of shape:(30144, 14720)
X matrix is of shape:(3350, 14720)
X matrix is of shape:(3843, 14720)
              precision    recall  f1-score   support

       Quran       0.89      0.88      0.88       574
          OT       0.93      0.93      0.93      2093
          NT       0.81      0.81      0.81       683

    accuracy                           0.90      3350
   macro avg       0.88      0.87      0.87      3350
weighted avg       0.90      0.90      0.90      3350



In [128]:
print(report_test)

              precision    recall  f1-score   support

       Quran       0.89      0.89      0.89       620
          OT       0.92      0.93      0.93      2379
          NT       0.84      0.80      0.82       844

    accuracy                           0.90      3843
   macro avg       0.88      0.88      0.88      3843
weighted avg       0.90      0.90      0.90      3843



### Lowercase

In [129]:
combined = OT + NT + quran
combined = [[word.lower() for word in verse] for verse in combined]
vocab = set(list(chain.from_iterable(combined)))
class2id, word2id = map2id(vocab,labels)

f = open('test.tsv')
test = f.read()
X_test = re.findall(r'(?:OT\t|NT\t|Quran\t)(.+)', test)
X_test = [re.findall(r'\w+', verse) for verse in X_test]
X_test = [[word.lower() for word in verse] for verse in X_test]
y_test = re.findall(r'(OT|NT|Quran)\t', test)

In [130]:
class2id, word2id = map2id(vocab,labels)
print(f'Number of features: {len(vocab)}')
X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.1, random_state=50)

model = train_model(X_train, y_train, word2id, class2id)
report_val, report_test = evaluate_model(X_val,y_val,X_test,y_test,word2id,class2id,model)

print(report_val)

Number of features: 13896
X matrix is of shape:(30144, 13897)




X matrix is of shape:(3350, 13897)
X matrix is of shape:(3843, 13897)
              precision    recall  f1-score   support

       Quran       0.93      0.92      0.92       574
          OT       0.94      0.94      0.94      2093
          NT       0.84      0.83      0.83       683

    accuracy                           0.92      3350
   macro avg       0.90      0.90      0.90      3350
weighted avg       0.92      0.92      0.92      3350



In [131]:
print(report_test)

              precision    recall  f1-score   support

       Quran       0.92      0.91      0.92       620
          OT       0.93      0.94      0.93      2379
          NT       0.84      0.83      0.83       844

    accuracy                           0.91      3843
   macro avg       0.90      0.89      0.89      3843
weighted avg       0.91      0.91      0.91      3843



### Normalize data

In [134]:
from sklearn.preprocessing import normalize

In [154]:
def train_model(X_t,y_t,w2id,c2id):
    X_t = BOW_matrix(X_t, w2id)
    X_t = normalize(X_t, norm='l2', axis=0)
    y_t = [c2id[c] for c in y_t]
    model = sklearn.svm.LinearSVC(C=1, random_state=50)
    model.fit(X_t,y_t)
    return model

In [155]:
def evaluate_model(X_v,y_v,X_test,y_test,w2id,c2id,model):
    X_v = BOW_matrix(X_v, w2id)
    X_v = normalize(X_v, norm='l2', axis=0)
    y_v = [c2id[c] for c in y_v]
    y_v_predictions = model.predict(X_v)
    
    X_test = BOW_matrix(X_test, w2id)
    X_test = normalize(X_test, norm='l2', axis=0)
    y_test = [c2id[c] for c in y_test]
    y_test_predictions = model.predict(X_test)
    
    class_names = []
    for c,cid in sorted(c2id.items(),key=lambda x:x[1]):
        class_names.append(c)
    return classification_report(y_v, y_v_predictions, target_names=class_names, output_dict=False), classification_report(y_test, y_test_predictions, target_names=class_names, output_dict=False)

In [156]:
combined = OT + NT + quran
vocab = set(list(chain.from_iterable(combined)))

f = open('test.tsv')
test = f.read()
X_test = re.findall(r'(?:OT\t|NT\t|Quran\t)(.+)', test)
X_test = [re.findall(r'\w+', verse) for verse in X_test]
X_test = [[word for word in verse if word.lower() not in STwords] for verse in X_test]
y_test = re.findall(r'(OT|NT|Quran)\t', test)

In [157]:
class2id, word2id = map2id(vocab,labels)
print(f'Number of features: {len(vocab)}')
X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.2, random_state=50)

model = train_model(X_train, y_train, word2id, class2id)
report_val, report_test = evaluate_model(X_val,y_val,X_test,y_test,word2id,class2id,model)

print(report_val)

Number of features: 15441
X matrix is of shape:(26795, 15442)
X matrix is of shape:(6699, 15442)
X matrix is of shape:(3843, 15442)
              precision    recall  f1-score   support

       Quran       0.98      0.89      0.93      1126
          OT       0.92      0.98      0.95      4151
          NT       0.90      0.82      0.86      1422

    accuracy                           0.93      6699
   macro avg       0.94      0.90      0.91      6699
weighted avg       0.93      0.93      0.93      6699



In [158]:
print(report_test)

              precision    recall  f1-score   support

       Quran       0.90      0.88      0.89       620
          OT       0.91      0.95      0.93      2379
          NT       0.88      0.78      0.83       844

    accuracy                           0.90      3843
   macro avg       0.90      0.87      0.88      3843
weighted avg       0.90      0.90      0.90      3843



### Lemmatizer

In [339]:
from nltk.stem import WordNetLemmatizer
lm = WordNetLemmatizer()

In [340]:
combined = OT + NT + quran
combined = [[lm.lemmatize(word.lower()) for word in verse] for verse in combined]
vocab = set(list(chain.from_iterable(combined)))

In [341]:
class2id, word2id = map2id(vocab,labels)
print(f'Number of features: {len(vocab)}')
X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.1, random_state=50)
model = train_model(X_train, y_train, word2id, class2id)
report = evaluate_model(X_val,y_val,word2id,class2id,model)
print(report)

Number of features: 12190
X matrix is of shape:(30144, 12191)
X matrix is of shape:(3350, 12191)
              precision    recall  f1-score   support

          OT       0.92      0.96      0.94      2093
       Quran       0.97      0.87      0.92       574
          NT       0.85      0.81      0.83       683

    accuracy                           0.91      3350
   macro avg       0.91      0.88      0.90      3350
weighted avg       0.91      0.91      0.91      3350



## Use MI for feature selection

In [None]:
def train_model(X_t,y_t,w2id,c2id):
    X_t = BOW_matrix(X_t, w2id)
    y_t = [c2id[c] for c in y_t]
    model = sklearn.svm.SVC(C=1, random_state=50)
    model.fit(X_t,y_t)
    return model

In [None]:
def evaluate_model(X_v,y_v,X_test,y_test,w2id,c2id,model):
    X_v = BOW_matrix(X_v, w2id)
    X_v = normalize(X_v, norm='l2', axis=0)
    y_v = [c2id[c] for c in y_v]
    y_v_predictions = model.predict(X_v)
    
    X_test = BOW_matrix(X_test, w2id)
    X_test = normalize(X_test, norm='l2', axis=0)
    y_test = [c2id[c] for c in y_test]
    y_test_predictions = model.predict(X_test)
    
    class_names = []
    for c,cid in sorted(c2id.items(),key=lambda x:x[1]):
        class_names.append(c)
    return classification_report(y_v, y_v_predictions, target_names=class_names, output_dict=False), classification_report(y_test, y_test_predictions, target_names=class_names, output_dict=False)

In [378]:
combined = OT + NT + quran
combined = [[ps.stem(word) for word in verse] for verse in combined]
vocab = set(list(chain.from_iterable(combined)))

In [379]:
def select_features(MI_OT, MI_NT, MI_quran, n=100):
    combined = {k:max(MI_OT.get(k, 0), MI_NT.get(k, 0), MI_quran.get(k,0)) for k in set(chain(MI_OT,MI_NT,MI_quran))}
    combined = {k: v for k, v in sorted(combined.items(), key=lambda item: item[1], reverse=True)}
    top = [word for word in combined.keys()][:n]
    return top

In [380]:
def select_features_fair(MI_OT, MI_NT, MI_quran, n=100):
    OT = [word for word in MI_OT.keys()][:n]
    NT = [word for word in MI_NT.keys() if word not in OT][:n]
    quran = [word for word in MI_quran.keys() if word not in OT + NT][:n]
    combined = OT + NT + quran
    
    return combined

In [401]:
top = select_features_fair(MI_OT, MI_NT, MI_quran, n=1100)
vocab_MI = set(top)
class2id_MI, word2id_MI = map2id(vocab_MI,labels)
print(f'Number of features: {len(vocab_MI)}')

Number of features: 3300


In [402]:
# X,y = sklearn.utils.shuffle(combined,labels, random_state=20)
X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.15, random_state=50)

In [403]:
model = train_model(X_train, y_train, word2id_MI, class2id_MI)

X matrix is of shape:(28469, 3301)


In [404]:
report = evaluate_model(X_val,y_val,word2id_MI,class2id_MI,model)

X matrix is of shape:(5025, 3301)


In [405]:
print(report)

              precision    recall  f1-score   support

          OT       0.89      0.96      0.93      3123
       Quran       0.92      0.85      0.88       856
          NT       0.86      0.72      0.78      1046

    accuracy                           0.89      5025
   macro avg       0.89      0.84      0.86      5025
weighted avg       0.89      0.89      0.89      5025



## Tune number of features using MI

In [350]:
# def evaluate_model(X_v,y_v,w2id,c2id,model):
#     X_v = BOW_matrix(X_v, w2id)
#     y_v = [c2id[c] for c in y_v]
#     y_v_predictions = model.predict(X_v)
#     class_names = []
#     for c,cid in sorted(c2id.items(),key=lambda x:x[1]):
#         class_names.append(c)
#     return classification_report(y_v, y_v_predictions, target_names=class_names, output_dict=False)

In [351]:
combined = [[ps.stem(word) for word in verse] for verse in combined]
vocab = set(list(chain.from_iterable(combined)))

In [352]:
# nfeatures = list(np.arange(500,3000,100))
# results = {}
# for num in nfeatures:
#     top = select_features_fair(MI_OT, MI_NT, MI_quran, n=num)
#     vocab_MI = set(top)
#     class2id_MI, word2id_MI = map2id(vocab_MI,labels)
#     print(f'Number of features: {len(vocab_MI)}')
#     X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.1, random_state=50)
#     model = train_model(X_train, y_train, word2id_MI, class2id_MI)
#     report = evaluate_model(X_val,y_val,word2id_MI,class2id_MI,model)
#     results[num] = report

In [353]:
# macro = {}
# weighted = {}
# for num in results.keys():
#     macro[num] = results[num]['macro avg']['f1-score']
#     weighted[num] = results[num]['weighted avg']['f1-score']

In [354]:
# macro = {k: v for k, v in sorted(macro.items(), key=lambda item: item[1], reverse=True)}
# macro

In [355]:
# weighted = {k: v for k, v in sorted(weighted.items(), key=lambda item: item[1], reverse=True)}
# weighted

## Using X^2 for feature selection

In [367]:
combined = OT + NT + quran
# combined = [[ps.stem(word) for word in verse] for verse in combined]
vocab = set(list(chain.from_iterable(combined)))

In [368]:
top = select_features_fair(Chi_OT, Chi_NT, Chi_quran, n=800)
vocab_Chi = set(top)
class2id_Chi, word2id_Chi = map2id(vocab_Chi,labels)
print(f'Number of features: {len(vocab_Chi)}')

Number of features: 2400


In [369]:
# X,y = sklearn.utils.shuffle(combined,labels, random_state=20)
X_train, X_val, y_train, y_val = train_test_split(combined, labels, test_size=0.1, random_state=50)

In [370]:
model = train_model(X_train, y_train, word2id_Chi, class2id_Chi)

X matrix is of shape:(30144, 2401)


In [371]:
report = evaluate_model(X_val,y_val,word2id_Chi,class2id_Chi,model)

X matrix is of shape:(3350, 2401)


In [372]:
print(report)

              precision    recall  f1-score   support

          OT       0.78      0.90      0.84      2093
       Quran       0.74      0.60      0.66       574
          NT       0.65      0.44      0.52       683

    accuracy                           0.76      3350
   macro avg       0.72      0.65      0.67      3350
weighted avg       0.75      0.76      0.74      3350

