In [1]:
import csv
import math
import random
import re
import string

import numpy as np
import scipy
import sklearn
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from nltk.stem.porter import *
from scipy import sparse
from scipy.stats import ttest_ind
from sklearn import svm
from sklearn.linear_model import SGDClassifier



In [2]:
# Load both CSV files into dictionaries of lists:
#   sysRel: system number -> [(query number, doc number, rank of doc, score), ...]
#   qRel:   query id      -> [(doc id, relevance), ...]
sysRel = {}
qRel = {}

with open('system_results.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        systemNumber = int(row['system_number'])
        hit = (
            int(row['query_number']),
            int(row['doc_number']),
            int(row['rank_of_doc']),
            float(row['score']),
        )
        # setdefault creates the list the first time a system is seen
        sysRel.setdefault(systemNumber, []).append(hit)

with open('qrels.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        queryId = int(row['query_id'])
        judgement = (int(row['doc_id']), int(row['relevance']))
        qRel.setdefault(queryId, []).append(judgement)

In [3]:
# Dump the parsed system results (system number -> list of result tuples) for a visual check.
# NOTE(review): this prints the whole dictionary; a short preview would keep the output readable.
print(sysRel)
    

{1: [(1, 6567, 1, 5.0743), (1, 9652, 2, 4.4829), (1, 9684, 3, 4.3478), (1, 7844, 4, 4.3268), (1, 9584, 5, 4.216), (1, 7853, 6, 4.0384), (1, 9090, 7, 4.0021), (1, 9574, 8, 4.0011), (1, 6850, 9, 3.9796), (1, 9616, 10, 3.9646), (1, 9281, 11, 3.8694), (1, 9891, 12, 3.8448), (1, 9665, 13, 3.8105), (1, 8446, 14, 3.8105), (1, 9591, 15, 3.7642), (1, 9198, 16, 3.7352), (1, 5690, 17, 3.6808), (1, 5493, 18, 3.6772), (1, 6915, 19, 3.6539), (1, 6171, 20, 3.6375), (1, 5023, 21, 3.6375), (1, 7732, 22, 3.6234), (1, 8871, 23, 3.5779), (1, 8124, 24, 3.5058), (1, 6602, 25, 3.485), (1, 7353, 26, 3.447), (1, 8564, 27, 3.3441), (1, 8419, 28, 3.3176), (1, 7735, 29, 3.2894), (1, 7730, 30, 3.2745), (1, 9069, 31, 3.2184), (1, 8179, 32, 3.2013), (1, 7745, 33, 3.2), (1, 9625, 34, 3.1213), (1, 9394, 35, 3.1213), (1, 9156, 36, 3.1213), (1, 9961, 37, 3.1156), (1, 5936, 38, 3.1156), (1, 5014, 39, 3.0769), (1, 7811, 40, 3.0696), (1, 6675, 41, 3.0696), (1, 7980, 42, 3.0168), (1, 7553, 43, 2.9637), (1, 6473, 44, 2.9541)

In [4]:
# Dump the parsed relevance judgements (query id -> list of (doc id, relevance)) for a visual check.
print(qRel)

{1: [(9090, 3), (6850, 2), (9574, 2), (8709, 1), (9684, 1), (5011, 1)], 2: [(5715, 2), (9677, 2), (5766, 2), (6327, 1), (6079, 1), (5653, 1), (6498, 1), (7117, 1)], 3: [(9743, 3)], 4: [(6491, 3), (5269, 3), (8032, 3), (9444, 3), (8988, 2), (9445, 2), (5883, 2), (7435, 2), (9745, 1), (10029, 1), (7224, 1), (9038, 1), (7827, 1), (6675, 1), (9720, 1), (6289, 1), (9746, 1), (6836, 1), (10119, 1), (4742, 1), (9739, 1), (5783, 1), (10117, 1), (8414, 1), (5865, 1), (8315, 1), (9523, 1), (8318, 1), (6288, 1), (5268, 1), (7620, 1), (7046, 1), (6054, 1), (9744, 1), (6743, 1), (9278, 1), (8562, 1), (6382, 1), (6334, 1), (6292, 1)], 5: [(1646, 1), (2126, 1), (3111, 1), (4983, 1), (8646, 1), (6669, 1), (8282, 1)], 6: [(8433, 3), (7487, 3), (6736, 3), (5305, 1), (8844, 1), (9736, 1), (9541, 1), (8261, 1), (8120, 1), (7424, 1), (8593, 1), (7737, 1)], 7: [(7646, 3), (3156, 2), (4144, 1)], 8: [(9891, 3), (7844, 3), (9574, 2), (9684, 2), (9090, 1), (9652, 1), (9281, 1), (6171, 1)], 9: [(5884, 3), (5995,

In [14]:
def _dcgAtK(gains, k):
    """Return the discounted cumulative gain of the first k entries of gains.

    Ranks 1 and 2 are left undiscounted and every rank r >= 3 is weighted by
    1 / log2(r), i.e. DCG@k = g_1 + sum_{r=2..k} g_r / log2(r).
    """
    dcg = 0.0
    for i, gain in enumerate(gains[:k]):
        if i <= 1:
            dcg += gain
        else:
            # i is zero-based, so the rank is i + 1
            dcg += gain * (1 / math.log2(i + 1))
    return dcg


def _ndcgAtK(rankedDocs, relevanceOf, k):
    """Return nDCG@k of a ranked doc list given a doc -> relevance mapping.

    Returns 0.0 when the ideal DCG is zero (no positive relevance judgements).
    """
    idealGains = sorted(relevanceOf.values(), reverse=True)
    idcg = _dcgAtK(idealGains, k)
    if idcg == 0:
        return 0.0
    gains = [relevanceOf.get(doc, 0) for doc in rankedDocs[:k]]
    return _dcgAtK(gains, k) / idcg


def EVAL(sysRel, qRel, outputPath="ir_eval.csv", ttestSystems=(3, 6)):
    """Evaluate every retrieval system against the relevance judgements.

    For each (system, query) pair the function computes P@10, R@50,
    R-precision, AP, nDCG@10 and nDCG@20, followed by one "mean" row per
    system averaged over all judged queries. All rows are written to
    outputPath as CSV. Finally a two-tailed t-test on the per-query nDCG@20
    values of the two systems in ttestSystems is printed.

    Parameters:
        sysRel: dict mapping system number to a list of
            (query number, doc number, rank of doc, score) tuples.
        qRel: dict mapping query id to a list of (doc id, relevance) tuples.
            Every doc listed for a query counts as relevant.
        outputPath: path of the CSV file to write (default "ir_eval.csv").
        ttestSystems: pair of system numbers compared by the t-test.

    Returns:
        None. Results are written to outputPath and the t-test is printed.
    """
    def fmt(value):
        return "%.3f" % round(value, 3)

    irEval = [["system_number", "query_number", "P@10", "R@50", "r-precision", "AP", "nDCG@10", "nDCG@20"]]  # final output list
    # Iterate over the query ids that actually exist instead of assuming 1..n.
    queryIds = sorted(qRel)

    for system, results in sysRel.items():
        # Group this system's results by query once instead of rescanning per metric.
        byQuery = {}
        for hit in results:
            byQuery.setdefault(hit[0], []).append(hit)

        totals = [0.0] * 6
        for query in queryIds:
            # A dict lookup replaces the string-join lookup of a doc's relevance.
            relevanceOf = dict(qRel[query])
            relevantRes = set(relevanceOf)
            nRelevant = len(relevantRes)

            # Order by the rank column (stable sort, so ties keep file order).
            ranked = [hit[1] for hit in sorted(byQuery.get(query, []), key=lambda hit: hit[2])]

            # precision at 10
            precision10 = len(relevantRes.intersection(ranked[:10])) / 10

            if nRelevant:
                # recall at 50
                recall50 = len(relevantRes.intersection(ranked[:50])) / nRelevant
                # r-precision
                precisionR = len(relevantRes.intersection(ranked[:nRelevant])) / nRelevant
                # average precision over every retrieved document
                hits = 0
                precisionSum = 0.0
                for position, doc in enumerate(ranked, start=1):
                    if doc in relevantRes:
                        hits += 1
                        precisionSum += hits / position
                averagePrecision = precisionSum / nRelevant
            else:
                # No judged documents: recall-based metrics are undefined; report 0.
                recall50 = precisionR = averagePrecision = 0.0

            ndcg10 = _ndcgAtK(ranked, relevanceOf, 10)
            ndcg20 = _ndcgAtK(ranked, relevanceOf, 20)

            scores = [precision10, recall50, precisionR, averagePrecision, ndcg10, ndcg20]
            totals = [total + score for total, score in zip(totals, scores)]
            irEval.append([system, query] + [fmt(score) for score in scores])

        # Average over the real number of queries (previously hard-coded to 10).
        nQueries = len(queryIds)
        means = [total / nQueries if nQueries else 0.0 for total in totals]
        irEval.append([system, "mean"] + [fmt(mean) for mean in means])

    with open(outputPath, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(irEval)

    # IR Evaluation, 2 tailed t-test on the per-query nDCG@20 column (index 7)
    firstSystem, secondSystem = ttestSystems
    sys1 = [float(x[7]) for x in irEval if x[0] == firstSystem and x[1] != "mean"]
    print(sys1)
    sys2 = [float(x[7]) for x in irEval if x[0] == secondSystem and x[1] != "mean"]
    print(ttest_ind(sys1, sys2))


In [15]:
# Run the evaluation: EVAL writes ir_eval.csv and prints the nDCG@20 values and t-test result.
EVAL(sysRel, qRel)

[0.733, 0.897, 0.24, 0.704, 0.233, 0.449, 0.0, 0.78, 0.584, 0.488]
Ttest_indResult(statistic=0.16879803537044438, pvalue=0.8678378460308467)


In [16]:
# ---------------------------------------------------------------------------
# Task 2 begins here
# ---------------------------------------------------------------------------

stemmer = PorterStemmer()
ot = []
nt = []
quran = []

# English stop words, held in a set so membership tests are O(1);
# the context manager closes the file handle as soon as it has been read.
with open("EnglishST.txt") as stopFile:
    stop_words = set(stopFile.read().split('\n'))

# read files
versesByCorpus = {"OT": ot, "NT": nt, "Quran": quran}
with open("train_and_dev.tsv", newline='') as tsvfile:
    # QUOTE_NONE: verses may contain double quotes, which must not be treated
    # as CSV quoting (that could merge several lines into one field).
    reader = csv.reader(tsvfile, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        # skip blank or malformed rows instead of failing on row[0] / row[1]
        if len(row) >= 2 and row[0] in versesByCorpus:
            versesByCorpus[row[0]].append(row[1])

# Split on every character that is not a word character, a digit or a literal '^'.
# NOTE(review): inside the class the second '^' is a literal caret, so '^' is
# kept inside tokens; the pattern is unchanged from the original, only made a raw string.
TOKEN_SPLIT = re.compile(r'[^\w^\d]')


def tokenize(sentence):
    """Lower-case a sentence, split it, drop stop words and empty strings, then stem."""
    return [
        stemmer.stem(token)
        for token in TOKEN_SPLIT.split(sentence.lower())
        if token not in stop_words and token != ''
    ]


# preprocessing
otTokens = [tokenize(sentence) for sentence in ot]
ntTokens = [tokenize(sentence) for sentence in nt]
quranTokens = [tokenize(sentence) for sentence in quran]

In [17]:
# calculate document frequency: the number of documents (verses) containing each term
def documentFrequency(tokenizedDocs):
    """Count, for every term, how many documents it occurs in.

    Parameters:
        tokenizedDocs: iterable of token lists, one list per document.

    Returns:
        dict mapping term -> number of documents containing it at least once.
    """
    frequency = {}
    for tokens in tokenizedDocs:
        # dict.fromkeys de-duplicates while keeping first-occurrence order,
        # so each term is counted once per document.
        for term in dict.fromkeys(tokens):
            frequency[term] = frequency.get(term, 0) + 1
    return frequency


otTokensFreq = documentFrequency(otTokens)
ntTokensFreq = documentFrequency(ntTokens)
quranTokensFreq = documentFrequency(quranTokens)

In [18]:
# calculate MI for a corpus
# tfCor1 holds the per-term document counts of the target corpus, and nCor1 is
# the number of documents in that corpus (likewise for corpora 2 and 3).
def _miPart(cell, rowTotal, colTotal, total):
    """Return one summand of the MI formula: cell/N * log2(N*cell / (row*col)).

    A summand whose cell count or either marginal is zero contributes 0,
    which also avoids dividing by zero.
    """
    if cell == 0 or rowTotal == 0 or colTotal == 0 or total == 0:
        return 0
    return cell / total * math.log2(total * cell / (rowTotal * colTotal))


def calculateMI(tfCor1, tfCor2, tfCor3, nCor1, nCor2, nCor3):
    """Compute the mutual information between each term and membership of corpus 1.

    Parameters:
        tfCor1, tfCor2, tfCor3: dicts mapping term -> number of documents of
            that corpus containing the term. Corpus 1 is the target class.
        nCor1, nCor2, nCor3: number of documents in each corpus.

    Returns:
        dict mapping every term seen in any corpus to its MI score.
    """
    MI = {}
    uniqTerms = set(tfCor1).union(tfCor2, tfCor3)
    # calculate MI for each term
    for term in uniqTerms:
        N11 = tfCor1.get(term, 0)                          # term present, in corpus 1
        N10 = tfCor2.get(term, 0) + tfCor3.get(term, 0)    # term present, other corpora
        N01 = nCor1 - N11                                  # term absent, in corpus 1
        N00 = nCor1 + nCor2 + nCor3 - N11 - N10 - N01      # term absent, other corpora
        N = N11 + N10 + N01 + N00
        # four parts of the equation from lecture slides
        part1 = _miPart(N11, N10 + N11, N11 + N01, N)
        part2 = _miPart(N01, N00 + N01, N01 + N11, N)
        part3 = _miPart(N10, N10 + N11, N10 + N00, N)
        part4 = _miPart(N00, N00 + N01, N10 + N00, N)
        MI[term] = part1 + part2 + part3 + part4
    return MI

# Score every term against each corpus in turn (target corpus first) and
# rank the (term, MI) pairs from highest to lowest score.
otMI = sorted(
    calculateMI(otTokensFreq, ntTokensFreq, quranTokensFreq,
                len(otTokens), len(ntTokens), len(quranTokens)).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
ntMI = sorted(
    calculateMI(ntTokensFreq, otTokensFreq, quranTokensFreq,
                len(ntTokens), len(otTokens), len(quranTokens)).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
quranMI = sorted(
    calculateMI(quranTokensFreq, ntTokensFreq, otTokensFreq,
                len(quranTokens), len(ntTokens), len(otTokens)).items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [19]:
# calculate Chi2 for a corpus
# tfCor1 holds the per-term document counts of the target corpus, and nCor1 is
# the number of documents in that corpus (likewise for corpora 2 and 3).
def calculateChi2(tfCor1, tfCor2, tfCor3, nCor1, nCor2, nCor3):
    """Compute the chi-squared statistic between each term and membership of corpus 1.

    Parameters:
        tfCor1, tfCor2, tfCor3: dicts mapping term -> number of documents of
            that corpus containing the term. Corpus 1 is the target class.
        nCor1, nCor2, nCor3: number of documents in each corpus.

    Returns:
        dict mapping every term seen in any corpus to its chi-squared score.
        A term whose contingency table has an empty row or column scores 0.0.
    """
    Chi2 = {}
    uniqTerms = set(tfCor1).union(tfCor2, tfCor3)
    # calculate Chi2 for each term
    for term in uniqTerms:
        N11 = tfCor1.get(term, 0)                          # term present, in corpus 1
        N10 = tfCor2.get(term, 0) + tfCor3.get(term, 0)    # term present, other corpora
        N01 = nCor1 - N11                                  # term absent, in corpus 1
        N00 = nCor1 + nCor2 + nCor3 - N11 - N10 - N01      # term absent, other corpora
        N = N11 + N10 + N01 + N00
        # two parts of the equation from lecture slides
        top = N * (N11 * N00 - N10 * N01) ** 2
        bot = (N11 + N01) * (N11 + N10) * (N10 + N00) * (N01 + N00)

        # An empty marginal (e.g. the term occurs in every document) makes the
        # denominator zero; there is then no association to measure.
        Chi2[term] = top / bot if bot != 0 else 0.0
    return Chi2

# Score every term against each corpus in turn (target corpus first) and
# rank the (term, Chi2) pairs from highest to lowest score.
otChi2 = sorted(
    calculateChi2(otTokensFreq, ntTokensFreq, quranTokensFreq,
                  len(otTokens), len(ntTokens), len(quranTokens)).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
ntChi2 = sorted(
    calculateChi2(ntTokensFreq, otTokensFreq, quranTokensFreq,
                  len(ntTokens), len(otTokens), len(quranTokens)).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
quranChi2 = sorted(
    calculateChi2(quranTokensFreq, ntTokensFreq, otTokensFreq,
                  len(quranTokens), len(ntTokens), len(otTokens)).items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [20]:
# Generate a ranked list of the results
def printRankedList(corpusName, miRanked, chi2Ranked, topN=10):
    """Print the topN MI and Chi2 terms of one corpus side by side.

    Parameters:
        corpusName: label used in the heading line.
        miRanked: list of (term, MI score) pairs, best first.
        chi2Ranked: list of (term, Chi2 score) pairs, best first.
        topN: number of rows to print; fewer are printed if a list is shorter.
    """
    print("Generating ranked list for " + corpusName + ":\n")
    print("token\tMI score\ttoken\tChi2 score\n")
    # zip over slices stops at the shorter list instead of raising IndexError
    for (miTerm, miScore), (chiTerm, chiScore) in zip(miRanked[:topN], chi2Ranked[:topN]):
        print(miTerm + "\t" + str(round(miScore, 5)) + "\t\t" + chiTerm + "\t" + str(round(chiScore, 5)))
        print("\n")


printRankedList("OT", otMI, otChi2)
printRankedList("NT", ntMI, ntChi2)
printRankedList("Quran", quranMI, quranChi2)

Generating ranked list for OT:

token	MI score	token	Chi2 score

jesu	0.03866		jesu	1334.86983


israel	0.03638		lord	1213.34935


king	0.03138		israel	1177.84351


lord	0.03071		king	1044.33274


ot	0.02271		christ	709.80831


christ	0.0206		god	691.8871


believ	0.01854		believ	682.37222


son	0.01639		ot	631.65153


god	0.01613		son	620.27893


muhammad	0.01609		muhammad	553.87514


Generating ranked list for NT:

token	MI score	token	Chi2 score

jesu	0.05663		jesu	2908.46396


christ	0.03449		christ	1697.68447


lord	0.02378		lord	857.48012


israel	0.01538		discipl	778.89545


discipl	0.01527		nt	539.66805


peopl	0.0115		peter	507.35128


king	0.01146		paul	507.35128


nt	0.01094		thing	461.75896


ot	0.01091		israel	458.49897


land	0.01032		spirit	406.49446


Generating ranked list for Quran:

token	MI score	token	Chi2 score

god	0.03132		muhammad	1667.17942


muhammad	0.03021		god	1515.85169


torment	0.02059		torment	1204.04298


believ	0.02023		believ	1197.83082


messeng	0.

In [145]:
# LDA
# LDA training is stochastic; a fixed seed keeps the topics (and the per-corpus
# "top topic" derived from them) the same on every run.
LDA_SEED = 0

# Concatenation order matters: later cells slice docScores back into OT / NT / Quran.
allTokens = otTokens + ntTokens + quranTokens
common_dictionary = Dictionary(allTokens)
common_corpus = [common_dictionary.doc2bow(text) for text in allTokens]
lda = LdaModel(common_corpus, num_topics=20, id2word=common_dictionary, random_state=LDA_SEED)
docScores = lda.get_document_topics(bow=common_corpus)

In [146]:
# average topic score per corpus, best topic first
def computeAverageScore(docScores):
    """Average each topic's score over all documents and rank the topics.

    Parameters:
        docScores: sized iterable of documents, each a sequence of
            (topic, score) tuples. A topic missing from a document adds
            nothing to that topic's total.

    Returns:
        list of (topic, average score) tuples sorted by average, highest first.
    """
    totals = {}
    for assignments in docScores:
        for entry in assignments:
            topicId, weight = entry[0], entry[1]
            # first sighting stores the score as-is, later ones accumulate
            totals[topicId] = totals[topicId] + weight if topicId in totals else weight

    # divide by the number of documents, not by how often the topic appeared
    averages = {topicId: total / len(docScores) for topicId, total in totals.items()}
    ranked = list(averages.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [147]:
# allTokens was built as OT + NT + Quran, so the per-document topic scores
# can be cut back into the three corpora at the same boundaries.
otEnd = len(otTokens)
ntEnd = otEnd + len(ntTokens)
docScores_ot = docScores[0:otEnd]
docScores_nt = docScores[otEnd:ntEnd]
docScores_quran = docScores[ntEnd:]

# the best-scoring (topic, average score) pair of each corpus
topOT, topNT, topQURAN = (
    computeAverageScore(corpusScores)[0]
    for corpusScores in (docScores_ot, docScores_nt, docScores_quran)
)

# describe a topic by its 10 highest-probability tokens
def computeTopTokens(topTopicTuple, lda):
    """Return a two-line description of a topic.

    Parameters:
        topTopicTuple: (topic, score) tuple; only the topic is used.
        lda: model object exposing print_topic(topic, topn).

    Returns:
        "topic <id>:" followed, on the next line, by the model's 10-token summary.
    """
    topicId = topTopicTuple[0]
    summary = lda.print_topic(topicId, 10)
    heading = "topic " + str(topicId)
    return heading + ":\n" + summary


# Report the dominant topic of each corpus, one heading plus summary per corpus.
print("Generating top 10 tokens and their probability scores:")
for heading, bestTopic in (("For OT:", topOT), ("For NT:", topNT), ("For QURAN:", topQURAN)):
    print(heading)
    print(computeTopTokens(bestTopic, lda) + "\n")

Generating top 10 tokens and their probability scores:
For OT:
topic 3:
0.045*"god" + 0.031*"lord" + 0.018*"peopl" + 0.015*"receiv" + 0.013*"hand" + 0.010*"reject" + 0.009*"land" + 0.009*"guid" + 0.008*"made" + 0.007*"give"

For NT:
topic 2:
0.060*"god" + 0.030*"lord" + 0.016*"believ" + 0.013*"life" + 0.013*"day" + 0.011*"peopl" + 0.011*"thing" + 0.011*"faith" + 0.009*"live" + 0.009*"evil"

For QURAN:
topic 2:
0.060*"god" + 0.030*"lord" + 0.016*"believ" + 0.013*"life" + 0.013*"day" + 0.011*"peopl" + 0.011*"thing" + 0.011*"faith" + 0.009*"live" + 0.009*"evil"



In [2]:
# Read the labelled corpus into memory; the context manager closes the file handle.
with open("train_and_dev.tsv") as trainFile:
    trainingData = trainFile.read()

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _parseLabelledLines(lines):
    """Turn "<category>TAB<text>" lines into documents, categories and a vocabulary.

    Blank lines are skipped. Only the first tab separates category from text,
    so tabs inside the text are treated as ordinary whitespace.

    Returns:
        (documents, categories, vocab): list of token lists, list of category
        labels aligned with documents, and the set of unique tokens.
    """
    documents = []   # list of sentences in list of strings
    categories = []  # list of classes
    vocab = set()    # unique words
    for line in lines:
        line = line.strip()
        if not line:
            continue
        category, text = line.split('\t', 1)
        words = text.translate(_PUNCTUATION_TABLE).lower().split()
        vocab.update(words)
        documents.append(words)
        categories.append(category)
    return documents, categories, vocab


# preprocess data
def preProcess(data, seed=None, trainFraction=0.9):
    """Shuffle the labelled lines and split them into training and dev sets.

    Parameters:
        data: full text of the TSV file, one "<category>TAB<text>" per line.
        seed: optional seed for a private random generator so the split is
            reproducible; with None the global `random` state is used.
        trainFraction: share of the non-blank lines assigned to training.

    Returns:
        documents_train, categories_train, vocab_train,
        documents_dev, categories_dev, vocab_dev
    """
    # Drop blank lines first so they do not count towards the split sizes.
    lines = [line for line in data.split('\n') if line.strip()]
    if seed is None:
        random.shuffle(lines)
    else:
        random.Random(seed).shuffle(lines)
    N_train = int(np.ceil(len(lines) * trainFraction))

    trainParts = _parseLabelledLines(lines[:N_train])
    devParts = _parseLabelledLines(lines[N_train:])
    return trainParts + devParts


# preprocess test data
def preProcessTest(data):
    """Parse the test TSV text (no shuffling) with the same tokenisation as preProcess.

    Returns:
        documents_test, categories_test, vocab_test
    """
    return _parseLabelledLines(data.split('\n'))

# give every vocabulary entry an integer id
def uniqueTerms(vocab):
    """Map each word of vocab to its position in vocab's iteration order.

    Parameters:
        vocab: iterable of words (a set in this notebook).

    Returns:
        dict word -> integer id; if a word repeats, its last position wins.
    """
    return {word: wordId for wordId, word in enumerate(vocab)}

# give every distinct class label an integer id
def uniqueClasses(categories):
    """Map each distinct category to an integer id.

    Ids follow the iteration order of set(categories). For string labels that
    order is not guaranteed to be the same in every interpreter session, so
    callers should look ids up by label rather than hard-code them.

    Parameters:
        categories: iterable of hashable class labels, duplicates allowed.

    Returns:
        dict label -> integer id in range(number of distinct labels).
    """
    distinct = set(categories)
    return {label: labelId for labelId, label in enumerate(distinct)}

# build a sparse document-by-term count matrix
def convertToBOWMatrix(preprocessedData, word2id):
    """Convert tokenised documents into a bag-of-words count matrix.

    Parameters:
        preprocessedData: list of documents, each a list of tokens.
        word2id: dict mapping known words to column indices.

    Returns:
        scipy.sparse.dok_matrix with one row per document and
        len(word2id) + 1 columns; the extra last column counts tokens that
        are not in word2id (out of vocabulary).
    """
    vocabSize = len(word2id)
    # column vocabSize is reserved for out-of-vocabulary tokens
    bow = scipy.sparse.dok_matrix((len(preprocessedData), vocabSize + 1))
    for rowIdx, tokens in enumerate(preprocessedData):
        for token in tokens:
            colIdx = word2id[token] if token in word2id else vocabSize
            bow[rowIdx, colIdx] += 1
    return bow


# Split and tokenise the labelled data, then derive the id mappings from the
# training portion only.
(documents_train, categories_train, vocab_train,
 documents_dev, categories_dev, vocab_dev) = preProcess(trainingData)
word2Id = uniqueTerms(vocab_train)
cat2Id = uniqueClasses(categories_train)

# sparse count matrices: row = document, column = word id (last column = unseen words)
X_train, X_dev = (
    convertToBOWMatrix(documents, word2Id)
    for documents in (documents_train, documents_dev)
)

# integer class labels aligned with the matrix rows
y_train = list(map(cat2Id.__getitem__, categories_train))
y_dev = list(map(cat2Id.__getitem__, categories_dev))

In [3]:
# train an SVM model
# Baseline classifier: SVC with C=1000 and every other setting left at its default.
baselineModel = sklearn.svm.SVC(C=1000)
# Fit on the bag-of-words training matrix and its integer class labels.
baselineModel.fit(X_train, y_train)

In [29]:
# Predict class ids for the training and dev matrices with the baseline model.
y_train_pred = baselineModel.predict(X_train)
y_dev_pred = baselineModel.predict(X_dev)

In [34]:
# Misclassified document indices, one list per calculateScores call (in call order).
# Initialised before the function that appends to it, and reset whenever this
# cell is re-run so the positions in the log stay predictable.
wrong_pred_idx = []


# compute precision, recall and f1 scores
def calculateScores(category, y_pred, y_actual):
    """Compute one-vs-rest precision, recall and F1 for a single class.

    Side effect: appends the list of indices that were misclassified with
    respect to this class (false positives and false negatives) to the
    module-level wrong_pred_idx log.

    Parameters:
        category: class id treated as the positive class.
        y_pred: indexable sequence of predicted class ids.
        y_actual: indexable sequence of true class ids, same length.

    Returns:
        (precision, recall, f1) as floats. A score whose denominator is zero
        (class never predicted, never present, or no true positives) is 0.0.
    """
    truePositives = 0
    falsePositives = 0
    falseNegatives = 0
    misclassified = []
    for i in range(len(y_actual)):
        isActual = y_actual[i] == category
        isPredicted = y_pred[i] == category
        if isActual and isPredicted:
            truePositives += 1
        elif isActual:
            misclassified.append(i)
            falseNegatives += 1
        elif isPredicted:
            misclassified.append(i)
            falsePositives += 1
    wrong_pred_idx.append(misclassified)

    predictedCount = truePositives + falsePositives
    actualCount = truePositives + falseNegatives
    # guard each ratio so an empty class yields 0.0 instead of NaN / an error
    precision = truePositives / predictedCount if predictedCount else 0.0
    recall = truePositives / actualCount if actualCount else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

# Look the class ids up by label. uniqueClasses() enumerates a set, so the
# number assigned to each corpus is not guaranteed to be the same in every
# session; hard-coding 0/1/2 could silently swap the result columns.
QURAN_ID, OT_ID, NT_ID = cat2Id['Quran'], cat2Id['OT'], cat2Id['NT']

# The call order (train: Quran, OT, NT; then dev; then test) is kept because
# later cells index wrong_pred_idx by call position.
p_quran_train, r_quran_train, f1_quran_train = calculateScores(QURAN_ID, y_train_pred, y_train)
p_ot_train, r_ot_train, f1_ot_train = calculateScores(OT_ID, y_train_pred, y_train)
p_nt_train, r_nt_train, f1_nt_train = calculateScores(NT_ID, y_train_pred, y_train)
p_macro_train = (p_quran_train + p_ot_train + p_nt_train)/3
r_macro_train = (r_quran_train + r_ot_train + r_nt_train)/3
# NOTE(review): macro F1 here is the harmonic mean of macro P and macro R, not
# the mean of the per-class F1 scores - confirm which definition is required.
f1_macro_train = 2 * p_macro_train * r_macro_train / (p_macro_train + r_macro_train)

p_quran_dev, r_quran_dev, f1_quran_dev = calculateScores(QURAN_ID, y_dev_pred, y_dev)
p_ot_dev, r_ot_dev, f1_ot_dev = calculateScores(OT_ID, y_dev_pred, y_dev)
p_nt_dev, r_nt_dev, f1_nt_dev = calculateScores(NT_ID, y_dev_pred, y_dev)
p_macro_dev = (p_quran_dev + p_ot_dev + p_nt_dev)/3
r_macro_dev = (r_quran_dev + r_ot_dev + r_nt_dev)/3
f1_macro_dev = 2 * p_macro_dev * r_macro_dev / (p_macro_dev + r_macro_dev)

# test set
with open("test.tsv") as testFile:
    testData = testFile.read()
documents_test, categories_test, vocab_test = preProcessTest(testData)
# data dict in the format of (docId, wordId) -> word count
X_test = convertToBOWMatrix(documents_test, word2Id)
# data labels
y_test = [cat2Id[cat] for cat in categories_test]

y_test_pred = baselineModel.predict(X_test)
p_quran_test, r_quran_test, f1_quran_test = calculateScores(QURAN_ID, y_test_pred, y_test)
p_ot_test, r_ot_test, f1_ot_test = calculateScores(OT_ID, y_test_pred, y_test)
p_nt_test, r_nt_test, f1_nt_test = calculateScores(NT_ID, y_test_pred, y_test)
p_macro_test = (p_quran_test + p_ot_test + p_nt_test)/3
r_macro_test = (r_quran_test + r_ot_test + r_nt_test)/3
f1_macro_test = 2 * p_macro_test * r_macro_test / (p_macro_test + r_macro_test)

# One row per split; every score is rounded to 3 decimals and stored as text.
baselineResults = [['system','split','p-quran','r-quran','f-quran','p-ot','r-ot','f-ot','p-nt','r-nt','f-nt','p-macro','r-macro','f-macro']]
baselineResults.append(['baseline','train'] + [str(round(score, 3)) for score in (
    p_quran_train, r_quran_train, f1_quran_train,
    p_ot_train, r_ot_train, f1_ot_train,
    p_nt_train, r_nt_train, f1_nt_train,
    p_macro_train, r_macro_train, f1_macro_train)])
baselineResults.append(['baseline','dev'] + [str(round(score, 3)) for score in (
    p_quran_dev, r_quran_dev, f1_quran_dev,
    p_ot_dev, r_ot_dev, f1_ot_dev,
    p_nt_dev, r_nt_dev, f1_nt_dev,
    p_macro_dev, r_macro_dev, f1_macro_dev)])
baselineResults.append(['baseline','test'] + [str(round(score, 3)) for score in (
    p_quran_test, r_quran_test, f1_quran_test,
    p_ot_test, r_ot_test, f1_ot_test,
    p_nt_test, r_nt_test, f1_nt_test,
    p_macro_test, r_macro_test, f1_macro_test)])

with open("classification.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(baselineResults)

In [40]:
# Entry 3 of the log is the fourth calculateScores call, i.e. the first call
# made on the dev split.
print(wrong_pred_idx[3])
# Show (predicted, actual) for the first three misclassified dev documents.
# The indices are taken from the log rather than typed in, because they change
# whenever the train/dev shuffle changes.
for devIdx in wrong_pred_idx[3][:3]:
    print(y_dev_pred[devIdx], y_dev[devIdx])

[50, 84, 90, 99, 131, 133, 135, 137, 165, 185, 197, 213, 250, 278, 296, 305, 312, 328, 429, 519, 604, 614, 624, 668, 715, 786, 791, 832, 856, 890, 907, 975, 1029, 1052, 1118, 1131, 1227, 1273, 1320, 1321, 1350, 1358, 1365, 1401, 1407, 1424, 1443, 1445, 1446, 1491, 1659, 1754, 1769, 1827, 1845, 1871, 1957, 1967, 2047, 2139, 2175, 2245, 2273, 2314, 2318, 2361, 2377, 2405, 2463, 2473, 2540, 2564, 2579, 2586, 2639, 2685, 2745, 2830, 2882, 2915, 2941, 2963, 3011, 3048, 3052, 3110, 3134, 3166, 3168, 3174, 3227, 3305]
1 0
2 0
1 0


In [41]:
# Show the label -> id mapping, then the tokens of the first three
# misclassified dev documents from log entry 3 (indices read from the log so
# they stay valid when the train/dev shuffle changes).
print(cat2Id)
for devIdx in wrong_pred_idx[3][:3]:
    print(documents_dev[devIdx])

{'Quran': 0, 'OT': 1, 'NT': 2}
['the', 'egyptians', 'said', 'what', 'do', 'you', 'suggest', 'should', 'be', 'the', 'punishment', 'for', 'the', 'thief', 'if', 'it', 'is', 'proved', 'that', 'you', 'are', 'lying']
['we', 'cast', 'him', 'out', 'of', 'the', 'fish', 'unto', 'dry', 'land', 'and', 'he', 'was', 'sick']
['by', 'the', 'high', 'ceiling', 'heaven']


In [37]:
# improve the system
# Replace the kernel SVC with a linear classifier trained by stochastic
# gradient descent (modified Huber loss). SGD shuffles the data, so the seed
# is fixed to make the reported scores reproducible.
SGD_SEED = 0
improveModel = SGDClassifier(loss='modified_huber',alpha=0.0002,tol=1e-7,max_iter=5000,random_state=SGD_SEED)
improveModel.fit(X_train, y_train)

# NOTE: these overwrite the baseline predictions held in the same names.
y_train_pred = improveModel.predict(X_train)
y_dev_pred = improveModel.predict(X_dev)

# Look the class ids up by label. uniqueClasses() enumerates a set, so the
# number assigned to each corpus is not guaranteed to be the same in every
# session; hard-coding 0/1/2 could silently swap the result columns.
QURAN_ID, OT_ID, NT_ID = cat2Id['Quran'], cat2Id['OT'], cat2Id['NT']

p_quran_train, r_quran_train, f1_quran_train = calculateScores(QURAN_ID, y_train_pred, y_train)
p_ot_train, r_ot_train, f1_ot_train = calculateScores(OT_ID, y_train_pred, y_train)
p_nt_train, r_nt_train, f1_nt_train = calculateScores(NT_ID, y_train_pred, y_train)
p_macro_train = (p_quran_train + p_ot_train + p_nt_train)/3
r_macro_train = (r_quran_train + r_ot_train + r_nt_train)/3
f1_macro_train = 2 * p_macro_train * r_macro_train / (p_macro_train + r_macro_train)

p_quran_dev, r_quran_dev, f1_quran_dev = calculateScores(QURAN_ID, y_dev_pred, y_dev)
p_ot_dev, r_ot_dev, f1_ot_dev = calculateScores(OT_ID, y_dev_pred, y_dev)
p_nt_dev, r_nt_dev, f1_nt_dev = calculateScores(NT_ID, y_dev_pred, y_dev)
p_macro_dev = (p_quran_dev + p_ot_dev + p_nt_dev)/3
r_macro_dev = (r_quran_dev + r_ot_dev + r_nt_dev)/3
f1_macro_dev = 2 * p_macro_dev * r_macro_dev / (p_macro_dev + r_macro_dev)

y_test_pred = improveModel.predict(X_test)
p_quran_test, r_quran_test, f1_quran_test = calculateScores(QURAN_ID, y_test_pred, y_test)
p_ot_test, r_ot_test, f1_ot_test = calculateScores(OT_ID, y_test_pred, y_test)
p_nt_test, r_nt_test, f1_nt_test = calculateScores(NT_ID, y_test_pred, y_test)
p_macro_test = (p_quran_test + p_ot_test + p_nt_test)/3
r_macro_test = (r_quran_test + r_ot_test + r_nt_test)/3
f1_macro_test = 2 * p_macro_test * r_macro_test / (p_macro_test + r_macro_test)

# Drop rows left by an earlier run of this cell so re-running it does not
# append duplicate 'improved' rows to the results table.
baselineResults[:] = [row for row in baselineResults if row[0] != 'improved']
baselineResults.append(['improved','train'] + [str(round(score, 3)) for score in (
    p_quran_train, r_quran_train, f1_quran_train,
    p_ot_train, r_ot_train, f1_ot_train,
    p_nt_train, r_nt_train, f1_nt_train,
    p_macro_train, r_macro_train, f1_macro_train)])
baselineResults.append(['improved','dev'] + [str(round(score, 3)) for score in (
    p_quran_dev, r_quran_dev, f1_quran_dev,
    p_ot_dev, r_ot_dev, f1_ot_dev,
    p_nt_dev, r_nt_dev, f1_nt_dev,
    p_macro_dev, r_macro_dev, f1_macro_dev)])
baselineResults.append(['improved','test'] + [str(round(score, 3)) for score in (
    p_quran_test, r_quran_test, f1_quran_test,
    p_ot_test, r_ot_test, f1_ot_test,
    p_nt_test, r_nt_test, f1_nt_test,
    p_macro_test, r_macro_test, f1_macro_test)])

# Rewrite the whole file: header, baseline rows, then the improved rows.
with open("classification.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(baselineResults)