In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import re
import nltk

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# IR Evaluation

In [3]:
relevant_docs = pd.read_csv('qrels.csv')
system_results = pd.read_csv('ttdssystemresults.csv')


In [4]:
def precision(retrieved, relevant):
    retrieved_set = set(retrieved)
    relevant_set = set(relevant)
    return len(retrieved_set.intersection(relevant_set)) / len(retrieved_set)

def recall(retrieved, relevant):
    retrieved_set = set(retrieved)
    relevant_set = set(relevant)
    return len(retrieved_set.intersection(relevant_set)) / len(relevant_set)

def average_precision(retrieved, relevant):
    precision_sum = 0
    for i in range(1, len(retrieved) + 1):
        if retrieved[i - 1] in relevant:
            precision_sum += precision(retrieved[:i], relevant)
    return precision_sum / len(relevant)

def nDCG(retrieved, relevant, relevance_scores):
    scores = {}
    for i in range(len(relevant)):
        scores[relevant[i]] = relevance_scores[i]
    retrieved_scores = []
    for doc in retrieved:
        if doc in scores:
            retrieved_scores.append(scores[doc])
        else:
            retrieved_scores.append(0)

    DCG = retrieved_scores[0]
    for i in range(1, len(retrieved_scores)):
        DCG += retrieved_scores[i] / np.log2(i + 1)
    retrieved_scores.sort(reverse=True)
    IDCG = retrieved_scores[0]
    for i in range(1, len(retrieved_scores)):
        IDCG += retrieved_scores[i] / np.log2(i + 1)
    if IDCG == 0:
        return 0
    return DCG / IDCG


In [5]:
query_ids = system_results['query_number'].unique()
systems = system_results['system_number'].unique()

results = pd.DataFrame(columns=['system_number', 'query_number', 'P@10', 'R@50', 'r-precision', 'AP', 'nDCG@10', 'nDCG@20'])

for system in systems:
    precision_10s = []
    recall_50s = []
    r_precisions = []
    aps = []
    nDCG_10s = []
    nDCG_20s = []
    for query_id in query_ids:
        relevant = relevant_docs[relevant_docs['query_id'] == query_id]['doc_id'].values
        relevance_scores = relevant_docs[relevant_docs['query_id'] == query_id]['relevance'].values
        retrieved = system_results[(system_results['query_number'] == query_id) & (system_results['system_number'] == system)].sort_values(by='rank_of_doc')['doc_number'].values

        precision_10 = precision(retrieved[:10], relevant)
        recall_50 = recall(retrieved[:50], relevant)
        r_precision = precision(retrieved[:len(relevant)], relevant)
        ap = average_precision(retrieved, relevant)
        nDCG_10 = nDCG(retrieved[:10], relevant, relevance_scores)
        nDCG_20 = nDCG(retrieved[:20], relevant, relevance_scores)
        
        results.loc[len(results)] = [int(system), int(query_id), precision_10, recall_50, r_precision, ap, nDCG_10, nDCG_20]

        precision_10s.append(precision_10)
        recall_50s.append(recall_50)
        r_precisions.append(r_precision)
        aps.append(ap)
        nDCG_10s.append(nDCG_10)
        nDCG_20s.append(nDCG_20)

        
    results.loc[len(results)] = [int(system), 'mean', np.mean(precision_10s), np.mean(recall_50s), np.mean(r_precisions), np.mean(aps), np.mean(nDCG_10s), np.mean(nDCG_20s)]


In [6]:
results['system_number'] = results['system_number'].astype(int)
results['query_number'] = results['query_number'].apply(lambda x: 'mean' if x == 'mean' else int(x))
results = results.round(3)
save_results = results.to_csv('ir_eval.csv', index=False)


In [7]:
mean_results = results[results['query_number'] == 'mean']
for column in ['P@10', 'R@50', 'r-precision', 'AP', 'nDCG@10', 'nDCG@20']:
    df = mean_results[['system_number', column]]
    df = df.set_index('system_number')
    df.sort_values(by=column, ascending=False, inplace=True)
    print("Best Model for", column, ":", df.index[0])

    best_model = df.index[0]
    second_best_model = df.index[1]
    best_model_value = df.loc[best_model][column]
    second_best_model_value = df.loc[second_best_model][column]
    # two tailed t-test
    print(best_model, "vs", second_best_model)
    print(best_model_value, "vs", second_best_model_value)
    t_statistic = (best_model_value - second_best_model_value) / np.sqrt((best_model_value * (1 - best_model_value) / 50) + (second_best_model_value * (1 - second_best_model_value) / 50))
    print("t-statistic:", t_statistic)
    print("p-value:", 2 * (1 - stats.t.cdf(t_statistic, 98)))
    print("")

    

Best Model for P@10 : 3
3 vs 5
0.41 vs 0.41
t-statistic: 0.0
p-value: 1.0

Best Model for R@50 : 2
2 vs 1
0.867 vs 0.834
t-statistic: 0.46322462441985235
p-value: 0.6442304510971641

Best Model for r-precision : 3
3 vs 6
0.448 vs 0.448
t-statistic: 0.0
p-value: 1.0

Best Model for AP : 3
3 vs 6
0.451 vs 0.445
t-statistic: 0.060328233925037125
p-value: 0.952017072315338

Best Model for nDCG@10 : 3
3 vs 6
0.592 vs 0.571
t-statistic: 0.21289482573886245
p-value: 0.8318512285899999

Best Model for nDCG@20 : 3
3 vs 1
0.584 vs 0.566
t-statistic: 0.18209000555013286
p-value: 0.8558882168842699



# Text Analysis

In [8]:
bible_and_quran = pd.read_csv("bible_and_quran.tsv", sep='\t', header=None)
bible_and_quran.columns = ['Source', 'Text']
ot, nt, quran = bible_and_quran[bible_and_quran['Source'] == 'OT'], bible_and_quran[bible_and_quran['Source'] == 'NT'], bible_and_quran[bible_and_quran['Source'] == 'Quran']

In [41]:
def tokenize(text):
    """
    Tokenize the text and return a list of words.
    Tokenisation is done by splitting the text, making it lowercase, and replacing any non-alphanumeric characters with spaces.
    :param text: The text to be tokenized
    :return: A list of words
    """
    tokens = text.split()
    words = [re.sub(r'[^a-zA-Z0-9]', ' ', token) for token in tokens]
    words = [word.lower() for word in words if word != '' or word != ' ']
    words = [word.strip() for word in words]
    return words

def remove_stopwords(words, stop_words_file="stop_words.txt"):
    """
    Remove stopwords from the list of words and return the filtered list.
    Stopwords are read from the file specified in the stop_words_file parameter.
    :param words: The list of words
    :param stop_words_file: The path to the file containing the stopwords
    :return: The filtered list of words
    """
    with open(stop_words_file, 'r') as f:
        stop_words = f.readlines()
    stop_words = [word.strip() for word in stop_words]
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

def stem(words):
    """
    Stem the words using the Porter stemmer and return the list of stemmed words.
    :param words: The list of words
    :return: The list of stemmed words
    """
    stemmer = nltk.stem.PorterStemmer()
    return [stemmer.stem(word) for word in words]

def preprocess(text):
    """
    Preprocess the text by tokenizing, removing stopwords, and stemming the words.
    :param text: The text to be preprocessed
    :param stopping: A boolean value indicating whether to remove stopwords or not
    :return: The list of preprocessed words
    """
    words = tokenize(text)
    words = remove_stopwords(words)
    words = stem(words)
    return words

def preprocess_corpus(corpus):
    """
    Preprocess the entire corpus by tokenizing, removing stopwords, and stemming the words.
    :param corpus: The corpus to be preprocessed
    :param stopping: A boolean value indicating whether to remove stopwords or not
    :return: The list of preprocessed words
    """
    preprocessed_corpus = []
    for doc in corpus['Text']:
        words = preprocess(doc)
        preprocessed_corpus.append(words)
    return preprocessed_corpus

In [42]:
ot_tokens = preprocess_corpus(ot)
nt_tokens = preprocess_corpus(nt)
quran_tokens = preprocess_corpus(quran)

In [52]:
# Compute Mutual Information

def mutual_information_and_chi_squared(word, corpus, other_corpuses):
    """
    Compute the mutual information of a word in two corpora.
    :param word: The word for which to compute the mutual information
    :param corpus1: The first corpus
    :param corpus2: The second corpus
    :return: The mutual information of the word in the two corpora
    """

    N_11 = 0
    N_10 = 0
    N_01 = 0
    N_00 = 0

    for doc in corpus:
        if word in doc:
            N_11 += 1
        else:
            N_01 += 1
    for doc in other_corpuses:
        if word in doc:
            N_10 += 1
        else:
            N_00 += 1

    N = N_11 + N_10 + N_01 + N_00
    if N_00 == 0 or N_01 == 0 or N_10 == 0 or N_11 == 0:
        return 0, 0

    mi = N_11 / N * np.log2(N * N_11 / ((N_10 + N_11) * (N_01 + N_11))) + \
        N_01 / N * np.log2(N * N_01 / ((N_00 + N_01) * (N_01 + N_11))) + \
        N_10 / N * np.log2(N * N_10 / ((N_10 + N_11) * (N_00 + N_10))) + \
        N_00 / N * np.log2(N * N_00 / ((N_00 + N_01) * (N_00 + N_10)))
    
    chi_squared = N * (N_11 * N_00 - N_10 * N_01) ** 2 / ((N_11 + N_01) * (N_11 + N_10) * (N_10 + N_00) * (N_01 + N_00))
    
    return mi, chi_squared

def top_mutual_information_chi_squared(corpus, other_corpuses):
    """
    Compute the top n words with the highest mutual information in two corpora.
    :param corpus1: The first corpus
    :param corpus2: The second corpus
    :param n: The number of words to return
    :return: A df containing the words and their mutual information and chi-squared values
    """
    df = pd.DataFrame(columns=['Word', 'Mutual Information', 'Chi-Squared'])
    words = set()
    for doc in corpus:
        new_words = set(doc)
        words = words.union(new_words)
    # for doc in other_corpuses:
    #     new_words = set(doc)
    #     words = words.union(new_words)
    print("Unique Tokens: " + str(len(words)))
    i = 0
    for word in words:
        mi, chi_squared = mutual_information_and_chi_squared(word, corpus, other_corpuses)
        df.loc[len(df)] = [word, mi, chi_squared]
        i += 1
        if i % 1000 == 0:
            print(str(i) + " tokens processed")
    return df

Unique Tokens: 7988
1000 tokens processed
2000 tokens processed
3000 tokens processed
4000 tokens processed
5000 tokens processed
6000 tokens processed
7000 tokens processed


# Topic Modelling (LDA with Gibbs Sampling)

In [79]:
from gensim import corpora
from gensim.models import LdaModel

text = ot_tokens + nt_tokens + quran_tokens

dictionary = corpora.Dictionary(text)
corpus = [dictionary.doc2bow(doc) for doc in text]

lda_model = LdaModel(corpus, num_topics=20, id2word=dictionary)


In [89]:
# For each corpus, compute the average score for each topic by summing the document-topic probability for each document in that corpus and dividing by the total number of documents in the corpus. 

corpuses = [ot_tokens, nt_tokens, quran_tokens]
corpus_names = ['OT', 'NT', 'Quran']
corpus_topics = dict()
for i in range(len(corpuses)):
    corpus = corpuses[i]
    corpus_name = corpus_names[i]
    corpus_topic = np.zeros(20)
    for doc in corpus:
        bow = dictionary.doc2bow(doc)
        doc_topics = lda_model.get_document_topics(bow)
        for topic in doc_topics:
            corpus_topic[topic[0]] += topic[1]
    corpus_topic /= len(corpus)
    corpus_topics[corpus_name] = corpus_topic
    



In [90]:
# top 3 topics for each corpus
for corpus_name in corpus_names:
    print("Top 3 topics for", corpus_name)
    topics = corpus_topics[corpus_name]
    top_topics = np.argsort(topics)[::-1][:3]
    for topic in top_topics:
        print("Topic", topic, ":", lda_model.print_topic(topic))
    print("")

Top 3 topics for OT
Topic 0 : 0.103*"god" + 0.080*"lord" + 0.049*"truth" + 0.044*"word" + 0.040*"suffer" + 0.029*"good" + 0.028*"merci" + 0.026*"evil" + 0.026*"peopl" + 0.025*"righteous"
Topic 6 : 0.074*"god" + 0.062*"peopl" + 0.051*"lord" + 0.026*"guid" + 0.024*"honor" + 0.018*"dwell" + 0.016*"mountain" + 0.016*"abraham" + 0.016*"deni" + 0.016*"fear"
Topic 19 : 0.218*"god" + 0.041*"earth" + 0.039*"heaven" + 0.038*"lord" + 0.036*"judgment" + 0.027*"love" + 0.023*"day" + 0.022*"soul" + 0.021*"live" + 0.020*"peopl"

Top 3 topics for NT
Topic 0 : 0.103*"god" + 0.080*"lord" + 0.049*"truth" + 0.044*"word" + 0.040*"suffer" + 0.029*"good" + 0.028*"merci" + 0.026*"evil" + 0.026*"peopl" + 0.025*"righteous"
Topic 19 : 0.218*"god" + 0.041*"earth" + 0.039*"heaven" + 0.038*"lord" + 0.036*"judgment" + 0.027*"love" + 0.023*"day" + 0.022*"soul" + 0.021*"live" + 0.020*"peopl"
Topic 7 : 0.102*"messeng" + 0.047*"power" + 0.043*"death" + 0.035*"thing" + 0.029*"belong" + 0.025*"end" + 0.025*"earth" + 0.020

# Text Classification

In [None]:
twitter_file = "train.txt"

with 