In [433]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import re
import nltk
import collections
import re
import string
import sklearn
import scipy
from scipy import sparse
from sklearn import svm
from sklearn import ensemble
from sklearn.metrics import classification_report

# IR Evaluation

In [None]:
def task_1():
    relevant_docs = pd.read_csv('qrels.csv')
    system_results = pd.read_csv('ttdssystemresults.csv')


In [283]:
relevant_docs = pd.read_csv('qrels.csv')
system_results = pd.read_csv('ttdssystemresults.csv')


In [284]:
def precision(retrieved, relevant):
    retrieved_set = set(retrieved)
    relevant_set = set(relevant)
    return len(retrieved_set.intersection(relevant_set)) / len(retrieved_set)

def recall(retrieved, relevant):
    retrieved_set = set(retrieved)
    relevant_set = set(relevant)
    return len(retrieved_set.intersection(relevant_set)) / len(relevant_set)

def average_precision(retrieved, relevant):
    precision_sum = 0
    for i in range(1, len(retrieved) + 1):
        if retrieved[i - 1] in relevant:
            precision_sum += precision(retrieved[:i], relevant)
    return precision_sum / len(relevant)

def nDCG(retrieved, relevant, relevance_scores):
    scores = {}
    for i in range(len(relevant)):
        scores[relevant[i]] = relevance_scores[i]
    retrieved_scores = []
    for doc in retrieved:
        if doc in scores:
            retrieved_scores.append(scores[doc])
        else:
            retrieved_scores.append(0)

    DCG = retrieved_scores[0]
    for i in range(1, len(retrieved_scores)):
        DCG += retrieved_scores[i] / np.log2(i + 1)
    retrieved_scores.sort(reverse=True)
    IDCG = retrieved_scores[0]
    for i in range(1, len(retrieved_scores)):
        IDCG += retrieved_scores[i] / np.log2(i + 1)
    if IDCG == 0:
        return 0
    
    print("DCG", DCG)
    print("IDCG", IDCG)
    return DCG / IDCG


In [285]:
query_ids = system_results['query_number'].unique()
systems = system_results['system_number'].unique()

results = pd.DataFrame(columns=['system_number', 'query_number', 'P@10', 'R@50', 'r-precision', 'AP', 'nDCG@10', 'nDCG@20'])

for system in systems:
    print("Processing system", system)
    precision_10s = []
    recall_50s = []
    r_precisions = []
    aps = []
    nDCG_10s = []
    nDCG_20s = []
    for query_id in query_ids:
        print("Query ID", query_id)
        relevant = relevant_docs[relevant_docs['query_id'] == query_id]['doc_id'].values
        relevance_scores = relevant_docs[relevant_docs['query_id'] == query_id]['relevance'].values
        retrieved = system_results[(system_results['query_number'] == query_id) & (system_results['system_number'] == system)].sort_values(by='rank_of_doc')['doc_number'].values

        precision_10 = precision(retrieved[:10], relevant)
        recall_50 = recall(retrieved[:50], relevant)
        r_precision = precision(retrieved[:len(relevant)], relevant)
        ap = average_precision(retrieved, relevant)
        nDCG_10 = nDCG(retrieved[:10], relevant, relevance_scores)
        nDCG_20 = nDCG(retrieved[:20], relevant, relevance_scores)
        
        results.loc[len(results)] = [int(system), int(query_id), precision_10, recall_50, r_precision, ap, nDCG_10, nDCG_20]

        precision_10s.append(precision_10)
        recall_50s.append(recall_50)
        r_precisions.append(r_precision)
        aps.append(ap)
        nDCG_10s.append(nDCG_10)
        nDCG_20s.append(nDCG_20)

        
    results.loc[len(results)] = [int(system), 'mean', np.mean(precision_10s), np.mean(recall_50s), np.mean(r_precisions), np.mean(aps), np.mean(nDCG_10s), np.mean(nDCG_20s)]


Processing system 1
Query ID 1
DCG 2.997147735133648
IDCG 6.7618595071429155
DCG 2.997147735133648
IDCG 6.7618595071429155
Query ID 2
DCG 1.7317065537373744
IDCG 2.6309297535714578
DCG 3.855081630315694
IDCG 7.268929392892205
Query ID 3
DCG 0.6941346394792774
IDCG 3.0
Query ID 4
DCG 5.902918195508732
IDCG 8.579388872450851
DCG 8.995617437316119
IDCG 12.803884063230115
Query ID 5
DCG 1.0177825608059992
IDCG 2.0
DCG 1.2575950273741305
IDCG 2.6309297535714578
Query ID 6
DCG 6.508353076911336
IDCG 8.304666305987414
DCG 8.320363096230484
IDCG 11.084361790882394
Query ID 7
DCG 3.3010299956639813
IDCG 4.0
DCG 3.3010299956639813
IDCG 4.0
Query ID 8
DCG 6.018590298918789
IDCG 7.57938887245085
DCG 6.956195252178463
IDCG 9.768929392892208
Query ID 9
DCG 6.6137831393950375
IDCG 8.584394269677935
DCG 9.910340050543956
IDCG 13.294270125579557
Query ID 10
DCG 1.1821938260239127
IDCG 5.0
Processing system 2
Query ID 1
DCG 0.6666666666666666
IDCG 2.0
DCG 1.2245525579689263
IDCG 4.0
Query ID 2
DCG 0.830

In [286]:
results['system_number'] = results['system_number'].astype(int)
results['query_number'] = results['query_number'].apply(lambda x: 'mean' if x == 'mean' else int(x))
results = results.round(3)
save_results = results.to_csv('ir_eval.csv', index=False)


In [287]:
mean_results = results[results['query_number'] == 'mean']
for column in ['P@10', 'R@50', 'r-precision', 'AP', 'nDCG@10', 'nDCG@20']:
    df = mean_results[['system_number', column]]
    df = df.set_index('system_number')
    df.sort_values(by=column, ascending=False, inplace=True)
    print("Best Model for", column, ":", df.index[0])

    best_model = df.index[0]
    second_best_model = df.index[1]
    best_model_value = df.loc[best_model][column]
    second_best_model_value = df.loc[second_best_model][column]
    # two tailed t-test
    print(best_model, "vs", second_best_model)
    print(best_model_value, "vs", second_best_model_value)
    t_statistic = (best_model_value - second_best_model_value) / np.sqrt((best_model_value * (1 - best_model_value) / 50) + (second_best_model_value * (1 - second_best_model_value) / 50))
    print("t-statistic:", t_statistic)
    print("p-value:", 2 * (1 - stats.t.cdf(t_statistic, 98)))
    print("")

    

Best Model for P@10 : 3
3 vs 5
0.41 vs 0.41
t-statistic: 0.0
p-value: 1.0

Best Model for R@50 : 2
2 vs 1
0.867 vs 0.834
t-statistic: 0.46322462441985235
p-value: 0.6442304510971641

Best Model for r-precision : 3
3 vs 6
0.448 vs 0.448
t-statistic: 0.0
p-value: 1.0

Best Model for AP : 3
3 vs 6
0.451 vs 0.445
t-statistic: 0.060328233925037125
p-value: 0.952017072315338

Best Model for nDCG@10 : 3
3 vs 6
0.592 vs 0.571
t-statistic: 0.21289482573886245
p-value: 0.8318512285899999

Best Model for nDCG@20 : 3
3 vs 1
0.584 vs 0.566
t-statistic: 0.18209000555013286
p-value: 0.8558882168842699



# Text Analysis

In [288]:
bible_and_quran_file = 'bible_and_quran.tsv'
bible_and_quran = pd.DataFrame(columns=['Source', 'Text'])
for line in open(bible_and_quran_file):
    source, text = line.split('\t', 1)
    bible_and_quran.loc[len(bible_and_quran)] = [source, text]
ot, nt, quran = bible_and_quran[bible_and_quran['Source'] == 'OT'], bible_and_quran[bible_and_quran['Source'] == 'NT'], bible_and_quran[bible_and_quran['Source'] == 'Quran']

In [289]:
# def tokenize(text):
#     """
#     Tokenize the text and return a list of words.
#     Tokenisation is done by splitting the text, making it lowercase, and replacing any non-alphanumeric characters with spaces.
#     :param text: The text to be tokenized
#     :return: A list of words
#     """
#     tokens = text.split()
#     words = [re.sub(r'[^a-zA-Z0-9]', '', token) for token in tokens]
#     words = [word.lower() for word in words if word != '' or word != ' ']
#     words = [word.strip() for word in words]
#     return words

def remove_stopwords(words, stop_words_file="stop_words.txt"):
    """
    Remove stopwords from the list of words and return the filtered list.
    Stopwords are read from the file specified in the stop_words_file parameter.
    :param words: The list of words
    :param stop_words_file: The path to the file containing the stopwords
    :return: The filtered list of words
    """
    with open(stop_words_file, 'r') as f:
        stop_words = f.readlines()
    stop_words = [word.strip() for word in stop_words]
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

def stem(words):
    """
    Stem the words using the Porter stemmer and return the list of stemmed words.
    :param words: The list of words
    :return: The list of stemmed words
    """
    stemmer = nltk.stem.PorterStemmer()
    return [stemmer.stem(word) for word in words]

def preprocess(text, stop_words_file="stop_words.txt"):
    """
    Preprocess the text by tokenizing, removing stopwords, and stemming the words.
    :param text: The text to be preprocessed
    :param stopping: A boolean value indicating whether to remove stopwords or not
    :return: The list of preprocessed words
    """
    with open(stop_words_file, 'r') as f:
        stop_words = f.readlines()
    stop_words = [word.strip() for word in stop_words]

    tokens = text.split()
    words = []
    stemmer = nltk.stem.PorterStemmer()
    for token in tokens:
        token = token.strip()
        token = token.lower()
        original_token = token
        if token in stop_words:
            continue
        token = re.sub(r'[^a-zA-Z0-9]', ' ', token)
        if token == '' or token == ' ':
            continue
        token = token.split()
        for t in token:
            if t in stop_words:
                continue
            t = stemmer.stem(t)
            t = t.strip()
            if t == 'of' or t == 'all':
                print(original_token, t)
            words.append(t)

    # tokens = [token.strip() for token in tokens]
    # tokens = [token.lower() for token in tokens]
    # tokens = [token for token in tokens if token not in stop_words]
    # words = [re.sub(r'[^a-zA-Z0-9]', ' ', token) for token in tokens]
    # words = [word for word in words if word != '' or word != ' ']
    # stemmer = nltk.stem.PorterStemmer()
    # words = [stemmer.stem(word) for word in words]
    # words = [word.strip() for word in words]
    # # split words where word.split() is greater than 1
    # words = [word.split() for word in words]
    # words = [word for sublist in words for word in sublist]
    return words

def preprocess_corpus(corpus):
    """
    Preprocess the entire corpus by tokenizing, removing stopwords, and stemming the words.
    :param corpus: The corpus to be preprocessed
    :param stopping: A boolean value indicating whether to remove stopwords or not
    :return: The list of preprocessed words
    """
    preprocessed_corpus = []
    for doc in corpus['Text']:
        words = preprocess(doc)
        preprocessed_corpus.append(words)
    return preprocessed_corpus

In [290]:
ot_tokens = preprocess_corpus(ot)
nt_tokens = preprocess_corpus(nt)
quran_tokens = preprocess_corpus(quran)

In [291]:
# print number of unique tokens in each corpus
print("OT Unique Tokens:", len(set([word for doc in ot_tokens for word in doc])))
print("NT Unique Tokens:", len(set([word for doc in nt_tokens for word in doc])))
print("Quran Unique Tokens:", len(set([word for doc in quran_tokens for word in doc])))

OT Unique Tokens: 7059
NT Unique Tokens: 3564
Quran Unique Tokens: 3044


In [12]:
# Compute Mutual Information

def mutual_information_and_chi_squared(word, corpus, other_corpuses):
    """
    Compute the mutual information of a word in two corpora.
    :param word: The word for which to compute the mutual information
    :param corpus1: The first corpus
    :param corpus2: The second corpus
    :return: The mutual information of the word in the two corpora
    """

    N_11 = 0
    N_10 = 0
    N_01 = 0
    N_00 = 0

    for doc in corpus:
        if word in doc:
            N_11 += 1
        else:
            N_01 += 1
    for doc in other_corpuses:
        if word in doc:
            N_10 += 1
        else:
            N_00 += 1
    N = N_11 + N_10 + N_01 + N_00

    chi_squared = N * (N_11 * N_00 - N_10 * N_01) ** 2 / ((N_11 + N_01) * (N_11 + N_10) * (N_10 + N_00) * (N_01 + N_00))

    for n in [N_11, N_10, N_01, N_00]:
        if n == 0:
            n = 1

    # invalid_MI = False
    # invalid_X2 = False
    # if (N_11 + N_01) * (N_11 + N_10) * (N_10 + N_00) * (N_01 + N_00) == 0:
    #     print("Word", word, "Invalid denominator X2")
    #     invalid_X2 = True
    
    # if N * np.log2(N * N_11 / ((N_10 + N_11) * (N_01 + N_11))) == 0 or N * np.log2(N * N_01 / ((N_00 + N_01) * (N_01 + N_11))) == 0 or N * np.log2(N * N_10 / ((N_10 + N_11) * (N_00 + N_10))) == 0 or N * np.log2(N * N_00 / ((N_00 + N_01) * (N_00 + N_10)) == 0):
    #     print("Word", word, "Invalid denominator MI")
    #     invalid_MI = True

    # if not invalid_MI:
    mi = 0
    if ((N_10 + N_11) * (N_01 + N_11)) != 0 and N_11 != 0:
        mi += N_11 / N * np.log2(N * N_11 / ((N_10 + N_11) * (N_01 + N_11)))

    if ((N_00 + N_01) * (N_01 + N_11)) != 0 and N_01 != 0:
        mi += N_01 / N * np.log2(N * N_01 / ((N_00 + N_01) * (N_01 + N_11)))
    
    if ((N_10 + N_11) * (N_00 + N_10)) != 0 and N_10 != 0:
        mi += N_10 / N * np.log2(N * N_10 / ((N_10 + N_11) * (N_00 + N_10)))

    if ((N_00 + N_01) * (N_00 + N_10)) != 0 and N_00 != 0:
        mi += N_00 / N * np.log2(N * N_00 / ((N_00 + N_01) * (N_00 + N_10)))

    return mi, chi_squared

def top_mutual_information_chi_squared(corpus, other_corpuses):
    """
    Compute the top n words with the highest mutual information in two corpora.
    :param corpus1: The first corpus
    :param corpus2: The second corpus
    :param n: The number of words to return
    :return: A df containing the words and their mutual information and chi-squared values
    """
    df = pd.DataFrame(columns=['Word', 'Mutual Information', 'Chi-Squared'])
    words = set()
    for doc in corpus:
        new_words = set(doc)
        words = words.union(new_words)
        
    for doc in other_corpuses:
        new_words = set(doc)
        words = words.union(new_words)
    print("Unique Tokens: " + str(len(words)))
    i = 0
    for word in words:
        mi, chi_squared = mutual_information_and_chi_squared(word, corpus, other_corpuses)
        df.loc[len(df)] = [word, mi, chi_squared]
        i += 1
        if i % 1000 == 0:
            print(str(i) + " tokens processed")
    return df

In [13]:
ot_top_mi_chi = top_mutual_information_chi_squared(ot_tokens, nt_tokens+quran_tokens)
ot_top_mi_chi.sort_values(by='Mutual Information', ascending=False, inplace=True)
ot_top_mi_chi.to_csv('ot_mi_x2.csv', index=False)

nt_top_mi_chi = top_mutual_information_chi_squared(nt_tokens, ot_tokens+quran_tokens)
nt_top_mi_chi.sort_values(by='Mutual Information', ascending=False, inplace=True)
nt_top_mi_chi.to_csv('nt_mi_x2.csv', index=False)

quran_top_mi_chi = top_mutual_information_chi_squared(quran_tokens, ot_tokens+nt_tokens)
quran_top_mi_chi.sort_values(by='Mutual Information', ascending=False, inplace=True)
quran_top_mi_chi.to_csv('quran_mi_x2.csv', index=False)



Unique Tokens: 8770
1000 tokens processed
2000 tokens processed
3000 tokens processed
4000 tokens processed
5000 tokens processed
6000 tokens processed
7000 tokens processed
8000 tokens processed
Unique Tokens: 8770
1000 tokens processed
2000 tokens processed
3000 tokens processed
4000 tokens processed
5000 tokens processed
6000 tokens processed
7000 tokens processed
8000 tokens processed
Unique Tokens: 8770
1000 tokens processed
2000 tokens processed
3000 tokens processed
4000 tokens processed
5000 tokens processed
6000 tokens processed
7000 tokens processed
8000 tokens processed


# Topic Modelling (LDA with Gibbs Sampling)

In [304]:
from gensim import corpora
from gensim.models import LdaModel

corpuses = [ot_tokens, nt_tokens, quran_tokens]
corpus_names = ['OT', 'NT', 'Quran']

corpus_topics = dict()

for i, tokens in enumerate(corpuses):
    corpus_name = corpus_names[i]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc) for doc in tokens]

    lda_model = LdaModel(corpus, num_topics=20, id2word=dictionary, random_state=0)

    corpus_topic = np.zeros(20)
    for doc in corpus:
        doc_topics = lda_model.get_document_topics(doc)
        for topic in doc_topics:
            corpus_topic[topic[0]] += topic[1]
    corpus_topic /= len(corpus)
    corpus_topics[corpus_name] = corpus_topic
    print(sorted(corpus_topic))
    top_topic_id = np.argmax(corpus_topic)  # Find the topic with the highest average probability
    top_topic_words = lda_model.show_topic(top_topic_id, topn=10)  # Get the top 10 words for the topic
    print("Top topic for", corpus_name, ":", top_topic_words)


[0.025970344195750714, 0.02944551606929061, 0.029946549603796085, 0.03525365916089115, 0.03646866586096108, 0.03859654931933854, 0.03905371896325925, 0.04146654089943449, 0.0422650654090734, 0.04238769935224599, 0.0448134472608196, 0.044892878140906166, 0.04660748152243843, 0.047216628528438956, 0.05233352565735171, 0.05304805918907126, 0.06139679522564193, 0.06953127616559877, 0.07310455922808039, 0.07385739091329717]
Top topic for OT : [('lord', 0.16542305), ('god', 0.0724324), ('host', 0.06843008), ('fear', 0.021531954), ('great', 0.019137824), ('peopl', 0.019019764), ('day', 0.018573204), ('soul', 0.017768467), ('love', 0.016425587), ('prais', 0.01592431)]
[0.034804253613589915, 0.037534821666049045, 0.03903927981352729, 0.03912483003693429, 0.04040686863540547, 0.04070485170621056, 0.04080674525049669, 0.04120271362946238, 0.04232738942564732, 0.044172785695614995, 0.04473229431436178, 0.04555019513987784, 0.04622900902926901, 0.05020829213178676, 0.051671059338739445, 0.052088191

# Text Classification

In [414]:
import emoji

def replace_emojis_with_text(text):
    return emoji.demojize(text)

def get_tokens_high_chi_squared(ratio):
    valid_tokens = []
    for file in ['negative_mi_x2.csv', 'positive_mi_x2.csv', 'neutral_mi_x2.csv']:
        df = pd.read_csv(file)
        df.sort_values(by='Chi-Squared', ascending=False, inplace=True)
        for i in range(int(len(df) * ratio)):
            valid_tokens.append(df.iloc[i]['Word'])
        
    return valid_tokens

def preprocess_data(data, chi_ratio=0.5, extras=True):
    
    chars_to_remove = re.compile(f'[{string.punctuation}]')
    documents = []
    categories = []
    lines = data.split('\n')
    valid_tokens = get_tokens_high_chi_squared(chi_ratio)
    # Remove the header
    lines = lines[1:]

    #Shuffle lines
    # np.random.seed(0)
    # np.random.shuffle(lines)

    for line in lines:
        # make a dictionary for each document
        # word_id -> count (could also be tf-idf score, etc.)
        line = line.strip()
        if line:
            # split on tabs, we have 3 columns in this tsv format file
            tweet_id, category, tweet = line.split('\t')

            if extras:
                #Remove links
                tweet = re.sub(r'http\S+', '', tweet)

            stop_words_file = "stop_words.txt"
            with open(stop_words_file, 'r') as f:
                stop_words = f.readlines()
            stop_words = [word.strip() for word in stop_words]

            #Duplicate hashtags
            if extras:
                for word in tweet.split():
                    if word.startswith("#"):
                        tweet += " " + word[1:]

            if extras:
                #Replace emojis with text
                tweet = replace_emojis_with_text(tweet)



            # process the words
            words = tweet.lower().split()
            if extras:
                words = [word for word in words if word not in stop_words]
            #replace hypens and underscores with spaces
            words = [word.replace('-', ' ').replace('_', ' ') for word in words]
            words = chars_to_remove.sub('',tweet).lower().split()
            
            if extras and chi_ratio != 1.0:
                words = stem(words)
                # remove words with low chi-squared values
                words = [word for word in words if word in valid_tokens]

            # add the list of words to the documents list
            documents.append(words)
            # add the category to the categories list
            categories.append(category)
            
    return documents, categories

In [415]:
# twitter_file = "train.txt"
# with open(twitter_file, 'r', encoding='utf-8') as f:
#     data = f.read()
# documents, categories = preprocess_data(data)

In [416]:
# build a BOW representation of the files: use the scipy 
# data is the preprocessed_data
# word2id maps words to their ids
def convert_to_bow_matrix(preprocessed_data, word2id):
    
    # matrix size is number of docs x vocab size + 1 (for OOV)
    matrix_size = (len(preprocessed_data),len(word2id)+1)
    oov_index = len(word2id)
    # matrix indexed by [doc_id, token_id]
    X = scipy.sparse.dok_matrix(matrix_size)

    # iterate through all documents in the dataset
    for doc_id,doc in enumerate(preprocessed_data):
        for word in doc:
            # default is 0, so just add to the count for this word in this doc
            # if the word is oov, increment the oov_index
            X[doc_id,word2id.get(word,oov_index)] += 1
            
    return X



In [417]:
def get_train_test_data(train_file, test_file, dev_ratio, chi_ratio=0.5, extras=True):
    with open(train_file, 'r', encoding='utf-8') as f:
        train_data = f.read()
    with open(test_file, 'r', encoding='utf-8') as f:
        test_data = f.read()
    
    train_documents, train_categories = preprocess_data(train_data, chi_ratio, extras)
    test_documents, test_categories = preprocess_data(test_data, chi_ratio, extras)

    train_documents, dev_documents = train_documents[:int(len(train_documents) * (1-dev_ratio))], train_documents[int(len(train_documents) * (1-dev_ratio)):]
    train_categories, dev_categories = train_categories[:int(len(train_categories) * (1-dev_ratio))], train_categories[int(len(train_categories) * (1-dev_ratio)):]

    print("Train size:", len(train_documents))
    print("Dev size:", len(dev_documents))
    print("Test size:", len(test_documents))

    train_vocab = set()
    for doc in train_documents:
        for word in doc:
            train_vocab.add(word)

    word2id = {}
    for word_id,word in enumerate(train_vocab):
        word2id[word] = word_id
    
    cat2id = {}
    for cat_id,cat in enumerate(set(train_categories)):
        cat2id[cat] = cat_id



    X_train = convert_to_bow_matrix(train_documents, word2id)
    y_train = [cat2id[cat] for cat in train_categories]

    X_dev = convert_to_bow_matrix(dev_documents, word2id)
    y_dev = [cat2id[cat] for cat in dev_categories]

    X_test = convert_to_bow_matrix(test_documents, word2id)
    y_test = [cat2id[cat] for cat in test_categories]

    return X_train, y_train, X_dev, y_dev, X_test, y_test, cat2id

In [418]:
# # Compute mutual information and chi-squared for the twitter dataset

# training_data = pd.read_csv('train.txt', sep='\t', header=None)
# training_data.columns = ['tweet_id', 'category', 'tweet']
# training_data = training_data[1:]
# training_data['tweet'] = training_data['tweet'].apply(preprocess)
# training_data

In [419]:
# token_counts = dict()
# for tweet in training_data['tweet']:
#     for token in tweet:
#         if token in token_counts:
#             token_counts[token] += 1
#         else:
#             token_counts[token] = 1

# # Remove tokens that appear less than 10 times

# for tweet in training_data['tweet']:
#     for token in tweet:
#         if token_counts[token] < 10:
#             tweet.remove(token)



In [420]:
# negative_tweets = training_data[training_data['category'] == 'negative']
# positive_tweets = training_data[training_data['category'] == 'positive']
# neutral_tweets = training_data[training_data['category'] == 'neutral']

# negative_tokens = []
# for tweet in negative_tweets['tweet']:
#     negative_tokens.append(tweet)
# positive_tokens = []
# for tweet in positive_tweets['tweet']:
#     positive_tokens.append(tweet)
# neutral_tokens = []
# for tweet in neutral_tweets['tweet']:
#     neutral_tokens.append(tweet)

# # top_mutual_information_chi_squared(negative_tokens, positive_tokens+neutral_tokens).to_csv('negative_mi_x2.csv', index=False)
# top_mutual_information_chi_squared(positive_tokens, negative_tokens+neutral_tokens).to_csv('positive_mi_x2.csv', index=False)
# top_mutual_information_chi_squared(neutral_tokens, negative_tokens+positive_tokens).to_csv('neutral_mi_x2.csv', index=False)

In [421]:
# for file in ['negative_mi_x2.csv', 'positive_mi_x2.csv', 'neutral_mi_x2.csv']:
#     df = pd.read_csv(file)
#     df.sort_values(by='Mutual Information', ascending=False, inplace=True)
#     print("Top 10 words for MI", file)
#     print(df.head(10))
#     df.sort_values(by='Chi-Squared', ascending=False, inplace=True)
#     print("Top 10 words for Chi-Squared", file)
#     print(df.head(10))
#     print("")



In [422]:
# y_train_predictions = model.predict(X_train)

# now can compute any metrics we care about. Let's quickly do accuracy
def compute_accuracy(predictions, true_values):
    num_correct = 0
    num_total = len(predictions)
    for predicted,true in zip(predictions,true_values):
        if predicted==true:
            num_correct += 1
    return num_correct / num_total

# accuracy = compute_accuracy(y_train_predictions,y_train)
# print("Accuracy:",accuracy)

In [423]:

# y_test_predictions = model.predict(X_test)
# cat_names = []
# for cat,cid in sorted(cat2id.items(),key=lambda x:x[1]):
#     cat_names.append(cat)
# print(classification_report(y_test, y_test_predictions, target_names=cat_names))

In [424]:
# model = sklearn.ensemble.RandomForestClassifier()
# model.fit(X_train,y_train)

# y_train_predictions = model.predict(X_train)
# print("Train accuracy was:",compute_accuracy(y_train_predictions,y_train))
# y_test_predictions = model.predict(X_test)
# print("Test accuracy was:",compute_accuracy(y_test_predictions,y_test))

In [430]:
def get_model_accuracy(model, X_train, y_train, X_dev, y_dev, X_test, y_test, cat2id):
    model.fit(X_train,y_train)
    y_train_predictions = model.predict(X_train)
    train_accuracy = compute_accuracy(y_train_predictions,y_train)
    y_dev_predictions = model.predict(X_dev)
    dev_accuracy = compute_accuracy(y_dev_predictions,y_dev)
    y_test_predictions = model.predict(X_test)
    test_accuracy = compute_accuracy(y_test_predictions,y_test)

    print("Train accuracy was:", train_accuracy)
    print("Dev accuracy was:", dev_accuracy)
    print("Test accuracy was:", test_accuracy)

    classification_reports = {}
    
    classification_reports['train'] = classification_report(y_train, y_train_predictions, target_names=cat2id.keys(), output_dict=True)
    classification_reports['dev'] = classification_report(y_dev, y_dev_predictions, target_names=cat2id.keys(), output_dict=True)
    classification_reports['test'] = classification_report(y_test, y_test_predictions, target_names=cat2id.keys(), output_dict=True)

    # Print misclassified examples

    file = 'ttds_2024_cw2_test.txt'
    with open(file, 'r', encoding='utf-8') as f:
        data = f.read()
    test_documents, test_categories = preprocess_data(data, 1.0, True)

    for i in range(len(y_test)):
        if y_test[i] != y_test_predictions[i]:
            print("Predicted:", y_test_predictions[i], "Actual:", test_categories[i])
            print(test_documents[i])
            print("")

    return classification_reports

    

# model_list = [sklearn.svm.SVC(C=1000, kernel ="linear"), sklearn.ensemble.RandomForestClassifier(), sklearn.ensemble.GradientBoostingClassifier(), sklearn.linear_model.LogisticRegression(), sklearn.naive_bayes.MultinomialNB(), sklearn.naive_bayes.BernoulliNB(), sklearn.neighbors.KNeighborsClassifier(), sklearn.tree.DecisionTreeClassifier(), sklearn.neural_network.MLPClassifier()]
# model_names = ['SVM', 'Random Forest', 'Gradient Boosting', 'Logistic Regression', 'Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'K-Nearest Neighbors', 'Decision Tree', 'MLP']

# train_accuracies = []
# test_accuracies = []

# for model in model_list:
#     train_accuracy, test_accuracy = get_model_accuracy(model, X_train, y_train, X_test, y_test)
#     train_accuracies.append(train_accuracy)
#     test_accuracies.append(test_accuracy)
#     print("Model:", model)
#     print("Train accuracy was:", train_accuracy)
#     print("Test accuracy was:", test_accuracy)
#     print("")

In [431]:
# # Logistic regression parameter sweep

# X_train, y_train, X_test, y_test = get_train_test_data("train.txt", 0.9)

# param_grid = [
#     {'classifier__penalty' : ['l1', 'l2'],
#     'classifier__C' : np.logspace(-4, 4, 20),
#     'classifier__solver' : ['sage', 'liblinear'],
#     'classifier__max_iter' : [100, 1000, 10000],
#     'classifier__multi_class' : ['auto', 'ovr', 'multinomial']}
# ]

# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.linear_model import LogisticRegression

# pipeline = Pipeline([
#     ('classifier', LogisticRegression())
# ])

# grid_search = GridSearchCV(pipeline, param_grid, cv=5)
# grid_search.fit(X_train, y_train)

# print("Best parameters:", grid_search.best_params_)
# print("Best score:", grid_search.best_score_)
# print("Best estimator:", grid_search.best_estimator_)
# print("")





In [432]:
X_train, y_train, X_dev, y_dev, X_test, y_test, cat2id = get_train_test_data("train.txt", "ttds_2024_cw2_test.txt", 0.1, 1.0, False)

model = sklearn.linear_model.LogisticRegression(C=0.615848211066026, max_iter=100, multi_class='auto', penalty='l1', solver='liblinear')
improved_results = get_model_accuracy(model, X_train, y_train, X_dev, y_dev, X_test, y_test, cat2id)


Train size: 16781
Dev size: 1865
Test size: 4662
Train accuracy was: 0.7487634825099815
Dev accuracy was: 0.6289544235924933
Test accuracy was: 0.6467181467181468
Predicted: 1 Actual: neutral
['lets', 'watch', 'this', 'fantasticbeasts', 'fantasticbeasts']

Predicted: 2 Actual: neutral
['idubbbz', 'elizabethforma', 'if', 'you', 'dont', 'think', 'this', 'is', 'how', 'a', 'president', 'trump', 'would', 'say', 'my', 'horny', 'level', 'is', '7']

Predicted: 0 Actual: positive
['randy', 'orton', 'vs', 'sheamus', 'to', 'start', 'the', 'show', 'interesting', 'choice', 'crowd', 'may', 'turn', 'early']

Predicted: 1 Actual: neutral
['released', 'friday', 'a', 'love', 'song', 'to', 'the', 'earth', 'by', 'paul', 'mccartney', 'jon', 'bon', 'jovi', 'sheryl', 'crow', 'fergie', 'sean', 'paul']

Predicted: 1 Actual: neutral
['yourself', 'into', 'it', 'by', 'listening', 'to', 'kpop', 'kculture', 'comes', 'with', 'it', 'you', 'may', 'not', 'agree', 'with', 'their', 'culture', 'and', 'thats', 'fine', 'as'

ValueError: X has 1 features, but LogisticRegression is expecting 39507 features as input.

ValueError: X has 1 features, but LogisticRegression is expecting 39507 features as input.

In [None]:
# chi_ratios = [0.01, 0.1, 0.2, 0.4, 0.6, 0.8, 0.9, 1.0]

# for chi_ratio in chi_ratios:
#     print("Chi Ratio:", chi_ratio)
#     X_train, y_train, X_dev, y_dev, X_test, y_test, cat2id = get_train_test_data("train.txt", "ttds_2024_cw2_test.txt", 0.1, chi_ratio, True)
#     model = sklearn.linear_model.LogisticRegression(C=0.615848211066026, max_iter=100, multi_class='auto', penalty='l1', solver='liblinear')
#     results = get_model_accuracy(model, X_train, y_train, X_dev, y_dev, X_test, y_test, cat2id)
#     print("Train accuracy was:", results['train']['accuracy'])
#     print("Dev accuracy was:", results['dev']['accuracy'])
#     print("Test accuracy was:", results['test']['accuracy'])
#     print("")

In [401]:
# Baseline Model
# X_train, y_train, X_dev, y_dev, X_test, y_test, cat2id = get_train_test_data("train.txt", "ttds_2024_cw2_test.txt", 0.1, 1.0, False)
model = sklearn.svm.SVC(C=1000, kernel ="linear")
baseline_results = get_model_accuracy(model, X_train, y_train, X_dev, y_dev, X_test, y_test, cat2id)

Train accuracy was: 0.9992253143436028
Dev accuracy was: 0.6053619302949061
Test accuracy was: 0.5791505791505791


In [350]:
def convert_results_to_csv(baseline_classification_reports, improved_classification_reports):
    # system,split,p-pos,r-pos,f-pos,p-neg,r-neg,f-neg,p-neu,r-neu,f-neu,p-macro,r-macro,f-macro
    columns = ['system', 'split', 'p-pos', 'r-pos', 'f-pos', 'p-neg', 'r-neg', 'f-neg', 'p-neu', 'r-neu', 'f-neu', 'p-macro', 'r-macro', 'f-macro']
    results = pd.DataFrame(columns=columns)

    for system in ['baseline', 'improved']:
        if system == 'baseline':
            classification_reports = baseline_classification_reports
        else:
            classification_reports = improved_classification_reports
        for split in ['train', 'dev', 'test']:
            class_report = classification_reports[split]

            p_pos = class_report['positive']['precision']
            r_pos = class_report['positive']['recall']
            f_pos = class_report['positive']['f1-score']

            p_neg = class_report['negative']['precision']
            r_neg = class_report['negative']['recall']
            f_neg = class_report['negative']['f1-score']

            p_neu = class_report['neutral']['precision']
            r_neu = class_report['neutral']['recall']
            f_neu = class_report['neutral']['f1-score']

            p_macro = class_report['macro avg']['precision']
            r_macro = class_report['macro avg']['recall']
            f_macro = class_report['macro avg']['f1-score']

            results.loc[len(results)] = [system, split, p_pos, r_pos, f_pos, p_neg, r_neg, f_neg, p_neu, r_neu, f_neu, p_macro, r_macro, f_macro]

    # Round to 3 decimal places
    results = results.round(3)
    results.to_csv('classification.csv', index=False)        

convert_results_to_csv(baseline_results, improved_results)

In [279]:
model = sklearn.ensemble.RandomForestClassifier()
rf_results = get_model_accuracy(model, X_train, y_train, X_dev, y_dev, X_test, y_test, cat2id)

Train accuracy was: 0.9994636791609558
Dev accuracy was: 0.586058981233244
Test accuracy was: 0.5913770913770914


In [43]:
model = sklearn.linear_model.LogisticRegression(C=0.615848211066026, penalty='l1', solver='liblinear')
train_accuracy, test_accuracy = get_model_accuracy(model, X_train, y_train, X_test, y_test)
print("Train accuracy was:", train_accuracy)
print("Test accuracy was:", test_accuracy)

macro_f1 = sklearn.metrics.f1_score(y_test, y_test_predictions, average='macro')
print("Macro F1:", macro_f1)

Train accuracy was: 0.7498957153924081
Test accuracy was: 0.6423592493297587
Macro F1: 0.6198487352114427


In [60]:
new_features = np.zeros((X_train.shape[0], 1))
X_train_new = scipy.sparse.hstack([X_train, new_features])

  from .autonotebook import tqdm as notebook_tqdm


ImportError: cannot import name 'AutomodellTokenizer' from 'transformers' (c:\Users\adamg\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\__init__.py)

In [70]:
ir_eval_file = "ir_eval.csv"
df = pd.read_csv(ir_eval_file)
df_mean = df[df['query_number'] == 'mean']
columns = ['P@10', 'R@50', 'r-precision', 'AP', 'nDCG@10', 'nDCG@20']
for column in columns:
    best_model = df_mean[df_mean[column] == df_mean[column].max()]['system_number'].values[0]
    sorted_df = df_mean.sort_values(by=column, ascending=False)
    second_best_model = sorted_df.iloc[1]['system_number']
            
    # t_tail test between best and second best model, p = 0.05
    best_model_values = df[df['system_number'] == best_model][column].values
    best_model_values = best_model_values[best_model_values != 'mean']
    second_best_model_values = df[df['system_number'] == second_best_model][column].values
    second_best_model_values = second_best_model_values[second_best_model_values != 'mean']

    t_statistic = (best_model_values.mean() - second_best_model_values.mean()) / np.sqrt((best_model_values.var() / len(best_model_values)) + (second_best_model_values.var() / len(second_best_model_values)))
    p_value = 2 * (1 - stats.t.cdf(t_statistic, len(best_model_values) + len(second_best_model_values) - 2))
    print("Best Model for", column, ":", best_model)
    print("Second Best Model for", column, ":", second_best_model)

    print("Best mean value for", column, ":", best_model_values.mean())
    print("Second best mean value for", column, ":", second_best_model_values.mean())

    print("t-statistic:", t_statistic)
    print("p-value:", p_value)
    print("")

Best Model for P@10 : 3
Second Best Model for P@10 : 5
Best mean value for P@10 : 0.41
Second best mean value for P@10 : 0.41
t-statistic: 0.0
p-value: 1.0

Best Model for R@50 : 2
Second Best Model for R@50 : 1
Best mean value for R@50 : 0.8670909090909091
Second best mean value for R@50 : 0.8338181818181819
t-statistic: 0.4494434129732087
p-value: 0.6579445985022723

Best Model for r-precision : 3
Second Best Model for r-precision : 6
Best mean value for r-precision : 0.4484545454545455
Second best mean value for r-precision : 0.4484545454545455
t-statistic: 0.0
p-value: 1.0

Best Model for AP : 3
Second Best Model for AP : 6
Best mean value for AP : 0.4512727272727273
Second best mean value for AP : 0.4448181818181819
t-statistic: 0.048233964804110775
p-value: 0.9620082049233663

Best Model for nDCG@10 : 3
Second Best Model for nDCG@10 : 6
Best mean value for nDCG@10 : 0.5919090909090908
Second best mean value for nDCG@10 : 0.5709090909090908
t-statistic: 0.15075824655863304
p-value

In [69]:
for column in columns:
    sorted_df = df_mean.sort_values(by=column, ascending=False)
    print(sorted_df[['system_number', column]])

    system_number  P@10
32              3  0.41
54              5  0.41
65              6  0.41
10              1  0.39
21              2  0.22
43              4  0.08
    system_number   R@50
21              2  0.867
10              1  0.834
32              3  0.767
54              5  0.767
65              6  0.767
43              4  0.189
    system_number  r-precision
32              3        0.448
65              6        0.448
10              1        0.401
54              5        0.358
21              2        0.252
43              4        0.049
    system_number     AP
32              3  0.451
65              6  0.445
10              1  0.400
54              5  0.364
21              2  0.300
43              4  0.075
    system_number  nDCG@10
32              3    0.592
65              6    0.571
10              1    0.547
54              5    0.463
21              2    0.404
43              4    0.289
    system_number  nDCG@20
32              3    0.584
10              1    0

In [281]:
ot_mi_x2 = pd.read_csv('ot_mi_x2.csv')
nt_mi_x2 = pd.read_csv('nt_mi_x2.csv')
quran_mi_x2 = pd.read_csv('quran_mi_x2.csv')

corpus_mi_x2 = [ot_mi_x2, nt_mi_x2, quran_mi_x2]
corpus_names = ['OT', 'NT', 'Quran']
for i in range(len(corpus_mi_x2)):
    corpus = corpus_mi_x2[i]
    corpus.sort_values(by='Mutual Information', ascending=False, inplace=True)
    print(corpus_names[i])
    print("Sorted by Mutual Information")
    print(corpus[['Word', 'Mutual Information']].head(20))
    print("")
    print("Sorted by Chi-Squared")
    corpus.sort_values(by='Chi-Squared', ascending=False, inplace=True)
    print(corpus[['Word', 'Chi-Squared']].head(20))
    print("")


OT
Sorted by Mutual Information
        Word  Mutual Information
0       jesu            0.037253
1     israel            0.031138
2       king            0.026041
3       lord            0.025915
4     christ            0.019866
5     believ            0.017963
6        god            0.016433
7   muhammad            0.015523
8        son            0.012956
9    torment            0.012688
10     judah            0.012488
11      land            0.012389
12     faith            0.011932
13    receiv            0.010553
14      hous            0.010154
15   discipl            0.009426
16     revel            0.009049
17     david            0.008662
18  unbeliev            0.008377
19    suffer            0.008020

Sorted by Chi-Squared
        Word  Chi-Squared
0       jesu  1464.155405
3       lord  1122.407428
1     israel  1095.934739
2       king   946.107111
6        god   784.411580
4     christ   779.406605
5     believ   750.066566
7   muhammad   608.737671
12     faith   543

In [305]:
negative_mi_x2 = pd.read_csv('negative_mi_x2.csv')
positive_mi_x2 = pd.read_csv('positive_mi_x2.csv')
neutral_mi_x2 = pd.read_csv('neutral_mi_x2.csv')

sentiment_mi_x2 = [negative_mi_x2, positive_mi_x2, neutral_mi_x2]
sentiment_names = ['Negative', 'Positive', 'Neutral']

for i in range(len(sentiment_mi_x2)):
    sentiment = sentiment_mi_x2[i]
    # sentiment.sort_values(by='Mutual Information', ascending=False, inplace=True)
    # print(sentiment_names[i])
    # print("Sorted by Mutual Information")
    # print(sentiment[['Word', 'Mutual Information']].head(10))
    # print("")
    print("Sorted by Chi-Squared")
    sentiment.sort_values(by='Chi-Squared', ascending=False, inplace=True)
    print(sentiment[['Word', 'Chi-Squared']].head(10))
    print("")

Sorted by Chi-Squared
             Word  Chi-Squared
6131         fuck   291.720358
5870        trump   264.124993
6385         nazi   243.258834
7593        liber   196.068130
850          kill   166.757587
4526  supremacist   153.586713
7871         shit   144.998009
4986     tomorrow   141.918684
1278         hate   136.328450
4177      leftist   135.204272

Sorted by Chi-Squared
          Word  Chi-Squared
7558      love   432.412343
5447     happi   417.099110
4986  tomorrow   270.072825
8904     great   259.995147
3409       day   249.174868
1099      good   227.620259
5579     excit   213.417963
5870     trump   187.044896
6694  birthday   182.047174
2150     cream   134.707281

Sorted by Chi-Squared
          Word  Chi-Squared
5447     happi   188.578216
7558      love   185.970397
8904     great   117.515366
2780      http   116.613708
1099      good   110.814336
5579     excit    97.672683
3409       day    89.804153
6131      fuck    82.820656
5608     enjoy    62.749695
669