In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import re
import nltk

# some prereqs:
import collections

# regular expressions
import re

# for string.punctuation: list of punctuation characters
import string

import sklearn

# import this for storing our BOW format
import scipy
from scipy import sparse

# numpy for more easily storing multidimensional data
import numpy as np

# scikit learn. Contains lots of ML models we can use
# import the library for support vector machines
from sklearn import svm
from sklearn import ensemble
from sklearn.metrics import classification_report

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# IR Evaluation

In [2]:
relevant_docs = pd.read_csv('qrels.csv')
system_results = pd.read_csv('ttdssystemresults.csv')


In [3]:
def precision(retrieved, relevant):
    retrieved_set = set(retrieved)
    relevant_set = set(relevant)
    return len(retrieved_set.intersection(relevant_set)) / len(retrieved_set)

def recall(retrieved, relevant):
    retrieved_set = set(retrieved)
    relevant_set = set(relevant)
    return len(retrieved_set.intersection(relevant_set)) / len(relevant_set)

def average_precision(retrieved, relevant):
    precision_sum = 0
    for i in range(1, len(retrieved) + 1):
        if retrieved[i - 1] in relevant:
            precision_sum += precision(retrieved[:i], relevant)
    return precision_sum / len(relevant)

def nDCG(retrieved, relevant, relevance_scores):
    scores = {}
    for i in range(len(relevant)):
        scores[relevant[i]] = relevance_scores[i]
    retrieved_scores = []
    for doc in retrieved:
        if doc in scores:
            retrieved_scores.append(scores[doc])
        else:
            retrieved_scores.append(0)

    DCG = retrieved_scores[0]
    for i in range(1, len(retrieved_scores)):
        DCG += retrieved_scores[i] / np.log2(i + 1)
    retrieved_scores.sort(reverse=True)
    IDCG = retrieved_scores[0]
    for i in range(1, len(retrieved_scores)):
        IDCG += retrieved_scores[i] / np.log2(i + 1)
    if IDCG == 0:
        return 0
    
    print("DCG", DCG)
    print("IDCG", IDCG)
    return DCG / IDCG


In [4]:
query_ids = system_results['query_number'].unique()
systems = system_results['system_number'].unique()

results = pd.DataFrame(columns=['system_number', 'query_number', 'P@10', 'R@50', 'r-precision', 'AP', 'nDCG@10', 'nDCG@20'])

for system in systems:
    print("Processing system", system)
    precision_10s = []
    recall_50s = []
    r_precisions = []
    aps = []
    nDCG_10s = []
    nDCG_20s = []
    for query_id in query_ids:
        print("Query ID", query_id)
        relevant = relevant_docs[relevant_docs['query_id'] == query_id]['doc_id'].values
        relevance_scores = relevant_docs[relevant_docs['query_id'] == query_id]['relevance'].values
        retrieved = system_results[(system_results['query_number'] == query_id) & (system_results['system_number'] == system)].sort_values(by='rank_of_doc')['doc_number'].values

        precision_10 = precision(retrieved[:10], relevant)
        recall_50 = recall(retrieved[:50], relevant)
        r_precision = precision(retrieved[:len(relevant)], relevant)
        ap = average_precision(retrieved, relevant)
        nDCG_10 = nDCG(retrieved[:10], relevant, relevance_scores)
        nDCG_20 = nDCG(retrieved[:20], relevant, relevance_scores)
        
        results.loc[len(results)] = [int(system), int(query_id), precision_10, recall_50, r_precision, ap, nDCG_10, nDCG_20]

        precision_10s.append(precision_10)
        recall_50s.append(recall_50)
        r_precisions.append(r_precision)
        aps.append(ap)
        nDCG_10s.append(nDCG_10)
        nDCG_20s.append(nDCG_20)

        
    results.loc[len(results)] = [int(system), 'mean', np.mean(precision_10s), np.mean(recall_50s), np.mean(r_precisions), np.mean(aps), np.mean(nDCG_10s), np.mean(nDCG_20s)]


Processing system 1
Query ID 1
DCG 2.997147735133648
IDCG 6.7618595071429155
DCG 2.997147735133648
IDCG 6.7618595071429155
Query ID 2
DCG 1.7317065537373744
IDCG 2.6309297535714578
DCG 3.855081630315694
IDCG 7.268929392892205
Query ID 3
DCG 0.6941346394792774
IDCG 3.0
Query ID 4
DCG 5.902918195508732
IDCG 8.579388872450851
DCG 8.995617437316119
IDCG 12.803884063230115
Query ID 5
DCG 1.0177825608059992
IDCG 2.0
DCG 1.2575950273741305
IDCG 2.6309297535714578
Query ID 6
DCG 6.508353076911336
IDCG 8.304666305987414
DCG 8.320363096230484
IDCG 11.084361790882394
Query ID 7
DCG 3.3010299956639813
IDCG 4.0
DCG 3.3010299956639813
IDCG 4.0
Query ID 8
DCG 6.018590298918789
IDCG 7.57938887245085
DCG 6.956195252178463
IDCG 9.768929392892208
Query ID 9
DCG 6.6137831393950375
IDCG 8.584394269677935
DCG 9.910340050543956
IDCG 13.294270125579557
Query ID 10
DCG 1.1821938260239127
IDCG 5.0
Processing system 2
Query ID 1
DCG 0.6666666666666666
IDCG 2.0
DCG 1.2245525579689263
IDCG 4.0
Query ID 2
DCG 0.830

In [5]:
results['system_number'] = results['system_number'].astype(int)
results['query_number'] = results['query_number'].apply(lambda x: 'mean' if x == 'mean' else int(x))
results = results.round(3)
save_results = results.to_csv('ir_eval.csv', index=False)


In [6]:
mean_results = results[results['query_number'] == 'mean']
for column in ['P@10', 'R@50', 'r-precision', 'AP', 'nDCG@10', 'nDCG@20']:
    df = mean_results[['system_number', column]]
    df = df.set_index('system_number')
    df.sort_values(by=column, ascending=False, inplace=True)
    print("Best Model for", column, ":", df.index[0])

    best_model = df.index[0]
    second_best_model = df.index[1]
    best_model_value = df.loc[best_model][column]
    second_best_model_value = df.loc[second_best_model][column]
    # two tailed t-test
    print(best_model, "vs", second_best_model)
    print(best_model_value, "vs", second_best_model_value)
    t_statistic = (best_model_value - second_best_model_value) / np.sqrt((best_model_value * (1 - best_model_value) / 50) + (second_best_model_value * (1 - second_best_model_value) / 50))
    print("t-statistic:", t_statistic)
    print("p-value:", 2 * (1 - stats.t.cdf(t_statistic, 98)))
    print("")

    

Best Model for P@10 : 3
3 vs 5
0.41 vs 0.41
t-statistic: 0.0
p-value: 1.0

Best Model for R@50 : 2
2 vs 1
0.867 vs 0.834
t-statistic: 0.46322462441985235
p-value: 0.6442304510971641

Best Model for r-precision : 3
3 vs 6
0.448 vs 0.448
t-statistic: 0.0
p-value: 1.0

Best Model for AP : 3
3 vs 6
0.451 vs 0.445
t-statistic: 0.060328233925037125
p-value: 0.952017072315338

Best Model for nDCG@10 : 3
3 vs 6
0.592 vs 0.571
t-statistic: 0.21289482573886245
p-value: 0.8318512285899999

Best Model for nDCG@20 : 3
3 vs 1
0.584 vs 0.566
t-statistic: 0.18209000555013286
p-value: 0.8558882168842699



# Text Analysis

In [7]:
bible_and_quran = pd.read_csv("bible_and_quran.tsv", sep='\t', header=None)
bible_and_quran.columns = ['Source', 'Text']
ot, nt, quran = bible_and_quran[bible_and_quran['Source'] == 'OT'], bible_and_quran[bible_and_quran['Source'] == 'NT'], bible_and_quran[bible_and_quran['Source'] == 'Quran']

In [16]:
# def tokenize(text):
#     """
#     Tokenize the text and return a list of words.
#     Tokenisation is done by splitting the text, making it lowercase, and replacing any non-alphanumeric characters with spaces.
#     :param text: The text to be tokenized
#     :return: A list of words
#     """
#     tokens = text.split()
#     words = [re.sub(r'[^a-zA-Z0-9]', '', token) for token in tokens]
#     words = [word.lower() for word in words if word != '' or word != ' ']
#     words = [word.strip() for word in words]
#     return words

# def remove_stopwords(words, stop_words_file="stop_words.txt"):
#     """
#     Remove stopwords from the list of words and return the filtered list.
#     Stopwords are read from the file specified in the stop_words_file parameter.
#     :param words: The list of words
#     :param stop_words_file: The path to the file containing the stopwords
#     :return: The filtered list of words
#     """
#     with open(stop_words_file, 'r') as f:
#         stop_words = f.readlines()
#     stop_words = [word.strip() for word in stop_words]
#     filtered_words = [word for word in words if word not in stop_words]
#     return filtered_words

# def stem(words):
#     """
#     Stem the words using the Porter stemmer and return the list of stemmed words.
#     :param words: The list of words
#     :return: The list of stemmed words
#     """
#     stemmer = nltk.stem.PorterStemmer()
#     return [stemmer.stem(word) for word in words]

def preprocess(text, stop_words_file="stop_words.txt"):
    """
    Preprocess the text by tokenizing, removing stopwords, and stemming the words.
    :param text: The text to be preprocessed
    :param stopping: A boolean value indicating whether to remove stopwords or not
    :return: The list of preprocessed words
    """
    with open(stop_words_file, 'r') as f:
        stop_words = f.readlines()
    stop_words = [word.strip() for word in stop_words]

    tokens = text.split()
    tokens = [token.strip() for token in tokens]
    tokens = [token.lower() for token in tokens]
    tokens = [token for token in tokens if token not in stop_words]
    words = [re.sub(r'[^a-zA-Z0-9]', '', token) for token in tokens]
    words = [word for word in words if word != '' or word != ' ']
    stemmer = nltk.stem.PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    return words

def preprocess_corpus(corpus):
    """
    Preprocess the entire corpus by tokenizing, removing stopwords, and stemming the words.
    :param corpus: The corpus to be preprocessed
    :param stopping: A boolean value indicating whether to remove stopwords or not
    :return: The list of preprocessed words
    """
    preprocessed_corpus = []
    for doc in corpus['Text']:
        words = preprocess(doc)
        preprocessed_corpus.append(words)
    return preprocessed_corpus

In [17]:
ot_tokens = preprocess_corpus(ot)
nt_tokens = preprocess_corpus(nt)
quran_tokens = preprocess_corpus(quran)

In [18]:
# Compute Mutual Information

def mutual_information_and_chi_squared(word, corpus, other_corpuses):
    """
    Compute the mutual information of a word in two corpora.
    :param word: The word for which to compute the mutual information
    :param corpus1: The first corpus
    :param corpus2: The second corpus
    :return: The mutual information of the word in the two corpora
    """

    N_11 = 0
    N_10 = 0
    N_01 = 0
    N_00 = 0

    for doc in corpus:
        if word in doc:
            N_11 += 1
        else:
            N_01 += 1
    for doc in other_corpuses:
        if word in doc:
            N_10 += 1
        else:
            N_00 += 1

    N = N_11 + N_10 + N_01 + N_00
    if N_00 == 0 or N_01 == 0 or N_10 == 0 or N_11 == 0:
        return 0, 0

    mi = N_11 / N * np.log2(N * N_11 / ((N_10 + N_11) * (N_01 + N_11))) + \
        N_01 / N * np.log2(N * N_01 / ((N_00 + N_01) * (N_01 + N_11))) + \
        N_10 / N * np.log2(N * N_10 / ((N_10 + N_11) * (N_00 + N_10))) + \
        N_00 / N * np.log2(N * N_00 / ((N_00 + N_01) * (N_00 + N_10)))
    
    chi_squared = N * (N_11 * N_00 - N_10 * N_01) ** 2 / ((N_11 + N_01) * (N_11 + N_10) * (N_10 + N_00) * (N_01 + N_00))
    
    return mi, chi_squared

def top_mutual_information_chi_squared(corpus, other_corpuses):
    """
    Compute the top n words with the highest mutual information in two corpora.
    :param corpus1: The first corpus
    :param corpus2: The second corpus
    :param n: The number of words to return
    :return: A df containing the words and their mutual information and chi-squared values
    """
    df = pd.DataFrame(columns=['Word', 'Mutual Information', 'Chi-Squared'])
    words = set()
    for doc in corpus:
        new_words = set(doc)
        words = words.union(new_words)
    # for doc in other_corpuses:
    #     new_words = set(doc)
    #     words = words.union(new_words)
    print("Unique Tokens: " + str(len(words)))
    i = 0
    for word in words:
        mi, chi_squared = mutual_information_and_chi_squared(word, corpus, other_corpuses)
        df.loc[len(df)] = [word, mi, chi_squared]
        i += 1
        if i % 1000 == 0:
            print(str(i) + " tokens processed")
    return df

In [20]:
# ot_top_mi_chi = top_mutual_information_chi_squared(ot_tokens, nt_tokens+quran_tokens)
# ot_top_mi_chi.sort_values(by='Mutual Information', ascending=False, inplace=True)
# ot_top_mi_chi.to_csv('ot_mi_x2.csv', index=False)

nt_top_mi_chi = top_mutual_information_chi_squared(nt_tokens, ot_tokens+quran_tokens)
nt_top_mi_chi.sort_values(by='Mutual Information', ascending=False, inplace=True)
nt_top_mi_chi.to_csv('nt_mi_x2.csv', index=False)

quran_top_mi_chi = top_mutual_information_chi_squared(quran_tokens, ot_tokens+nt_tokens)
quran_top_mi_chi.sort_values(by='Mutual Information', ascending=False, inplace=True)
quran_top_mi_chi.to_csv('quran_mi_x2.csv', index=False)



Unique Tokens: 3951
1000 tokens processed
2000 tokens processed
3000 tokens processed
Unique Tokens: 3348
1000 tokens processed
2000 tokens processed
3000 tokens processed


# Topic Modelling (LDA with Gibbs Sampling)

In [21]:
from gensim import corpora
from gensim.models import LdaModel

text = ot_tokens + nt_tokens + quran_tokens

dictionary = corpora.Dictionary(text)
corpus = [dictionary.doc2bow(doc) for doc in text]

lda_model = LdaModel(corpus, num_topics=20, id2word=dictionary)


In [22]:
# For each corpus, compute the average score for each topic by summing the document-topic probability for each document in that corpus and dividing by the total number of documents in the corpus. 

corpuses = [ot_tokens, nt_tokens, quran_tokens]
corpus_names = ['OT', 'NT', 'Quran']
corpus_topics = dict()
for i in range(len(corpuses)):
    corpus = corpuses[i]
    corpus_name = corpus_names[i]
    corpus_topic = np.zeros(20)
    for doc in corpus:
        bow = dictionary.doc2bow(doc)
        doc_topics = lda_model.get_document_topics(bow)
        for topic in doc_topics:
            corpus_topic[topic[0]] += topic[1]
    corpus_topic /= len(corpus)
    corpus_topics[corpus_name] = corpus_topic
    



In [23]:
# top 3 topics for each corpus
for corpus_name in corpus_names:
    print("Top 3 topics for", corpus_name)
    topics = corpus_topics[corpus_name]
    top_topics = np.argsort(topics)[::-1][:3]
    for topic in top_topics:
        print("Topic", topic, ":", lda_model.print_topic(topic))
    print("")

Top 3 topics for OT
Topic 13 : 0.090*"god" + 0.063*"believ" + 0.042*"lord" + 0.031*"suffer" + 0.029*"you" + 0.027*"reward" + 0.026*"peopl" + 0.025*"judgment" + 0.022*"them" + 0.020*"righteous"
Topic 2 : 0.055*"god" + 0.048*"said" + 0.036*"them" + 0.033*"i" + 0.029*"peopl" + 0.028*"say" + 0.026*"seek" + 0.026*"you" + 0.025*"him" + 0.023*"lord"
Topic 4 : 0.033*"other" + 0.031*"abraham" + 0.030*"deni" + 0.025*"measur" + 0.024*"children" + 0.020*"sea" + 0.020*"tribe" + 0.017*"disput" + 0.017*"inherit" + 0.016*"increas"

Top 3 topics for NT
Topic 2 : 0.055*"god" + 0.048*"said" + 0.036*"them" + 0.033*"i" + 0.029*"peopl" + 0.028*"say" + 0.026*"seek" + 0.026*"you" + 0.025*"him" + 0.023*"lord"
Topic 7 : 0.104*"god" + 0.051*"earth" + 0.050*"heaven" + 0.045*"worship" + 0.043*"you" + 0.024*"power" + 0.023*"them" + 0.023*"lord" + 0.022*"evil" + 0.021*"speak"
Topic 17 : 0.058*"god" + 0.046*"thing" + 0.033*"favor" + 0.031*"spirit" + 0.024*"lord" + 0.023*"time" + 0.022*"you" + 0.022*"thi" + 0.021*"war

# Text Classification

In [9]:
# twitter_file = "train.txt"
# twitter = pd.read_csv(twitter_file, sep='\t', header=None)
# twitter.columns = ['ID','Sentiment', 'Text']
# twitter = twitter[1:]
# twitter.drop(columns=['ID'], inplace=True)


In [10]:
def preprocess_data(data):
    
    chars_to_remove = re.compile(f'[{string.punctuation}]')
    
    documents = []
    categories = []
    
    lines = data.split('\n')
    
    for line in lines[1:]:
        # make a dictionary for each document
        # word_id -> count (could also be tf-idf score, etc.)
        line = line.strip()
        if line:
            # split on tabs, we have 3 columns in this tsv format file
            tweet_id, category, tweet = line.split('\t')

            # process the words
            words = chars_to_remove.sub('',tweet).lower().split()
            # add the list of words to the documents list
            documents.append(words)
            # add the category to the categories list
            categories.append(category)
            
    return documents, categories

In [11]:
twitter_file = "train.txt"
with open(twitter_file, 'r', encoding='utf-8') as f:
    data = f.read()
documents, categories = preprocess_data(data)

In [12]:
# Split into training and test sets

ratio = 0.8
split = int(len(documents) * ratio)
train_docs = documents[:split]
train_cat = categories[:split]
test_docs = documents[split:]
test_cats = categories[split:]

train_vocab = set()
for doc in train_docs:
    for word in doc:
        train_vocab.add(word)

test_vocab = set()
for doc in test_docs:
    for word in doc:
        test_vocab.add(word)


In [13]:
word2id = {}
for word_id,word in enumerate(train_vocab):
    word2id[word] = word_id
    
cat2id = {}
for cat_id,cat in enumerate(set(train_cat)):
    cat2id[cat] = cat_id

In [14]:
# build a BOW representation of the files: use the scipy 
# data is the preprocessed_data
# word2id maps words to their ids
def convert_to_bow_matrix(preprocessed_data, word2id):
    
    # matrix size is number of docs x vocab size + 1 (for OOV)
    matrix_size = (len(preprocessed_data),len(word2id)+1)
    oov_index = len(word2id)
    # matrix indexed by [doc_id, token_id]
    X = scipy.sparse.dok_matrix(matrix_size)

    # iterate through all documents in the dataset
    for doc_id,doc in enumerate(preprocessed_data):
        for word in doc:
            # default is 0, so just add to the count for this word in this doc
            # if the word is oov, increment the oov_index
            X[doc_id,word2id.get(word,oov_index)] += 1
    
    return X

X_train = convert_to_bow_matrix(train_docs, word2id)
y_train = [cat2id[cat] for cat in train_cat]

In [24]:
def get_train_test_data(file, ratio):
    with open(file, 'r', encoding='utf-8') as f:
        data = f.read()
    documents, categories = preprocess_data(data)
    split = int(len(documents) * ratio)
    train_docs = documents[:split]
    train_cat = categories[:split]
    test_docs = documents[split:]
    test_cats = categories[split:]

    train_vocab = set()
    for doc in train_docs:
        for word in doc:
            train_vocab.add(word)

    test_vocab = set()
    for doc in test_docs:
        for word in doc:
            test_vocab.add(word)

    word2id = {}
    for word_id,word in enumerate(train_vocab):
        word2id[word] = word_id
    
    cat2id = {}
    for cat_id,cat in enumerate(set(train_cat)):
        cat2id[cat] = cat_id

    X_train = convert_to_bow_matrix(train_docs, word2id)
    y_train = [cat2id[cat] for cat in train_cat]

    X_test = convert_to_bow_matrix(test_docs, word2id)
    y_test = [cat2id[cat] for cat in test_cats]

    return X_train, y_train, X_test, y_test

In [15]:
# model = sklearn.svm.SVC(C=1000, kernel ="linear")
# # then train the model!
# model.fit(X_train,y_train)

In [17]:
# y_train_predictions = model.predict(X_train)

# now can compute any metrics we care about. Let's quickly do accuracy
def compute_accuracy(predictions, true_values):
    num_correct = 0
    num_total = len(predictions)
    for predicted,true in zip(predictions,true_values):
        if predicted==true:
            num_correct += 1
    return num_correct / num_total

# accuracy = compute_accuracy(y_train_predictions,y_train)
# print("Accuracy:",accuracy)

In [20]:

# y_test_predictions = model.predict(X_test)
# cat_names = []
# for cat,cid in sorted(cat2id.items(),key=lambda x:x[1]):
#     cat_names.append(cat)
# print(classification_report(y_test, y_test_predictions, target_names=cat_names))

In [21]:
# model = sklearn.ensemble.RandomForestClassifier()
# model.fit(X_train,y_train)

# y_train_predictions = model.predict(X_train)
# print("Train accuracy was:",compute_accuracy(y_train_predictions,y_train))
# y_test_predictions = model.predict(X_test)
# print("Test accuracy was:",compute_accuracy(y_test_predictions,y_test))

In [22]:
def get_model_accuracy(model, X_train, y_train, X_test, y_test):
    model.fit(X_train,y_train)
    y_train_predictions = model.predict(X_train)
    train_accuracy = compute_accuracy(y_train_predictions,y_train)
    y_test_predictions = model.predict(X_test)
    test_accuracy = compute_accuracy(y_test_predictions,y_test)
    return train_accuracy, test_accuracy

# model_list = [sklearn.svm.SVC(C=1000, kernel ="linear"), sklearn.ensemble.RandomForestClassifier(), sklearn.ensemble.GradientBoostingClassifier(), sklearn.linear_model.LogisticRegression(), sklearn.naive_bayes.MultinomialNB(), sklearn.naive_bayes.BernoulliNB(), sklearn.neighbors.KNeighborsClassifier(), sklearn.tree.DecisionTreeClassifier(), sklearn.neural_network.MLPClassifier()]
# model_names = ['SVM', 'Random Forest', 'Gradient Boosting', 'Logistic Regression', 'Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'K-Nearest Neighbors', 'Decision Tree', 'MLP']

# train_accuracies = []
# test_accuracies = []

# for model in model_list:
#     train_accuracy, test_accuracy = get_model_accuracy(model, X_train, y_train, X_test, y_test)
#     train_accuracies.append(train_accuracy)
#     test_accuracies.append(test_accuracy)
#     print("Model:", model)
#     print("Train accuracy was:", train_accuracy)
#     print("Test accuracy was:", test_accuracy)
#     print("")

In [25]:
# Logistic regression parameter sweep

X_train, y_train, X_test, y_test = get_train_test_data("train.txt", 0.9)

C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
train_accuracies = []
test_accuracies = []

for C in C_values:
    model = sklearn.linear_model.LogisticRegression(C=C)
    train_accuracy, test_accuracy = get_model_accuracy(model, X_train, y_train, X_test, y_test)
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)
    print("Model:", model)
    print("Train accuracy was:", train_accuracy)
    print("Test accuracy was:", test_accuracy)
    print("")

Model: LogisticRegression(C=0.001)
Train accuracy was: 0.5226744532507002
Test accuracy was: 0.500804289544236

Model: LogisticRegression(C=0.01)
Train accuracy was: 0.6384005720755617
Test accuracy was: 0.5715817694369973

Model: LogisticRegression(C=0.1)
Train accuracy was: 0.8103211965913831
Test accuracy was: 0.6300268096514745



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression(C=1)
Train accuracy was: 0.9737202788868363
Test accuracy was: 0.6187667560321716



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression(C=10)
Train accuracy was: 0.9966033013527203
Test accuracy was: 0.5989276139410188



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression(C=100)
Train accuracy was: 0.9991657231392647
Test accuracy was: 0.5903485254691689

Model: LogisticRegression(C=1000)
Train accuracy was: 0.9994040879566176
Test accuracy was: 0.5898123324396782

