**TODO: Include references to the public repositories used!**





In [19]:
import pickle


def read_data(data_dir='./data/preprocessed'):
    """Load the pickled sentence and label lists for every dataset split.

    For each split (``train``, ``test``, ``lalor``) and each field
    (``sentence1``, ``sentence2``, ``gold_label``) the file
    ``<data_dir>/<split>_list_<field>.txt`` is unpickled.

    Args:
        data_dir: Directory holding the nine pickle files. Defaults to the
            location that was previously hard-coded.

    Returns:
        list: Nine single-element lists, each wrapping one unpickled object,
        ordered train (sentence1, sentence2, gold_label), then test, then
        lalor. Callers unwrap an entry with ``data[i][0]``.

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    splits = ('train', 'test', 'lalor')
    fields = ('sentence1', 'sentence2', 'gold_label')

    data = []
    for split in splits:
        for field in fields:
            path = '{}/{}_list_{}.txt'.format(data_dir, split, field)
            # NOTE: pickle.load can execute arbitrary code; only point this
            # at files produced by a trusted preprocessing step.
            with open(path, "rb") as file:
                # Each object is wrapped in its own list to keep the
                # data[i][0] access pattern used by the callers.
                data.append([pickle.load(file)])

    return data

In [20]:
import pickle
import numpy as np
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer


# Process the data and return the TFIDF features and the labels for the data
def TFIDF_features(data, mode):
    """Build concatenated TF-IDF features and integer labels for sentence pairs.

    Args:
        data: Sequence whose first three entries are single-element lists
            wrapping, in order, the tokenised first sentences, the tokenised
            second sentences and the gold labels. Entries after the third
            are ignored.
        mode: ``"train"`` fits a new ``TfidfVectorizer`` on the merged
            sentence pairs and pickles it to
            ``./checkpoints/TF-IDF/TFIDF.pickle``; ``"test"`` loads the
            vectorizer back from that file.

    Returns:
        tuple: ``(tfidf_feature_array, tfidf_labels)``. The first item is the
        horizontal stack of the sparse TF-IDF matrices of sentence 1 and
        sentence 2; the second is a list of ints (0 = contradiction,
        1 = neutral, 2 = entailment). Samples with any other gold label
        (e.g. "-") are dropped from both.

    Raises:
        ValueError: If ``mode`` is neither ``"train"`` nor ``"test"``, or if
            the three input lists differ in length.
    """
    # Fail fast with an exception: the previous print + exit(0) reported
    # success and, inside a notebook, shut the kernel down.
    if mode not in ("train", "test"):
        raise ValueError(
            "Invalid mode selection: {!r} (expected 'train' or 'test')".format(mode)
        )

    # Get the sentences and labels from composite data
    list_sentence1 = data[0][0]
    list_sentence2 = data[1][0]
    list_gold_label = data[2][0]

    if not (len(list_sentence1) == len(list_sentence2) == len(list_gold_label)):
        raise ValueError(
            "sentence1, sentence2 and gold_label lists must have the same length "
            "(got {}, {} and {})".format(
                len(list_sentence1), len(list_sentence2), len(list_gold_label)
            )
        )

    label_ids = {"contradiction": 0, "neutral": 1, "entailment": 2}

    # Single pass: keep only samples with a known gold label and merge each
    # token list into a whitespace-separated string.
    corpus_sentence1 = []
    corpus_sentence2 = []
    tfidf_labels = []
    for tokens1, tokens2, gold_label in zip(list_sentence1, list_sentence2, list_gold_label):
        label_id = label_ids.get(gold_label)
        if label_id is None:
            # Unlabelled sample (gold_label "-"): skip it entirely.
            continue
        corpus_sentence1.append(' '.join(tokens1))
        corpus_sentence2.append(' '.join(tokens2))
        tfidf_labels.append(label_id)

    if mode == "train":
        # The vectorizer is fitted on a composite corpus in which the two
        # sentences of each pair are merged into one document.
        corpus = [first + " " + second
                  for first, second in zip(corpus_sentence1, corpus_sentence2)]

        TFIDF_vect = TfidfVectorizer()
        TFIDF_vect.fit(corpus)

        # Stored so that test time reuses exactly the same vocabulary/weights.
        with open('./checkpoints/TF-IDF/TFIDF.pickle', "wb") as file:
            pickle.dump(TFIDF_vect, file)
    else:
        # NOTE: pickle.load can execute arbitrary code; only load checkpoints
        # written by the training branch above.
        with open('./checkpoints/TF-IDF/TFIDF.pickle', "rb") as file:
            TFIDF_vect = pickle.load(file)

    # Generate TFIDF representations for each side of the pair separately
    tfidf_sentence1 = TFIDF_vect.transform(corpus_sentence1)
    tfidf_sentence2 = TFIDF_vect.transform(corpus_sentence2)

    # TFIDF vectors for sentence1 and sentence2 are concatenated column-wise
    tfidf_feature_array = scipy.sparse.hstack((tfidf_sentence1, tfidf_sentence2))

    return tfidf_feature_array, tfidf_labels

In [21]:
# Imports
import pickle
from sklearn.linear_model import LogisticRegression


# Trains and stores a logistic regression model
def logistic_regression_train(train_data):
    """Fit a logistic regression classifier on TF-IDF features and pickle it.

    Args:
        train_data: Composite training data, passed unchanged to
            ``TFIDF_features`` in ``"train"`` mode.

    The fitted model is written to ``./checkpoints/TF-IDF/LR.pickle`` and a
    confirmation message is printed. Nothing is returned.
    """
    # Vectorise the training pairs (this call also fits and stores the vectorizer)
    features, labels = TFIDF_features(train_data, "train")

    # Build and fit the classifier in one expression; fit() returns the estimator
    classifier = LogisticRegression(
        random_state=0,
        max_iter=1000,
        solver='lbfgs',
        multi_class='auto',
    ).fit(features, labels)

    # Persist the trained classifier for the evaluation step
    with open('./checkpoints/TF-IDF/LR.pickle', "wb") as checkpoint:
        pickle.dump(classifier, checkpoint)

    print("Training complete.\n")

In [22]:
# Split the nine-element list returned by read_data() into one
# (sentence1, sentence2, gold_label) triple per dataset.
data = read_data()
train_data = data[:3]
# Elements 3-5 only: the previous 3:7 slice also pulled in the first Lalor list.
test_data = data[3:6]
lalor_data = data[6:]

In [23]:
# Fit the TF-IDF vectorizer and the logistic regression model on the training
# split; both are pickled under ./checkpoints/TF-IDF/ by the functions called here.
logistic_regression_train(train_data)

Training complete.



In [24]:
# Imports
import pickle

# Loads and tests the logistic regression model
def logistic_regression_test(test_data, output_path='./output/tfidf.txt'):
    """Evaluate the stored logistic regression model on a dataset.

    Args:
        test_data: Composite evaluation data, passed unchanged to
            ``TFIDF_features`` in ``"test"`` mode.
        output_path: Text file that receives one predicted label name per
            line. Defaults to the previously hard-coded location; pass a
            different path to avoid overwriting the results of an earlier
            evaluation.

    Returns:
        tuple: ``(pred_labels, pred_prob)``, the results of the model's
        ``predict`` and ``predict_proba`` calls on the TF-IDF features.
    """
    # Obtain the TFIDF features
    test_feature, test_label = TFIDF_features(test_data, "test")

    # Loads the logistic regression model from pickle file
    # NOTE: pickle.load can execute arbitrary code; only load checkpoints
    # produced by a trusted training run.
    with open('./checkpoints/TF-IDF/LR.pickle', "rb") as file:
        LR_model = pickle.load(file)

    # Tests the logistic regression model
    pred_labels = LR_model.predict(test_feature)
    pred_prob = LR_model.predict_proba(test_feature)

    # Integer class ids -> label names written to the output file
    label_names = {0: "contradiction", 1: "neutral", 2: "entailment"}
    with open(output_path, "w") as file:
        for item in pred_labels:
            name = label_names.get(item)
            # Ids outside the table are skipped, as before
            if name is not None:
                file.write(name + "\n")

    # Evaluate and print the results
    score = LR_model.score(test_feature, test_label) * 100
    print("The classification accuracy for Logistic regression with TF-IDF features is {:.2f}%.".format(score))

    return pred_labels, pred_prob

In [25]:
# Evaluate on the test split. As the last expression of the cell, the returned
# (pred_labels, pred_prob) tuple is also displayed below the accuracy message.
logistic_regression_test(test_data)

The classification accuracy for Logistic regression with TF-IDF features is 63.58%.


(array([0, 1, 2, ..., 0, 2, 1]),
 array([[0.46715191, 0.25189428, 0.28095381],
        [0.25301601, 0.4714402 , 0.27554379],
        [0.23638262, 0.31943414, 0.44418324],
        ...,
        [0.70467709, 0.11524288, 0.18008003],
        [0.29206364, 0.15854118, 0.54939518],
        [0.23228765, 0.6084261 , 0.15928625]]))

In [26]:
# Evaluate on the Lalor subset and keep the predicted labels and class probabilities.
# NOTE: this call writes ./output/tfidf.txt again, replacing the test-split
# predictions stored there by the previous evaluation.
pred, pred_prob = logistic_regression_test(lalor_data)

The classification accuracy for Logistic regression with TF-IDF features is 51.11%.


In [27]:
import pandas as pd

# Read the SNLI-lalor CSV into a DataFrame; the first row holds the column names.
df_lalor = pd.read_csv('../../data/human/SNLI-lalor/snli_human_4gs.csv', sep=',', header=0)


In [28]:
# Add empty placeholder columns that a later cell fills with the model outputs.
for column in ('prob_0', 'prob_1', 'prob_2', 'pred', 'entropy'):
    df_lalor[column] = ''
df_lalor

Unnamed: 0,sample_id,snli_id,sentence_1,sentence_2,label,item_difficulty,average_accuracy,flesch_score_textstat,mean_grade_level_textstat,number_of_words,number_of_characters,prob_0,prob_1,prob_2,pred,entropy
0,0,1947351225.jpg#0r1c,A little boy is opening gifts surrounded by a ...,The boy is being punished,contradiction,-1.759822,0.839139,78.75,6.0,19,101,,,,,
1,1,3626964430.jpg#0r1e,"People playing cricket in the park, pine trees...","People are playing sports in the park, near th...",entailment,-2.179087,0.886270,94.15,5.0,22,118,,,,,
2,2,4576144189.jpg#3r1e,Some people hanging out on a large backyard deck.,people hanging out on deck,entailment,-3.137178,0.951844,81.29,3.0,14,76,,,,,
3,3,507370108.jpg#3r1n,A group of dancers are performing.,The audience is silent.,neutral,-1.982105,0.865779,83.32,3.0,10,58,,,,,
4,4,3361210233.jpg#0r1n,A large brown and white dog is carrying a stic...,A puppy is playing fetch with a stick.,neutral,-0.280872,0.565574,93.14,4.0,24,118,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,85,3381788544.jpg#0r1c,"A group of asian women in sports attire, and o...",Men are playing with a dog,contradiction,-0.622317,0.641393,84.17,5.0,25,122,,,,,
86,86,3070485870.jpg#3r1n,A snowboarder is jumping over a snow slope.,A girl jumps her green snowboard over a slope.,neutral,-0.885338,0.695697,96.69,6.0,17,90,,,,,
87,87,4788967880.jpg#0r1n,a group of people on a dock lowering into the ...,The group was getting ready to go fishing on t...,neutral,-1.689649,0.829918,73.68,7.0,29,147,,,,,
88,88,4831683216.jpg#0r1c,A young girl in a bathing suit drinking a beve...,The girl is drinking milk from a sippy cup.,contradiction,0.523880,0.381148,85.18,6.0,23,113,,,,,


In [29]:
# Strip tab characters from the ids so they can be matched against the JSONL pairIDs.
# regex=False makes this a literal replacement regardless of the pandas version's default.
df_lalor['snli_id'] = df_lalor['snli_id'].str.replace('\t', '', regex=False)

In [30]:
# Map each snli_id to the index label of its row in df_lalor
# (if an id occurs more than once, the last row wins).
sample_indices = dict(zip(df_lalor['snli_id'], df_lalor.index))

In [31]:
from scipy.stats import entropy
import json

# Attach the model outputs to the matching rows of df_lalor.
# NOTE(review): assumes line i of the JSONL file corresponds to row i of
# pred / pred_prob; this would not hold if TFIDF_features dropped any
# sample of this dataset. TODO confirm.
# The context manager closes the file even if a lookup below raises.
with open('./data/snli_1.0_train_lalor.jsonl', 'r') as f:
    for index, line in enumerate(f):
        # Named `record` so the dataset list `data` from the earlier cell is not overwritten.
        record = json.loads(line)

        snli_id = record['pairID'].strip().replace('\t', '')
        row = sample_indices[snli_id]
        probs = pred_prob[index]

        df_lalor.at[row, 'pred'] = pred[index]
        df_lalor.at[row, 'prob_0'] = probs[0]
        df_lalor.at[row, 'prob_1'] = probs[1]
        df_lalor.at[row, 'prob_2'] = probs[2]
        # Entropy of the predicted distribution in bits (base 2)
        df_lalor.at[row, 'entropy'] = entropy(probs, base=2)
        # Confidence = probability of the most likely class
        df_lalor.at[row, 'confidence'] = max(probs)

In [32]:
# Translate the integer class ids in 'pred' into label names.
conversion_dict = {
    0: "contradiction",
    1: "neutral",
    2: "entailment",
}
df_lalor['predictions'] = df_lalor['pred'].replace(to_replace=conversion_dict)

In [33]:
# Write the annotated frame to CSV with a header row and without the index column.
df_lalor.to_csv("./output/snli_tfidf.csv", index=False, header=True)

In [34]:
# Fraction of rows whose predicted label name equals the gold label.
is_correct = df_lalor.label == df_lalor.predictions
len(df_lalor[is_correct]) / len(df_lalor)

0.5111111111111111