In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pickle
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# load the model and the vectorizer from the loaded model
with open('model.pkl', 'rb') as fin:
  vectorizer, clf = pickle.load(fin)


In [3]:
train_distance_label_1 = []
train_distance_label_2 = []

test_distance_label_1 = []
test_distance_label_2 = []


In [5]:
# read the input files
df = pd.read_json("train.jsonl", lines=True)
test = pd.read_json("dev.jsonl", lines=True)


# put in the format that the model recognizes
df_h1 = pd.DataFrame({
    'text': df['obs1'].replace(r'\n', ' ', regex= True) + ' ' + df['obs2'].replace(r'\n', ' ', regex= True) + ' ' + df['hyp1'].replace(r'\n', ' ', regex= True)
})

df_h2 = pd.DataFrame({
    'text': df['obs1'].replace(r'\n', ' ', regex= True) + ' ' + df['obs2'].replace(r'\n', ' ', regex= True) + ' ' + df['hyp2'].replace(r'\n', ' ', regex= True)
})

test_h1 = pd.DataFrame({
    'text': test['obs1'].replace(r'\n', ' ', regex= True) + ' ' + test['obs2'].replace(r'\n', ' ', regex= True) + ' ' + test['hyp1'].replace(r'\n', ' ', regex= True)
})

test_h2 = pd.DataFrame({
    'text': test['obs1'].replace(r'\n', ' ', regex= True) + ' ' + test['obs2'].replace(r'\n', ' ', regex= True) + ' ' + test['hyp2'].replace(r'\n', ' ', regex= True)
})



# cannot process the whole dataset at once, so gotta split it up into batches and concat the results
prev_index = 0
index = 50000
while index < len(df) + 50000:
    if index < len(df):
        h1 = vectorizer.transform(df_h1['text'][prev_index:index]).toarray()
        pred_1 = clf.predict(h1)
        
        h2 = vectorizer.transform(df_h2['text'][prev_index:index]).toarray()
        pred_2 = clf.predict(h2)
    else:
        h1 = vectorizer.transform(df_h1['text'][prev_index:len(df)]).toarray()
        pred_1 = clf.predict(h1)
        
        h2 = vectorizer.transform(df_h2['text'][prev_index:len(df)]).toarray()
        pred_2 = clf.predict(h2)
    
    prev_index = index
    index += 50000
    train_distance_label_1 += pred_1.tolist()
    train_distance_label_2 += pred_2.tolist()

    
h1 = vectorizer.transform(test_h1['text']).toarray()
test_distance_label_1 = clf.predict(h1).tolist()


h2 = vectorizer.transform(test_h2['text']).toarray()
test_distance_label_2 = clf.predict(h2).tolist()


In [4]:
# train_distance_label_1 = pd.DataFrame(train_distance_label_1, columns = ['label'])
# train_distance_label_2 = pd.DataFrame(train_distance_label_2, columns = ['label'])
# test_distance_label_1 = pd.DataFrame(test_distance_label_1, columns = ['label'])
# test_distance_label_2 = pd.DataFrame(test_distance_label_2, columns = ['label'])




In [6]:
train_distance_label_1

[1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [7]:

labels = None
with open("train-labels.lst") as l:
    labels = l.readlines()
    labels = [int(int(line.rstrip()) == 2) for line in labels]

train_labels = pd.DataFrame(labels, columns =['label'])

with open("dev-labels.lst") as l:
    labels = l.readlines()
    labels = [int(int(line.rstrip()) == 2) for line in labels]

dev_labels = pd.DataFrame(labels, columns =['label'])


# process the input data
df_new = pd.DataFrame({
        'text_a': '[CLS] ' + df['obs1'].replace(r'\n', ' ', regex= True) + ' [SEP] ' + df['obs2'].replace(r'\n', ' ', regex= True) + ' [SEP] ' + df['hyp1'].replace(r'\n', ' ', regex= True) + ' [SEP] ' ,#+ "0",# str(train_distance_label_1['label'].replace(r'\n', ' ', regex= True)),
        'text_b': '[CLS] ' + df['obs1'].replace(r'\n', ' ', regex= True) + ' [SEP] ' + df['obs2'].replace(r'\n', ' ', regex= True) + ' [SEP] ' + df['hyp2'].replace(r'\n', ' ', regex = True) + ' [SEP] ' , # + "0",# str(train_distance_label_2['label'].replace(r'\n', ' ', regex= True)),
        'labels': train_labels['label']
    })

    
test_new = pd.DataFrame({ 
        'text_a': '[CLS] ' + test['obs1'].replace(r'\n', ' ', regex= True) + ' [SEP] ' + test['obs2'].replace(r'\n', ' ', regex= True) + ' [SEP] ' + test['hyp1'].replace(r'\n', ' ', regex= True) + ' [SEP] ', #+ #str(test_distance_label_1['label'].replace(r'\n', ' ', regex= True)),
        'text_b': '[CLS] ' + test['obs1'].replace(r'\n', ' ', regex= True) + ' [SEP] ' + test['obs2'].replace(r'\n', ' ', regex= True) + ' [SEP] ' + test['hyp2'].replace(r'\n', ' ', regex = True) + ' [SEP] ', #+ str(test_distance_label_2['label'].replace(r'\n', ' ', regex= True)),
        'labels': dev_labels['label']
    })


In [14]:
for index, row in df_new.iterrows():
    df_new['text_a'][index] = row['text_a'] + " " + str(train_distance_label_1[index])
    df_new['text_b'][index] = row['text_b'] + " " + str(train_distance_label_2[index])

df_new['text_a'][0]
df_new['text_b'][30]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


'[CLS] Brenna and I used to be best friends. [SEP] We never talked again. [SEP] Brenna and I had a great conversation. [SEP]  1'

In [15]:
for index, row in test_new.iterrows():
    test_new['text_a'][index] = row['text_a'] + " " + str(test_distance_label_1[index])
    test_new['text_b'][index] = row['text_b'] + " " + str(test_distance_label_2[index])
    
test_new['text_a'][0]
test_new['text_b'][30]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


'[CLS] I used to procrastinate about studying. [SEP] Now, I never procrastinate studying. [SEP] After getting a good grade, I learned an easy lesson. [SEP]  1'

In [20]:
df_new['text_a'][0] = df_new['text_a'][0][:-3]
df_new['text_a'][0]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


'[CLS] Chad went to get the wheel alignment measured on his car. [SEP] The mechanic provided a working alignment with new body work. [SEP] Chad was waiting for his car to be washed. [SEP]  1'

In [21]:
# dump into pickles
df_new.to_pickle('train.pkl')
test_new.to_pickle('dev.pkl')

In [22]:
# test the pickle by loading it
unpickled_df = pd.read_pickle("train.pkl")
unpickled_df['text_a'][0]

'[CLS] Chad went to get the wheel alignment measured on his car. [SEP] The mechanic provided a working alignment with new body work. [SEP] Chad was waiting for his car to be washed. [SEP]  1'