# Install and Load libraries

In [1]:
# Install trax

!pip install trax

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting trax
  Downloading trax-1.4.1-py2.py3-none-any.whl (637 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m637.9/637.9 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-text
  Downloading tensorflow_text-2.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
Collecting funcsigs
  Downloading funcsigs-1.0.2-py2.py3-none-any.whl (17 kB)
Installing collected packages: funcsigs, tensorflow-text, trax
Successfully installed funcsigs-1.0.2 tensorflow-text-2.12.1 trax-1.4.1


In [2]:
# import libraries

import os
import nltk
nltk.download('punkt')

import trax
from trax import layers as tl
from trax.supervised import training
from trax.fastmath import numpy as fastnp
import numpy as np
import pandas as pd
import random as rnd
from trax import shapes
from collections import defaultdict

# set random seeds
rnd.seed(34)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Mount google colab

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Preperation 

In [4]:
# dataset

data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/COMP0087-NLP/rephrased_first_5750.csv")
N=len(data)
print('Number of question pairs: ', N)

Number of question pairs:  5750


In [6]:
data['is_duplicate'] = 1
data

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,original_chatgpt,rephrased_chatgpt,is_duplicate
0,0,0.0,0,0,There are many different best seller lists tha...,The New York Times best seller list is one of ...,1
1,1,1.0,1,1,Salt is used on roads to help melt ice and sno...,Salt is widely used to melt ice and snow on ro...,1
2,2,2.0,2,2,There are a few reasons why we still have SD (...,Since some people still use older TVs that can...,1
3,3,3.0,3,3,It is generally not acceptable or ethical to a...,In light of the severe consequences that assas...,1
4,4,4.0,4,4,After the Wright Brothers made the first power...,After the Wright Brothers made the first power...,1
...,...,...,...,...,...,...,...
5745,5745,,245,5745,The human eye and brain work together to proce...,The human eye and brain work in tandem to proc...,1
5746,5746,,246,5746,"Before the invention of alarm clocks, people u...","Before the invention of alarm clocks, people h...",1
5747,5747,,247,5747,Death is often depicted as a skeleton or a per...,The image of Death as a skeleton or person wea...,1
5748,5748,,248,5748,"On Reddit, there is a community of people who ...",The Reddit community of people interested in c...,1


In [7]:
# Split into data_train amd data_test

N_train = 4600
N_test  = 1150
data_train = data[:N_train]
data_test  = data[N_train:N_train+N_test]
print("Train set:", len(data_train), "Test set:", len(data_test))
del(data) # remove to free memory

Train set: 4600 Test set: 1150


In [8]:
# Select only the pairs that are duplicate to train the model.

td_index = (data_train['is_duplicate'] == 1).to_numpy()
td_index = [i for i, x in enumerate(td_index) if x] 
print('number of duplicate questions: ', len(td_index))
print('indexes of first ten duplicate questions:', td_index[:10])

number of duplicate questions:  4600
indexes of first ten duplicate questions: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [9]:
Q1_train_words = np.array(data_train['original_chatgpt'][td_index])
Q2_train_words = np.array(data_train['rephrased_chatgpt'][td_index])

Q1_test_words = np.array(data_test['original_chatgpt'])
Q2_test_words = np.array(data_test['rephrased_chatgpt'])
y_test  = np.array(data_test['is_duplicate'])

In [10]:
#create arrays
Q1_train = np.empty_like(Q1_train_words)
Q2_train = np.empty_like(Q2_train_words)

Q1_test = np.empty_like(Q1_test_words)
Q2_test = np.empty_like(Q2_test_words)

In [11]:
# Building the vocabulary with the train set        

vocab = defaultdict(lambda: 0)
vocab['<PAD>'] = 1

for idx in range(len(Q1_train_words)):
    Q1_train[idx] = nltk.word_tokenize(Q1_train_words[idx][0])
    Q2_train[idx] = nltk.word_tokenize(Q2_train_words[idx][0])
    q = Q1_train[idx] + Q2_train[idx]
    for word in q:
        if word not in vocab:
            vocab[word] = len(vocab) + 1
print('The length of the vocabulary is: ', len(vocab))

The length of the vocabulary is:  41


In [12]:
print(vocab['<PAD>'])
print(vocab['Astrology'])
print(vocab['Astronomy'])  #not in vocabulary, returns 0

1
0
0


In [13]:
for idx in range(len(Q1_test_words)): 
    Q1_test[idx] = nltk.word_tokenize(Q1_test_words[idx][0])
    Q2_test[idx] = nltk.word_tokenize(Q2_test_words[idx][0])

In [14]:
# Converting questions to array of integers
for i in range(len(Q1_train)):
    Q1_train[i] = [vocab[word] for word in Q1_train[i]]
    Q2_train[i] = [vocab[word] for word in Q2_train[i]]

        
for i in range(len(Q1_test)):
    Q1_test[i] = [vocab[word] for word in Q1_test[i]]
    Q2_test[i] = [vocab[word] for word in Q2_test[i]]

In [16]:
# Splitting the data

cut_off = int(len(Q1_train)*.8)
train_Q1, train_Q2 = Q1_train[:cut_off], Q2_train[:cut_off]
val_Q1, val_Q2 = Q1_train[cut_off: ], Q2_train[cut_off:]
print('Number of duplicate questions: ', len(Q1_train))
print("The length of the training set is:  ", len(train_Q1))
print("The length of the validation set is: ", len(val_Q1))

Number of duplicate questions:  4600
The length of the training set is:   3680
The length of the validation set is:  920


# Main Implementation

In [17]:
# Implement the data generator 

def data_generator(Q1, Q2, batch_size, pad=1, shuffle=True):

    input1 = []
    input2 = []
    idx = 0
    len_q = len(Q1)
    question_indexes = [*range(len_q)]
    
    if shuffle:
        rnd.shuffle(question_indexes)
    
    while True:
        if idx >= len_q:
            # if idx is greater than or equal to len_q, set idx accordingly 
            idx = len_q
            # shuffle to get random batches if shuffle is set to True
            if shuffle:
                rnd.shuffle(question_indexes) 
        
        # get questions at the `question_indexes[idx]` position in Q1 and Q2
        q1 = Q1[question_indexes[idx]]
        q2 = Q2[question_indexes[idx]]
        
        # increment idx by 1
        idx += 1
        # append q1
        input1.append(q1)
        # append q2
        input2.append(q2)
        if len(input1) == batch_size:
            # determine max_len as the longest question in input1 & input 2
            # take max of input1 & input2 and then max out of the two of them.
            max_len = max(max([len(q) for q in input1]), max([len(q) for q in input2]))
            # pad to power-of-2 
            max_len = 2**int(np.ceil(np.log2(max_len)))
            b1 = [] 
            b2 = [] 
            for q1, q2 in zip(input1, input2):
                # add [pad] to q1 until it reaches max_len
                q1 = q1 + [pad] * (max_len - len(q1))
                # add [pad] to q2 until it reaches max_len
                q2 = q2 + [pad] * (max_len - len(q2))               
                # append q1
                b1.append(q1)
                # append q2
                b2.append(q2)
            # use b1 and b2
            yield np.array(b1), np.array(b2)
            # reset the batches
            input1, input2 = [], []  # reset the batches

In [20]:
# Implement the Siamese function

def Siamese(vocab_size=len(vocab), d_model=128, mode='train'):

    def normalize(x):  # normalizes the vectors to have L2 norm 1
        return x / fastnp.sqrt(fastnp.sum(x * x, axis=-1, keepdims=True))
    
    q_processor = tl.Serial( # Processor will run on Q1 and Q2. 
        tl.Embedding(vocab_size, d_model), # Embedding layer
        tl.LSTM(d_model), # LSTM layer
        tl.Mean(axis=1), # Mean over columns
        tl.Fn('Normalize', lambda x: normalize(x)), # Apply normalize function
    )  # Returns one vector of shape [batch_size, d_model]. 
    
    
    # Run on Q1 and Q2 in parallel.
    model = tl.Parallel(q_processor, q_processor)
    return model

In [21]:
# show the Siamese model architecture

model = Siamese()
print(model)

Parallel_in2_out2[
  Serial[
    Embedding_45_128
    LSTM_128
    Mean
    Normalize
  ]
  Serial[
    Embedding_45_128
    LSTM_128
    Mean
    Normalize
  ]
]


In [22]:
#  Implement the TripletLoss

def TripletLossFn(v1, v2, margin=0.25):
    
    # use fastnp to take the dot product of the two batches 
    scores = fastnp.dot(v1, v2.T) # pairwise cosine sim    
    # calculate new batch size
    batch_size = len(scores)
    # use fastnp to grab all postive `diagonal` entries in `scores`
    positive = fastnp.diagonal(scores)  # the positive ones (duplicates)
    # subtract `fastnp.eye(batch_size)` out of 1.0 and do element-wise multiplication with `scores`
    negative_zero_on_duplicate = (1 - fastnp.eye(batch_size)) * scores
    # use `fastnp.sum` on `negative_zero_on_duplicate` for `axis=1` and divide it by `(batch_size - 1)`
    mean_negative = fastnp.sum(negative_zero_on_duplicate, axis=1) / (batch_size - 1)
    # create a composition of two masks: 
    # the first mask to extract the diagonal elements, 
    # the second mask to extract elements in the negative_zero_on_duplicate matrix that are larger than the elements in the diagonal 
    mask_exclude_positives = (fastnp.identity(batch_size) == 1)|(negative_zero_on_duplicate > positive.reshape(batch_size, 1))
    # multiply `mask_exclude_positives` with 2.0 and subtract it out of `negative_zero_on_duplicate`
    negative_without_positive = negative_zero_on_duplicate - mask_exclude_positives * 2
    # take the row by row `max` of `negative_without_positive`. 
    closest_negative = negative_without_positive.max(axis = 1) 
    # compute `fastnp.maximum` among 0.0 and `A`
    # where A = subtract `positive` from `margin` and add `closest_negative`
    triplet_loss1 = fastnp.maximum(0.0, margin - positive + closest_negative)
    # compute `fastnp.maximum` among 0.0 and `B`
    # where B = subtract `positive` from `margin` and add `mean_negative`
    triplet_loss2 = fastnp.maximum(0.0, margin - positive + mean_negative)
    # add the two losses together and take the `fastnp.sum` of it    
    triplet_loss = fastnp.sum(triplet_loss1 + triplet_loss2)
    
    return triplet_loss

In [23]:
from functools import partial
def TripletLoss(margin=0.25):
    triplet_loss_fn = partial(TripletLossFn, margin=margin)
    return tl.Fn('TripletLoss', triplet_loss_fn)

In [24]:
batch_size = 256
train_generator = data_generator(train_Q1, train_Q2, batch_size, vocab['<PAD>'])
val_generator = data_generator(val_Q1, val_Q2, batch_size, vocab['<PAD>'])
print('train_Q1.shape ', train_Q1.shape)
print('val_Q1.shape   ', val_Q1.shape)

train_Q1.shape  (3680,)
val_Q1.shape    (920,)


In [25]:
# Implement the train_model to train the Siamese neural network 

def train_model(Siamese, TripletLoss, train_generator, val_generator, output_dir='model/'):
    
    output_dir = os.path.expanduser(output_dir)

    train_task = training.TrainTask( 
        labeled_data=train_generator,      # Use generator (train)
        loss_layer=TripletLoss(),        # Use triplet loss. Don't forget to instantiate this object
        optimizer=trax.optimizers.Adam(0.01),         # Don't forget to add the learning rate parameter
        lr_schedule=trax.lr.warmup_and_rsqrt_decay(400, 0.01) # Use Trax multifactor schedule function
    )

    eval_task = training.EvalTask(
        labeled_data=val_generator,      # Use generator (val)
        metrics=[TripletLoss()],         # Use triplet loss. Don't forget to instantiate this object
    )
    
    training_loop = training.Loop(Siamese(),
                                  train_task,
                                  eval_tasks=[eval_task],
                                  output_dir=output_dir)

    return training_loop

In [26]:
train_steps = 5
training_loop = train_model(Siamese, TripletLoss, train_generator, val_generator)
training_loop.run(train_steps)




Step      1: Total number of trainable weights: 137344
Step      1: Ran 1 train steps in 1.65 secs
Step      1: train TripletLoss |  127.89276886


  with gzip.GzipFile(fileobj=f, compresslevel=compresslevel) as gzipf:
  with gzip_lib.GzipFile(fileobj=f, compresslevel=2) as gzipf:


Step      1: eval  TripletLoss |  127.87814331


In [27]:
model = Siamese()
# filename = 'model.pkl.gz'
model.init_from_file(file_name='/content/model/model.pkl.gz', weights_only=True, input_signature=shapes.signature(next(train_generator)))

(((array([[ 0.09971379, -0.0864032 , -0.07706386, ..., -0.1401512 ,
           -0.04404699,  0.12256428],
          [-0.14028782,  0.14415507, -0.04838157, ..., -0.04077215,
           -0.03859194, -0.13205117],
          [-0.10755897,  0.10958517,  0.09966225, ..., -0.1526314 ,
            0.12377943,  0.03611372],
          ...,
          [-0.08992466,  0.12010487, -0.08379889, ...,  0.07056148,
           -0.02370089, -0.08870141],
          [ 0.09188224,  0.08501628, -0.05786876, ..., -0.14007156,
           -0.05122451, -0.03764261],
          [ 0.00919677, -0.08141428, -0.00316356, ..., -0.07539246,
            0.03076359,  0.12665074]], dtype=float32),
   (((), ((), ())),
    ((array([[-0.08329147,  0.01269902,  0.00159767, ...,  0.00076349,
              -0.07399829, -0.03258217],
             [ 0.05096844, -0.06922547, -0.01389691, ..., -0.04573439,
              -0.0037385 ,  0.06335484],
             [ 0.07469929, -0.03557743, -0.05200601, ..., -0.04929484,
              -0.

In [28]:
# Calculate the cosine similarity score as y_score

def y_score(question1, question2, model, vocab, data_generator=data_generator, verbose=False):

    # use `nltk` word tokenize function to tokenize
    q1 = nltk.word_tokenize(question1)  # tokenize
    q2 = nltk.word_tokenize(question2)  # tokenize
    Q1, Q2 = [], []
    for word in q1:  # encode q1
        # append the 'word' index in `vocab`
        Q1.append(vocab[word])
    for word in q2:  # encode q2
        # append the 'word' index in `vocab`
        Q2.append(vocab[word])
        
    # Call the data generator (built in Ex 01) using next()
    # pass [Q1] & [Q2] as Q1 & Q2 arguments of the data generator. Set batch size as 1
    Q1, Q2 = next(data_generator([Q1], [Q2], 1, vocab['<PAD>']))
    # Call the model
    v1, v2 = model((Q1, Q2))
    # take dot product to compute cos similarity of each pair of entries, v1, v2
    # don't forget to transpose the second argument
    y = np.dot(v1, v2.T)
    if y != 1:
      y = np.log10(1 / np.abs(1 - y))
    else:
      y = np.log10(1 / np.abs(1 - y + 0.0000001))
    

    return y

# human_gpt Cosine Similarity

In [30]:
human_answer = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/COMP0087-NLP/human_rephrased_first_1000.csv")
human_answer['y_score'] = 0
human_answer

  and should_run_async(code)


Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,original_human,rephrased_human,y_score
0,0,0,0,0,"Basically there are many categories of "" Best ...","Basically, similar to the Oscars, if a book is...",0
1,1,1,1,1,salt is good for not dying in car crashes and ...,Salt is an inexpensive and effective way to pr...,0
2,2,2,2,2,The way it works is that old TV stations got a...,Old TV stations were given a certain amount of...,0
3,3,3,3,3,You ca n't just go around assassinating the le...,Even though no one likes Kim Jong-Un and North...,0
4,4,4,4,4,Wanting to kill the shit out of Germans drives...,The motivating desire to defeat the Germans in...,0
...,...,...,...,...,...,...,...
995,995,995,295,995,The data is on your hard drive and when you de...,"If you delete data stored on your hard drive, ...",0
996,996,996,296,996,"Okay , so I think everyone is different . For ...",When I'm nervous or stressed I tend to experie...,0
997,997,997,297,997,Expense and privacy concerns . If a police off...,Given the expense and privacy issues that aris...,0
998,998,998,298,998,"[ Here 's Kevin James ] ( URL_2 ) , and [ here...",Kevin James (URL_2) and his love interest in P...,0


In [31]:
for i in range(len(human_answer['original_human'])):
  human = human_answer['original_human'][i]
  human_gpt = human_answer['rephrased_human'][i]
  cos_sim = y_score(human, human_gpt, model, vocab)
  human_answer['y_score'][i] = cos_sim


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  human_answer['y_score'][i] = cos_sim


In [32]:
human_answer

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,original_human,rephrased_human,y_score
0,0,0,0,0,"Basically there are many categories of "" Best ...","Basically, similar to the Oscars, if a book is...",4.803116
1,1,1,1,1,salt is good for not dying in car crashes and ...,Salt is an inexpensive and effective way to pr...,5.476532
2,2,2,2,2,The way it works is that old TV stations got a...,Old TV stations were given a certain amount of...,4.515450
3,3,3,3,3,You ca n't just go around assassinating the le...,Even though no one likes Kim Jong-Un and North...,4.456822
4,4,4,4,4,Wanting to kill the shit out of Germans drives...,The motivating desire to defeat the Germans in...,5.017894
...,...,...,...,...,...,...,...
995,995,995,295,995,The data is on your hard drive and when you de...,"If you delete data stored on your hard drive, ...",5.160262
996,996,996,296,996,"Okay , so I think everyone is different . For ...",When I'm nervous or stressed I tend to experie...,4.369201
997,997,997,297,997,Expense and privacy concerns . If a police off...,Given the expense and privacy issues that aris...,4.719570
998,998,998,298,998,"[ Here 's Kevin James ] ( URL_2 ) , and [ here...",Kevin James (URL_2) and his love interest in P...,4.718215


In [33]:
# saving the dataframe
human_answer.to_csv('/content/drive/MyDrive/Colab Notebooks/COMP0087-NLP/huamn_gpt_cos_similarity.csv')

  and should_run_async(code)


# chatgpt_gpt Cosine Similarity

In [34]:
chatgpt_answer = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/COMP0087-NLP/rephrased_first_1k.csv")
chatgpt_answer['y_score'] = 0
chatgpt_answer

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,original_chatgpt,rephrased_chatgpt,y_score
0,0,0,0,There are many different best seller lists tha...,The New York Times best seller list is one of ...,0
1,1,1,1,Salt is used on roads to help melt ice and sno...,Salt is widely used to melt ice and snow on ro...,0
2,2,2,2,There are a few reasons why we still have SD (...,Since some people still use older TVs that can...,0
3,3,3,3,It is generally not acceptable or ethical to a...,In light of the severe consequences that assas...,0
4,4,4,4,After the Wright Brothers made the first power...,After the Wright Brothers made the first power...,0
...,...,...,...,...,...,...
995,995,345,995,"When you delete a file from your computer, pho...","When you delete a file from your computer, pho...",0
996,996,346,996,Lucid dreaming is the act of being aware that ...,"By keeping a dream journal, performing reality...",0
997,997,347,997,There are a few reasons why some people might ...,Some people may not support the use of body ca...,0
998,998,348,998,There are a few reasons why there may not be a...,The fashion industry often pushes cultural bea...,0


In [35]:
for i in range(len(chatgpt_answer['original_chatgpt'])):
  chatgpt = chatgpt_answer['original_chatgpt'][i]
  chatgpt_gpt = chatgpt_answer['rephrased_chatgpt'][i]
  cos_sim = y_score(chatgpt, chatgpt_gpt, model, vocab)
  chatgpt_answer['y_score'][i] = cos_sim


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chatgpt_answer['y_score'][i] = cos_sim


In [36]:
chatgpt_answer

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,original_chatgpt,rephrased_chatgpt,y_score
0,0,0,0,There are many different best seller lists tha...,The New York Times best seller list is one of ...,3.895729
1,1,1,1,Salt is used on roads to help melt ice and sno...,Salt is widely used to melt ice and snow on ro...,3.934908
2,2,2,2,There are a few reasons why we still have SD (...,Since some people still use older TVs that can...,4.136584
3,3,3,3,It is generally not acceptable or ethical to a...,In light of the severe consequences that assas...,4.287704
4,4,4,4,After the Wright Brothers made the first power...,After the Wright Brothers made the first power...,3.957314
...,...,...,...,...,...,...
995,995,345,995,"When you delete a file from your computer, pho...","When you delete a file from your computer, pho...",4.414487
996,996,346,996,Lucid dreaming is the act of being aware that ...,"By keeping a dream journal, performing reality...",3.531169
997,997,347,997,There are a few reasons why some people might ...,Some people may not support the use of body ca...,3.856247
998,998,348,998,There are a few reasons why there may not be a...,The fashion industry often pushes cultural bea...,3.882692


In [37]:
# saving the dataframe
chatgpt_answer.to_csv('/content/drive/MyDrive/Colab Notebooks/COMP0087-NLP/chatgpt_gpt_cos_similarity.csv')

  and should_run_async(code)
