In [43]:
import fasttext
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout, Lambda, GRU

# Have to download the stopwords
# nltk.download('stopwords')

In [44]:
# Get the fasttext model (we are using the largest one they offer [600B tokens])
fasttext_model = fasttext.load_model('models/crawl-300d-2M-subword.bin')



In [None]:
# Creates the stopwords
to_stop = stopwords.words('english')
punctuation = "!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~ "
for c in punctuation:
    to_stop.append(c)

to_stop.append('null')
print(to_stop)

In [None]:
# Organizing and normalizing the data
"""
Essentially, we want to only have three attributes for each training example: title_one, title_two, label
For normalization, we are just going to use the nltk stopwords and punctuation
"""

def preprocessing(orig_data):
    """
    Normalizes the data by getting rid of stopwords and punctuation
    """
    
    # The new names of the columns
    column_names = ['title_one', 'title_two', 'label']
    # A new dataframe for the data we are going to be creating
    norm_computers = pd.DataFrame(columns = column_names)
    # Iterate over the original dataframe (I know it is slow and there are probably better ways to do it)
    for row in orig_data.itertuples():
        title_left = row.title_left.split(' ')
        title_right = row.title_right.split(' ')
        
        # Creates a new list of only elements that are not in the stop words
        temp_title_left = []
        for word in title_left:
            if word not in to_stop:
                temp_title_left.append(word)
                
        # Creates a new list of only elements that are not in the stop words
        temp_title_right = []
        for word in title_right:
            if word not in to_stop:
                temp_title_right.append(word)
        
        # Join the elements in the list to create the strings
        title_left = ' '.join(temp_title_left)
        title_right = ' '.join(temp_title_right)
        
        # Append the newly created row (title_left, title_right, label) to the new dataframe
        norm_computers = norm_computers.append(pd.DataFrame([[title_left, title_right, row.label]], columns=column_names))
        
    return norm_computers
        

In [None]:
def create_simple_data():
    """
    Creates and saves a simpler version of the original data that only contains the the two titles and the label.
    """
    
    # Get the dataset of computer parts
    computers_df = pd.read_json('data/computers_train/computers_train_xlarge_normalized.json.gz',compression='gzip', lines=True)
    norm_computers = preprocessing(computers_df)
    
    # Save the new normalized and simplified data to a CSV file to load later
    norm_computers.to_csv('data/computers_train/computers_train_xlarge_norm_simple.csv', index=False)

In [None]:
# Load the data
computer_df = pd.read_csv('data/computers_train/computers_train_xlarge_norm_simple.csv')

In [None]:
computer_df.head()

## Model Info

For the model, we are going to use LSTMs with a Constrastive Loss Function 
that will also be used to predict whether the two products are the same 

First, we have to convert the titles to embeddings through FastText before feeding into the LSTM.
The embedding part of this model will not be a layer because:
* The fasttext model would be time consuming and annoying to get to work with an embedding layer in Keras
* The fasttext model is not going to be getting its embeddings optimized, so there is really no point in adding it as an embedding layer

In [64]:
def euclidean_distance(vectors):
    x, y = vectors
    return tf.square(x - y)

def euclidean_dist_out_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],)

def siamese_network(input_shape):
    # Defines our inputs
    left_title = Input(input_shape, dtype='float32')
    right_title = Input(input_shape, dtype='float32')
    
    # The LSTM units
    model = tf.keras.Sequential(name='siamese_model')
    model.add(LSTM(units=256, return_sequences=True, name='lstm_1'))
    model.add(Dropout(rate=0.5))
    model.add(LSTM(units=128, return_sequences=True, name='lstm_2'))
    model.add(Dropout(rate=0.5))
    model.add(LSTM(units=128, name='lstm_3'))
    model.add(Dropout(rate=0.5))
    
    # The dense layers
    model.add(Dense(units=1024, activation='relu', name='dense_1'))
    model.add(Dropout(rate=0.5))
    model.add(Dense(units=512, activation='relu', name='dense_2'))
    
    # Forward propagate through the model to generate the encodings
    encoded_left_title = model(left_title)
    encoded_right_title = model(right_title)

    Square_distance_layer = Lambda(euclidean_distance)
    distance = Square_distance_layer([encoded_left_title, encoded_right_title])
    
    prediction = Dense(1, activation='sigmoid')(distance)
    # Create and return the network
    siamese_net = tf.keras.Model(inputs=(left_title, right_title), outputs=prediction, name='siamese_network')
    return siamese_net

In [None]:
euclidean_distance([np.array([[4, 5, 6], [5, 6, 7], [11, 234, 12]]), np.array([[5, 6, 7], [10, 11, 12], [11, 234, 12]])])
#np.array([[4, 5, 6], [5, 6, 7]]).shape

In [None]:
def get_max_len():
    max_len = 0
    for row in computer_df.itertuples():
        if len(row.title_one.split(' ')) > max_len:
            max_len = len(row.title_one.split(' '))
            
        if len(row.title_two.split(' ')) > max_len:
            max_len = len(row.title_two.split(' '))
    
    return max_len


In [52]:
"""
Create the numpy files of all the training embedddings
We will have two numpy files:
1. The training/validation/test sets
2. The labels
"""

def create_embeddings(df):
    # Create the numpy arrays for storing the embeddings and labels
    total_embeddings = np.zeros(shape=(m, 2, MAX_LEN, EMBEDDING_SHAPE[0]))
    labels = np.zeros(shape=(m))
    
    # I know this is a terrible way of doing this, but iterate over the dataframe
    # and generate the embeddings to add to the numpy array
    for idx, row in enumerate(computer_df.itertuples()):
        for word_idx, word in enumerate(row.title_one.split(' ')):
            total_embeddings[idx, 0, word_idx] = fasttext_model[word]
            
        for word_idx, word in enumerate(row.title_two.split(' ')):
            total_embeddings[idx, 1, word_idx] = fasttext_model[word]
            
        labels[idx] = row.label
        
    return total_embeddings, labels


In [53]:
def save_embeddings():
    embeddings, labels = create_embeddings(computer_df)
    with open('data/computers_numpy/embeddings.npy', 'wb') as f:
        np.save(f, embeddings)

    with open('data/computers_numpy/labels.npy', 'wb') as f:
        np.save(f, labels)

In [54]:
def load_embeddings_and_labels():
    loaded_embeddings = None
    labels = None
    rng = np.random.default_rng()
    with open('data/computers_numpy/embeddings.npy', 'rb') as f:
        loaded_embeddings = np.load(f)
        rng.shuffle(loaded_embeddings)
        loaded_embeddings = np.transpose(loaded_embeddings, (1, 0, 2, 3))
    
    with open('data/computers_numpy/labels.npy', 'rb') as f:
        labels = np.load(f)
    
    return loaded_embeddings, labels

In [55]:
embeddings, labels = load_embeddings_and_labels()
print('Embeddings shape: ' + str(embeddings.shape), 'Labels shape: ' + str(labels.shape))

Embeddings shape: (2, 68461, 42, 300) Labels shape: (68461,)


In [46]:
"""
Definitions of some sizes in the training set
"""
# Returns the max word length in the entire training set
margin = 2.0
MAX_LEN = 42
#EMBEDDING_SHAPE = fasttext_model['queen'].shape
EMBEDDING_SHAPE = (300,)
#m = embeddings.shape[1]
m = 68461
print('MAX_LEN: ' + str(MAX_LEN), 'EMBEDDING_SHAPE: ' + str(EMBEDDING_SHAPE), 'm: ' + str(m))

MAX_LEN: 42 EMBEDDING_SHAPE: (300,) m: 68461


In [None]:
# Note: for the constrastive loss, because 0 denotes that they are from the same class
# and one denotes they are from a different class, I swaped the (Y) and (1 - Y) terms

# Just chose this
def constrastive_loss(y_true, y_pred):
    d = y_pred
    d_sqrt = tf.sqrt(d)
    #tf.print('\nY Pred: ', d, 'Shape: ', tf.shape(d))
    #tf.print('\nY True: ', y_true, 'Shape: ', tf.shape(y_true))
    
    loss = (y_true * d) + ((1 - y_true) * tf.square(tf.maximum(0., margin - d_sqrt)))
    
    #tf.print('\n Constrastive Loss: ', loss, 'Shape: ', tf.shape(loss))
    loss = 0.5 * tf.reduce_mean(loss)
    
    return loss

In [None]:
# Previous testing code
constrastive_loss(np.array([1, 0, 0], dtype=np.float32), np.array([0, 0, 1], dtype=np.float32))

In [None]:
# Accuracy metric for constrastive loss because values close to 0 are equal and values high are different
# 0.5 is the threshold here
def constrastive_accuracy(y_true, y_pred):
    return tf.reduce_mean(tf.cast(tf.equal(y_true, tf.cast(y_pred < 0.5, y_true.dtype)), y_true.dtype))

In [None]:
constrastive_accuracy(np.array([1, 1, 0], dtype=np.float32), np.array([0.3, 1.324, 0.9], dtype=np.float32))

In [77]:
X_train1 = embeddings[0, :50000]
X_train2 = embeddings[1, :50000]
X_train = np.stack((X_train1, X_train2))
print('Training shape: ' + str(X_train.shape))

X_val1 = embeddings[0, 50000:59000]
X_val2 = embeddings[1, 50000:59000]
X_val = np.stack((X_val1, X_val2))
print('Val shape: ' + str(X_val.shape))

X_test1 = embeddings[0, 59000:]
X_test2 = embeddings[1, 59000:]
X_test = np.stack((X_test1, X_test2))
print('Test shape: ' + str(X_test.shape))

Training shape: (2, 50000, 42, 300)
Val shape: (2, 9000, 42, 300)
Test shape: (2, 9461, 42, 300)


In [None]:
del embeddings

In [78]:
Y_train = labels[:50000]
Y_val = labels[50000:59000]
Y_test = labels[59000:]

In [None]:
del labels

In [89]:
model = siamese_network((MAX_LEN, EMBEDDING_SHAPE[0],))
model.summary()

Model: "siamese_network"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 42, 300)]    0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 42, 300)]    0                                            
__________________________________________________________________________________________________
siamese_model (Sequential)      (None, 512)          1555968     input_9[0][0]                    
                                                                 input_10[0][0]                   
__________________________________________________________________________________________________
lambda_4 (Lambda)               (None, 512)          0           siamese_model[0][0]

In [90]:
lr = 0.001
opt = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
model.fit(x=[X_train1, X_train2], y=Y_train, batch_size=16, epochs=10, validation_data=([X_val[0], X_val[1]], Y_val))

Train on 50000 samples, validate on 9000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

KeyboardInterrupt: 

In [91]:
results = model.evaluate([X_test1, X_test2], Y_test, batch_size=64)
print('test loss, test acc: ', results)

test loss, test acc:  [0.23220132291316986, 1.0]


In [75]:
title_one = 'amd ryzen 7 3700x'
title_two = 'intel core i7 7700k'
title_one_arr = np.zeros((1, 42, 300))
title_two_arr = np.zeros((1, 42, 300))

for idx, word in enumerate(title_one.split(' ')):
    title_one_arr[0, idx] = fasttext_model[word]
    
for idx, word in enumerate(title_two.split(' ')):
    title_two_arr[0, idx] = fasttext_model[word]

In [72]:
model.load_weights('models/test.h5')

In [49]:
model.summary()

Model: "siamese_network"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 42, 300)]    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 42, 300)]    0                                            
__________________________________________________________________________________________________
siamese_model (Sequential)      (None, 512)          1555968     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 512)          0           siamese_model[0][0]

In [76]:
model.predict([title_one_arr, title_two_arr])

array([[0.5]], dtype=float32)

In [71]:
model.predict([title_one_arr, title_two_arr])

array([[0.5]], dtype=float32)

In [40]:
title_one_arr[0]

array([[-0.00112235,  0.02290357,  0.08181914, ...,  0.033554  ,
         0.00823831,  0.00227902],
       [-0.01170767,  0.01499494,  0.03210003, ...,  0.03566911,
        -0.00559789,  0.07630333],
       [ 0.04335381, -0.03847458, -0.18884453, ..., -0.13215435,
         0.0932787 ,  0.1488912 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [41]:
title_two_arr[0]

array([[-0.01170767,  0.01499494,  0.03210003, ...,  0.03566911,
        -0.00559789,  0.07630333],
       [ 0.04335381, -0.03847458, -0.18884453, ..., -0.13215435,
         0.0932787 ,  0.1488912 ],
       [-0.028708  ,  0.00284695,  0.04317917, ..., -0.02172378,
        -0.02828848,  0.00295764],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [101]:
np.sum(Y_train == 1)

9690

In [103]:
rng = np.random.default_rng()


In [105]:
diff = rng.shuffle(labels)

In [109]:
np.sum(labels[59000:] == 1)

1306