In [22]:
import string

import fasttext
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout

# Have to download the stopwords
# nltk.download('stopwords')

In [3]:
# Get the fasttext model (we are using the largest one they offer [600B tokens])
# Loaded once at notebook level; later cells look words up with
# fasttext_model[word] to obtain their vectors.
# NOTE(review): this is a large binary, so the load is presumably slow and
# memory-hungry -- avoid re-running this cell unnecessarily.
fasttext_model = fasttext.load_model('models/crawl-300d-2M-subword.bin')



In [105]:
# Creates the stopwords: NLTK's English stop words, every punctuation
# character, a space, and the literal token 'null'.
to_stop = stopwords.words('english')
# string.punctuation supplies the full ASCII set. The previous hand-typed
# literal had curly quotes in place of the straight " and ', so straight
# quotes were never filtered. The curly quotes and the space are kept so
# everything that used to be removed still is.
punctuation = string.punctuation + "”’ "
to_stop.extend(punctuation)

to_stop.append('null')
print(to_stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [145]:
# Organizing and normalizing the data
"""
Essentially, we want to only have three attributes for each training example: title_one, title_two, label
For normalization, we are just going to use the nltk stopwords and punctuation
"""

def preprocessing(orig_data, stop_words=None):
    """
    Normalizes the data by getting rid of stopwords and punctuation

    Each title is split on single spaces, tokens found in the stop list are
    dropped, and the remaining tokens are re-joined with single spaces.

    Args:
        orig_data: DataFrame with `title_left`, `title_right` and `label` columns.
        stop_words: Optional iterable of tokens to remove. When omitted, the
            notebook-level `to_stop` list is used.

    Returns:
        A new DataFrame with columns `title_one`, `title_two` and `label`,
        one row per input row, indexed 0..n-1.
    """
    if stop_words is None:
        stop_words = to_stop
    # A set gives constant-time membership tests; the list was scanned once per token
    stop_set = set(stop_words)

    def strip_stop_words(title):
        return ' '.join(word for word in title.split(' ') if word not in stop_set)

    # The new names of the columns
    column_names = ['title_one', 'title_two', 'label']
    # Collect plain rows and build the frame once. Growing a DataFrame with
    # append() inside the loop copies it on every iteration, and
    # DataFrame.append was removed in pandas 2.0.
    rows = [
        (strip_stop_words(row.title_left), strip_stop_words(row.title_right), row.label)
        for row in orig_data.itertuples()
    ]
    return pd.DataFrame(rows, columns=column_names)
        

In [144]:
def create_simple_data():
    """
    Builds the reduced training file: reads the gzipped JSON-lines computers
    dataset, runs it through preprocessing(), and writes the resulting
    two-titles-plus-label table to CSV (without the index) to load later.
    """
    raw_computers = pd.read_json(
        'data/computers_train/computers_train_xlarge_normalized.json.gz',
        compression='gzip',
        lines=True,
    )

    # Normalize and persist in one step; nothing is returned to the caller
    preprocessing(raw_computers).to_csv(
        'data/computers_train/computers_train_xlarge_norm_simple.csv',
        index=False,
    )

In [4]:
# Load the data
# Reads the simplified CSV that create_simple_data() writes, so that function
# must have been run at least once before this cell.
computer_df = pd.read_csv('data/computers_train/computers_train_xlarge_norm_simple.csv')

In [24]:
# Preview the first rows to check the title_one / title_two / label columns
computer_df.head()

Unnamed: 0,title_one,title_two,label
0,hp intel xeon x5560 prijzen tweakers,495906 b21 hp x5560 2 80ghz ml350 g6,1
1,495906 b21 hp x5560 2 80ghz ml350 g6 new whole...,495906 b21 hp x5560 2 80ghz ml350 g6,1
2,asus motherboard lga2066 ddr4 2 u atx 2xgbe pr...,asus prime x299 deluxe,1
3,asus prime x299 deluxe prijzen tweakers,asus prime x299 deluxe socket 2066 intel atx m...,1
4,asus prime x299 deluxe,asus prime x299 deluxe socket lga2066 motherbo...,1


## Model Info

For the model, we are going to use LSTMs with a Contrastive Loss Function 
that will also be used to predict whether the two products are the same 

First, we have to convert the titles to embeddings through FastText before feeding into the LSTM.
The embedding part of this model will not be a layer because:
* The fasttext model would be time consuming and annoying to get to work with an embedding layer in Keras
* The fasttext model is not going to be getting its embeddings optimized, so there is really no point in adding it as an embedding layer

In [40]:
def siamese_network(input_shape):
    """
    Builds a two-input Keras model in which both inputs are passed through
    the same encoder (three LSTM layers followed by two dense layers), so the
    weights are shared between the left and right title.

    Args:
        input_shape: Shape of one title input without the batch axis
            (this notebook calls it with (max_len, EMBEDDING_SIZE)).

    Returns:
        A tf.keras.Model taking [left_title, right_title] and returning the
        two encodings produced by the shared encoder.
    """
    # One float32 input per title: left first, then right
    title_inputs = [Input(input_shape, dtype='float32') for _ in range(2)]

    # Shared encoder: recurrent stack, then the dense layers
    encoder = tf.keras.Sequential(
        [
            LSTM(units=256, return_sequences=True, name='lstm_1'),
            Dropout(rate=0.3),
            LSTM(units=128, return_sequences=True, name='lstm_2'),
            Dropout(rate=0.3),
            # Last LSTM returns only its final output, collapsing the sequence axis
            LSTM(units=128, name='lstm_3'),
            Dropout(rate=0.3),
            Dense(units=1024, activation='relu', name='dense_1'),
            Dropout(rate=0.3),
            Dense(units=512, activation='relu', name='dense_2'),
        ],
        name='siamese_network',
    )

    # Run both titles through the same encoder instance
    encodings = [encoder(title) for title in title_inputs]

    return tf.keras.Model(inputs=title_inputs, outputs=encodings)

In [41]:
# Build the network for inputs of shape (max_len, EMBEDDING_SIZE) and print
# the Keras layer summary as a quick check of the architecture.
# NOTE(review): max_len is a fixed 10 here, not the MAX_LEN derived from the
# dataset in a later cell -- confirm which value is meant for training.
max_len = 10
EMBEDDING_SIZE = 300
model = siamese_network((max_len, EMBEDDING_SIZE))
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           [(None, 10, 300)]    0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           [(None, 10, 300)]    0                                            
__________________________________________________________________________________________________
siamese_network (Sequential)    (None, 512)          1555968     input_15[0][0]                   
                                                                 input_16[0][0]                   
Total params: 1,555,968
Trainable params: 1,555,968
Non-trainable params: 0
__________________________________________________________________________________________________


In [18]:
def get_max_len(df=None):
    """
    Returns the largest number of space-separated words found in any title.

    Args:
        df: Optional DataFrame with `title_one` and `title_two` columns.
            When omitted, the notebook-level `computer_df` is used.

    Returns:
        The maximum word count over both title columns, or 0 when the frame
        has no rows.
    """
    if df is None:
        df = computer_df

    # Single pass over both columns; each title is split exactly once
    return max(
        (
            len(title.split(' '))
            for column in ('title_one', 'title_two')
            for title in df[column]
        ),
        default=0,
    )


In [25]:
"""
Definitions of some sizes in the training set
"""
# Largest number of space-separated words in any title of the training set
MAX_LEN = get_max_len()
# Shape of a single word vector, probed by looking up one arbitrary word
EMBEDDING_SHAPE = fasttext_model['queen'].shape
# Number of title pairs (rows) in the training set
m = len(computer_df)
print('MAX_LEN: ' + str(MAX_LEN), 'EMBEDDING_SHAPE: ' + str(EMBEDDING_SHAPE), 'm: ' + str(m))

MAX_LEN: 42 EMBEDDING_SHAPE: (300,) m: 68461


In [39]:
"""
Create the numpy files of all the training embeddings
We will have two numpy files:
1. The training/validation/test sets
2. The labels
"""

def create_embeddings(df, model=None, max_len=None, embedding_dim=None):
    """
    Converts every title pair in `df` into zero-padded word-embedding arrays.

    Args:
        df: DataFrame with `title_one`, `title_two` and `label` columns.
        model: Word-to-vector lookup supporting `model[word]`. When omitted,
            the notebook-level `fasttext_model` is used.
        max_len: Number of word slots per title. When omitted, the
            notebook-level `MAX_LEN` is used. Titles with more words than
            this are truncated to the first `max_len` words.
        embedding_dim: Length of each word vector. When omitted, the
            notebook-level `EMBEDDING_SHAPE[0]` is used.

    Returns:
        (embeddings, labels): `embeddings` has shape
        (len(df), 2, max_len, embedding_dim), where index 0 on axis 1 holds
        title_one and index 1 holds title_two, and unused word slots stay
        zero; `labels` has shape (len(df),).
    """
    if model is None:
        model = fasttext_model
    if max_len is None:
        max_len = MAX_LEN
    if embedding_dim is None:
        embedding_dim = EMBEDDING_SHAPE[0]

    # Size the arrays from the frame that was passed in, not from the
    # notebook-level row count, so any split can be embedded
    n_rows = len(df)
    total_embeddings = np.zeros(shape=(n_rows, 2, max_len, embedding_dim))
    labels = np.zeros(shape=(n_rows,))

    # Iterate over the rows of `df` (the argument) and fill in the word vectors
    for idx, row in enumerate(df.itertuples()):
        for side, title in enumerate((row.title_one, row.title_two)):
            # Slice so an over-long title cannot index past the last word slot
            for word_idx, word in enumerate(title.split(' ')[:max_len]):
                total_embeddings[idx, side, word_idx] = model[word]

        labels[idx] = row.label

    return total_embeddings, labels


In [40]:
def save_embeddings():
    """
    Computes the embeddings and labels for computer_df and writes each array
    to its own .npy file under data/computers_numpy/.
    """
    title_vectors, match_labels = create_embeddings(computer_df)

    # (destination, array) pairs, written in this order
    targets = (
        ('data/computers_numpy/embeddings.npy', title_vectors),
        ('data/computers_numpy/labels.npy', match_labels),
    )
    for destination, array in targets:
        with open(destination, 'wb') as handle:
            np.save(handle, array)

In [43]:
def load_embeddings_and_labels():
    """
    Reads the saved embeddings and labels arrays back from
    data/computers_numpy/ and returns them as (embeddings, labels).
    """
    def read_array(source):
        # Open in binary mode and let numpy parse the .npy payload
        with open(source, 'rb') as handle:
            return np.load(handle)

    stored_embeddings = read_array('data/computers_numpy/embeddings.npy')
    stored_labels = read_array('data/computers_numpy/labels.npy')
    return stored_embeddings, stored_labels

In [44]:
# Read the saved arrays back from disk and print their shapes as a sanity check
embeddings, labels = load_embeddings_and_labels()
print('Embeddings shape: ' + str(embeddings.shape), 'Labels shape: ' + str(labels.shape))

Embeddings shape: (68461, 2, 42, 300) Labels shape: (68461,)
