In [9]:
import fasttext
import pandas as pd
import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Input

# Have to download the stopwords
# nltk.download('stopwords')

In [10]:
# Load the pretrained fastText vectors (crawl-300d-2M-subword; per the original
# note, the largest model on offer, trained on 600B tokens)
fasttext_model_path = 'models/crawl-300d-2M-subword.bin'
fasttext_model = fasttext.load_model(fasttext_model_path)



In [105]:
# Build the list of tokens dropped during normalization: the NLTK English
# stopwords, each punctuation character as a standalone token, and 'null'.
to_stop = stopwords.words('english')

# ASCII punctuation plus a space, written as a raw string so the backslash is a
# literal character rather than the invalid escape sequence `\]`.
# The straight quotes " and ' are included alongside the typographic quotes
# (” and ’) that the list originally contained, so both forms are filtered.
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ """ + "”’"
to_stop.extend(punctuation)  # extending with a str adds one entry per character

to_stop.append('null')
print(to_stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [145]:
# Organizing and normalizing the data
#
# Essentially, we want to only have three attributes for each training example:
# title_one, title_two, label.
# For normalization, we are just going to use the nltk stopwords and punctuation.

def preprocessing(orig_data, stop_words=None):
    """
    Normalizes the data by getting rid of stopwords and punctuation.

    Each title is split on single spaces, every token that exactly matches an
    entry of the stop list is dropped, and the remaining tokens are joined back
    with single spaces. Matching is on whole tokens only, so punctuation that is
    attached to a word is left untouched.

    Parameters
    ----------
    orig_data : pd.DataFrame
        Frame with string columns `title_left` and `title_right` and a `label`
        column.
    stop_words : iterable of str, optional
        Tokens to remove. Defaults to the notebook-level `to_stop` list.

    Returns
    -------
    pd.DataFrame
        New frame with columns `title_one`, `title_two`, `label`, one row per
        input row, in input order, indexed 0..n-1.
    """
    if stop_words is None:
        # Resolved at call time so the notebook-level list is picked up as-is
        stop_words = to_stop
    # Set membership is O(1); the stop list itself is scanned only once here
    stop_tokens = frozenset(stop_words)

    def strip_stop_tokens(title):
        # Keep every token that is not in the stop list
        return ' '.join(word for word in title.split(' ') if word not in stop_tokens)

    # The new names of the columns
    column_names = ['title_one', 'title_two', 'label']

    # Collect all rows first and build the dataframe once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = [
        (strip_stop_tokens(row.title_left), strip_stop_tokens(row.title_right), row.label)
        for row in orig_data.itertuples()
    ]

    return pd.DataFrame(records, columns=column_names)
        

In [144]:
def create_simple_data(
    source_path='data/computers_train/computers_train_xlarge_normalized.json.gz',
    output_path='data/computers_train/computers_train_xlarge_norm_simple.csv',
):
    """
    Creates and saves a simpler version of the original data that only contains the two titles and the label.

    Parameters
    ----------
    source_path : str
        Path of the gzip-compressed JSON file to read (parsed with
        `lines=True`). Defaults to the xlarge computers training set.
    output_path : str
        Path of the CSV file to write. Defaults to the location the rest of
        the notebook loads from.

    Returns
    -------
    None
        The result is written to `output_path`; nothing is returned.
    """

    # Get the dataset of computer parts
    computers_df = pd.read_json(source_path, compression='gzip', lines=True)
    norm_computers = preprocessing(computers_df)

    # Save the new normalized and simplified data to a CSV file to load later
    # (index=False keeps the row index out of the file)
    norm_computers.to_csv(output_path, index=False)

In [4]:
# Read the simplified CSV (the path create_simple_data writes to) into a dataframe
simple_data_path = 'data/computers_train/computers_train_xlarge_norm_simple.csv'
computer_df = pd.read_csv(simple_data_path)

In [7]:
# Preview the first five rows of the loaded dataframe
computer_df.head(n=5)

Unnamed: 0,title_one,title_two,label
0,hp intel xeon x5560 prijzen tweakers,495906 b21 hp x5560 2 80ghz ml350 g6,1
1,495906 b21 hp x5560 2 80ghz ml350 g6 new whole...,495906 b21 hp x5560 2 80ghz ml350 g6,1
2,asus motherboard lga2066 ddr4 2 u atx 2xgbe pr...,asus prime x299 deluxe,1
3,asus prime x299 deluxe prijzen tweakers,asus prime x299 deluxe socket 2066 intel atx m...,1
4,asus prime x299 deluxe,asus prime x299 deluxe socket lga2066 motherbo...,1


## Model Info

For the model, we are going to use LSTMs with a Contrastive Loss Function 
that will also be used to predict whether the two products are the same 

First, we have to convert the titles to embeddings through FastText before feeding into the LSTM.
The embedding part of this model will not be a layer because:
* The fasttext model would be time consuming and annoying to get to work with an embedding layer in Keras
* The fasttext embeddings are not going to be optimized during training, so there is really no point in adding them as an embedding layer