# Siamese neural network notebook example
To run this notebook, download the Quora Question Pairs dataset from Kaggle and unzip it in the notebook's directory: https://www.kaggle.com/quora/question-pairs-dataset/data

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from sklearn.model_selection import train_test_split
from zeugma import GloVeTransformer

# Raise pandas' column display width to 200 characters so that long question
# texts are less likely to be truncated when dataframes are displayed
pd.set_option('display.max_colwidth', 200)

# Download and load embedding
GloVeTransformer.download_embeddings()
# aggregation='sum': presumably each text is embedded as the sum of its word
# vectors -- TODO confirm against the zeugma documentation
glove = GloVeTransformer(aggregation='sum')

# Read the dataset from 'questions.csv' (path relative to the working
# directory) and split it into a train and a test dataframe using
# train_test_split's default settings
df = pd.read_csv('questions.csv')
df_train, df_test = train_test_split(df)

## Create text preprocessing transformer

In [None]:
from sklearn.preprocessing import FunctionTransformer
import spacy

# Load spaCy's English model with the tagger, parser and NER pipeline
# components disabled; lemmatize() below only reads each token's lemma,
# lowercase form and punctuation/whitespace flags
nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])


def lemmatize(text, n_tokens=10):
    """Lemmatize a text and keep only its leading tokens.

    The text is tokenized with the module-level spaCy pipeline `nlp`.
    Punctuation and whitespace tokens are dropped. Every remaining token is
    replaced by its lemma, except tokens whose lemma is the '-PRON-'
    placeholder, which are replaced by their lowercased surface form.

    Args:
        text: the raw text to process.
        n_tokens: how many of the kept tokens to retain (the slice
            `[:n_tokens]` is applied after filtering).

    Returns:
        The retained tokens joined with single spaces.
    """
    kept = []
    for tok in nlp(text):
        # Skip tokens that carry no lexical content
        if tok.is_punct or tok.is_space:
            continue
        lemma = tok.lemma_
        kept.append(tok.lower_ if lemma == '-PRON-' else lemma)
    return ' '.join(kept[:n_tokens])


def _lemmatize_all(texts):
    """Return the list obtained by applying `lemmatize` to every text."""
    return [lemmatize(text) for text in texts]


# Wrap the batch lemmatizer as a scikit-learn transformer so it can be used as
# a pipeline step; validate=False is required here because the inputs are raw
# strings rather than numeric arrays
text_preprocessor = FunctionTransformer(_lemmatize_all, validate=False)

## Build sklearn wrapper of the siamese network

In [None]:
from keras import Model, Sequential
from keras.layers import Input, Dense, Dropout, subtract, Lambda
from keras import backend as K
from keras.wrappers.scikit_learn import KerasClassifier

def create_model(input_dim=300):
    """Build and compile a Siamese network comparing two embeddings.

    The model takes a single input vector of size `2 * input_dim` holding the
    two embeddings side by side: the left one in the first `input_dim`
    columns, the right one in the remaining columns. Taking one concatenated
    input lets the model be fed a single 2-D feature array.

    Both halves go through the same encoder instance, so its weights are
    shared. The element-wise absolute difference of the two encodings is fed
    to a single sigmoid unit.

    Args:
        input_dim: size of each of the two embeddings.

    Returns:
        A Keras `Model` compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    concat_input = Input((2 * input_dim,))

    # Split the concatenated input back into its left and right halves
    left_input = Lambda(lambda x: x[:, :input_dim], output_shape=(input_dim,))(concat_input)
    right_input = Lambda(lambda x: x[:, input_dim:], output_shape=(input_dim,))(concat_input)

    # Shared encoder. Only the first layer declares an input shape; the
    # following layers take the output of the layer before them.
    seq = Sequential()
    seq.add(Dense(512, input_shape=(input_dim,), activation='relu'))
    seq.add(Dropout(0.2))
    seq.add(Dense(256, activation='relu'))
    seq.add(Dropout(0.2))
    seq.add(Dense(128, activation='relu'))

    # Same `seq` instance applied twice: both branches share their weights
    encoded_l = seq(left_input)
    encoded_r = seq(right_input)

    # Element-wise |left - right|, computed with the Keras backend op
    subtracted = subtract([encoded_l, encoded_r])
    l1_distance = Lambda(K.abs)(subtracted)
    prediction = Dense(1, activation='sigmoid')(l1_distance)

    siamese_net = Model(inputs=concat_input, outputs=prediction)

    siamese_net.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])

    return siamese_net


# scikit-learn compatible classifier that builds its network with
# create_model; the training settings (epochs, batch_size, verbose,
# validation_split) are given to the wrapper here -- presumably forwarded to
# the Keras fit() call when the classifier is trained
siamese_net = KerasClassifier(build_fn=create_model, epochs=5, batch_size=128, 
                              verbose=2, validation_split=0.2)

## Build preprocessing and model pipeline

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from zeugma import ItemSelector

# Each branch of the union selects one question column, lemmatizes it and
# embeds it with GloVe. FeatureUnion stacks the two embeddings side by side,
# which is the concatenated input layout create_model expects.
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('question1', Pipeline([
                ('selector', ItemSelector('question1')),
                ('text_preprocessor', text_preprocessor),
                ('vectorizer', glove),
            ])),
            ('question2', Pipeline([
                ('selector', ItemSelector('question2')),
                ('text_preprocessor', text_preprocessor),
                ('vectorizer', glove),
            ])),
        ],
    )),
    ('siamese_net', siamese_net),
])
# The label is the 'is_duplicate' column, the same one used for evaluation
pipeline.fit(df_train, df_train['is_duplicate'])

# Probability of the positive (duplicate) class for the held-out pairs
y_prob = pipeline.predict_proba(df_test)[:, 1]
# Hard 0/1 predictions obtained by rounding the probabilities
y_pred = y_prob.round()

## Results analysis

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the predicted duplicate probabilities against the true labels
# of the held-out pairs
print(roc_auc_score(df_test['is_duplicate'], y_prob))