In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import sparknlp # nlp processing
from sklearn.model_selection import train_test_split # splitting data

import matplotlib.pyplot as plt # visualisation
import seaborn as sns # visualisation 
%matplotlib inline

In [2]:
# Shared, fixed-seed random generator so that random operations repeat across runs.
SEED = 42
randomState = np.random.RandomState(SEED)

In [3]:
# Start the Spark session through Spark NLP, then report the library versions in use.
spark = sparknlp.start()

for label, version in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(label, version)

Spark NLP version:  2.5.0
Apache Spark version:  2.4.5


In [4]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [5]:
from pyspark.sql import SQLContext

# `spark` is a SparkSession (returned by sparknlp.start()), while SQLContext's first
# argument is a SparkContext. Pass the underlying context and bind the existing session
# explicitly instead of relying on the session object duck-typing as a context.
sql = SQLContext(spark.sparkContext, sparkSession=spark)

In [6]:
def bert_partial(column, dictionary_path="./spell/coca2017.txt", bert_model="bert_base_cased"):
    """Build the Spark NLP stages that embed one text column with BERT.

    The stages are chained through intermediate columns named ``<column>_<step>``:
    document -> sentence -> token -> checked (spell correction) -> normalized ->
    lemma -> cleanTokens (stop words removed) -> bert -> sentence_embeddings ->
    finished_sentence_embeddings.

    Parameters
    ----------
    column : str
        Name of the input text column (e.g. ``"question1"``). It is also used as
        the prefix of every intermediate and output column.
    dictionary_path : str, optional
        Word list handed to the Norvig spell checker. Defaults to the path the
        notebook originally hard-coded.
    bert_model : str, optional
        Name of the pretrained English BERT model to load. Defaults to the model
        the notebook originally hard-coded.

    Returns
    -------
    list
        The ten stages, in execution order, ready to be placed in a
        ``pyspark.ml.Pipeline``. The final output column is
        ``<column>_finished_sentence_embeddings``.

    Notes
    -----
    Calling this function loads the pretrained lemmatizer and BERT model, which
    may trigger a download.
    """

    def col(step):
        # Every intermediate column is "<column>_<step>".
        return column + "_" + step

    document_assembler = DocumentAssembler() \
        .setInputCol(column) \
        .setOutputCol(col("document")) \
        .setCleanupMode("shrink")

    sentence_detector = SentenceDetector() \
        .setInputCols([col("document")]) \
        .setOutputCol(col("sentence")) \
        .setUseAbbreviations(True)

    tokenizer = Tokenizer() \
        .setInputCols([col("sentence")]) \
        .setOutputCol(col("token"))

    # Spell correction; only purely alphabetic dictionary entries are used.
    spell_checker = NorvigSweetingApproach() \
        .setInputCols([col("token")]) \
        .setOutputCol(col("checked")) \
        .setDictionary(dictionary_path, "[a-zA-Z]+")

    normalizer = Normalizer() \
        .setInputCols([col("checked")]) \
        .setOutputCol(col("normalized"))

    lemma = LemmatizerModel.pretrained('lemma_antbnc') \
        .setInputCols([col("normalized")]) \
        .setOutputCol(col("lemma"))

    # Input columns are passed as a list, consistent with every other stage.
    stopwords_cleaner = StopWordsCleaner() \
        .setInputCols([col("lemma")]) \
        .setOutputCol(col("cleanTokens")) \
        .setCaseSensitive(False)

    # NOTE(review): the default model is a *cased* BERT model, yet case sensitivity
    # is switched off here. Behaviour is kept as-is; confirm this is intentional.
    bert_embeddings = BertEmbeddings \
        .pretrained(bert_model, 'en') \
        .setInputCols([col("document"), col("cleanTokens")]) \
        .setOutputCol(col("bert")) \
        .setCaseSensitive(False) \
        .setPoolingLayer(0)

    # Token embeddings are averaged against the *document* column (not the
    # detected sentences).
    embeddingsSentence = SentenceEmbeddings() \
        .setInputCols([col("document"), col("bert")]) \
        .setOutputCol(col("sentence_embeddings")) \
        .setPoolingStrategy("AVERAGE")

    # Convert the annotation into Spark ML vectors and drop the intermediate
    # annotation columns.
    embeddings_finisher = EmbeddingsFinisher() \
        .setInputCols([col("sentence_embeddings")]) \
        .setOutputCols([col("finished_sentence_embeddings")]) \
        .setOutputAsVector(True) \
        .setCleanAnnotations(True)

    return [
        document_assembler,
        sentence_detector,
        tokenizer,
        spell_checker,
        normalizer,
        lemma,
        stopwords_cleaner,
        bert_embeddings,
        embeddingsSentence,
        embeddings_finisher,
    ]

In [7]:
def bert_pipeline():
    """Assemble the full (unfitted) pipeline for the question-pair data.

    The pipeline runs the ``bert_partial`` stages for ``question1``, then for
    ``question2``, and finally indexes ``is_duplicate`` into a numeric ``label``
    column.

    Returns
    -------
    pyspark.ml.Pipeline
        Unfitted pipeline; call ``.fit(df)`` on a DataFrame containing the
        ``question1``, ``question2`` and ``is_duplicate`` columns.
    """
    stages = []
    for question_col in ("question1", "question2"):
        stages.extend(bert_partial(question_col))

    # StringIndexer orders labels by frequency by default, so the 0.0/1.0
    # assignment depends on the class balance of whatever data it is fitted on.
    # Alphabetical ordering pins is_duplicate=0 -> 0.0 and is_duplicate=1 -> 1.0
    # regardless of which subset of the data the pipeline is fitted on.
    label_stringIdx = StringIndexer(
        inputCol="is_duplicate",
        outputCol="label",
        stringOrderType="alphabetAsc",
    )
    stages.append(label_stringIdx)

    return Pipeline(stages=stages)

In [8]:
# Build the unfitted two-question pipeline. Constructing it loads the pretrained
# lemmatizer and BERT model once per question column (see the download log below).
nlp_pipeline_bert = bert_pipeline()

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


In [9]:
# Load the cleaned question pairs. The CSV was saved with its pandas index, which
# comes back as a stray "Unnamed: 0" column; drop it here so it never reaches Spark
# (errors="ignore" keeps this working if the file is re-exported without it).
data = (
    pd.read_csv("dataset/clean_data.csv")
    .drop(columns=["Unnamed: 0"], errors="ignore")
)
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [13]:
# The pandas frame is converted and embedded chunk by chunk.
N_CHUNKS = 202   # number of pieces the pandas frame is split into
MAX_CHUNKS = 3   # only embed the first few chunks; set to None to process all of them

result_df = None
for ix, df_part in enumerate(np.array_split(data, N_CHUNKS)):
    if MAX_CHUNKS is not None and ix >= MAX_CHUNKS:
        break

    chunk = sql.createDataFrame(df_part)

    # The pipeline is re-fitted on every chunk before transforming it.
    nlp_model_bert = nlp_pipeline_bert.fit(chunk)
    result = nlp_model_bert.transform(chunk)

    # DataFrames are immutable: union() returns a NEW frame, so the result must be
    # re-assigned, otherwise every chunk after the first is silently dropped.
    result_df = result if result_df is None else result_df.union(result)

In [16]:
# Inspect the columns produced by the pipeline (finished embeddings per question + label).
result_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- qid1: long (nullable = true)
 |-- qid2: long (nullable = true)
 |-- question1: string (nullable = true)
 |-- question2: string (nullable = true)
 |-- is_duplicate: long (nullable = true)
 |-- question1_finished_sentence_embeddings: array (nullable = true)
 |    |-- element: vector (containsNull = true)
 |-- question2_finished_sentence_embeddings: array (nullable = true)
 |    |-- element: vector (containsNull = true)
 |-- label: double (nullable = false)



In [18]:
# Preview the first rows without the raw is_duplicate column (the indexed `label` column remains).
result_df.drop("is_duplicate").show(5)

+---+----+----+--------------------+--------------------+--------------------------------------+--------------------------------------+-----+
| id|qid1|qid2|           question1|           question2|question1_finished_sentence_embeddings|question2_finished_sentence_embeddings|label|
+---+----+----+--------------------+--------------------+--------------------------------------+--------------------------------------+-----+
|  0|   1|   2|What is the step ...|What is the step ...|                  [[0.1943234652280...|                  [[0.2024581581354...|  0.0|
|  1|   3|   4|What is the story...|What would happen...|                  [[-0.479584932327...|                  [[0.0168425478041...|  0.0|
|  2|   5|   6|How can I increas...|How can Internet ...|                  [[0.3350400924682...|                  [[-0.360031187534...|  0.0|
|  3|   7|   8|Why am I mentally...|Find the remainde...|                  [[0.0855144485831...|                  [[-0.650472879409...|  0.0|
|  4| 

In [19]:
# Persist the embeddings as parquet. The default save mode raises if the target
# already exists, which breaks re-running the notebook; overwrite makes this cell
# idempotent. The format is named explicitly rather than relying on Spark's default.
result_df.drop("is_duplicate") \
    .write \
    .mode("overwrite") \
    .parquet("questionBertEmbeddings.parquet")