In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import sparknlp # nlp processing
from sklearn.model_selection import train_test_split # splitting data

import matplotlib.pyplot as plt # visualisation
import seaborn as sns # visualisation 
%matplotlib inline



In [2]:
randomState = np.random.RandomState(42)  # fixed seed so random draws are identical on every run

In [13]:
# Start the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the library versions this notebook is running against.
for label, version in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(label, version)

Spark NLP version:  2.5.0
Apache Spark version:  2.4.4


In [14]:
from pyspark.sql import SQLContext

# SQLContext's first positional argument is a SparkContext, not a SparkSession.
# Pass the session's context and bind the existing session explicitly.
sql = SQLContext(spark.sparkContext, sparkSession=spark)

# Read the question-pair CSV: the first row is the header and Spark is asked
# to infer the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dataset/clean_data.csv")
)
# NOTE(review): the recorded schema shows every column (including id and
# is_duplicate) inferred as string -- confirm the CSV has no malformed rows.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- qid1: string (nullable = true)
 |-- qid2: string (nullable = true)
 |-- question1: string (nullable = true)
 |-- question2: string (nullable = true)
 |-- is_duplicate: string (nullable = true)



In [15]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

#### Preprocessing

In [125]:
def preprocessing_partial_pipeline(column, dictionary_path="./spell/coca2017.txt", lemma_model="lemma_antbnc"):
    """Build the Spark NLP cleaning stages for a single text column.

    Every intermediate column is named ``<column>_<suffix>`` so the stages for
    several input columns can live in the same Pipeline without clashing.

    Parameters
    ----------
    column : str
        Name of the raw text column to clean (e.g. ``"question1"``).
    dictionary_path : str, optional
        Path of the word list handed to the Norvig spell checker.
        Defaults to the path the notebook originally hard-coded.
    lemma_model : str, optional
        Name of the pretrained lemmatizer model to load.

    Returns
    -------
    list
        Unfitted stages in execution order: document assembler, sentence
        detector, tokenizer, spell checker, normalizer, lemmatizer,
        stop-word cleaner and finisher. The finisher writes its result to
        ``<column>_finished`` and cleans up the annotation columns.
    """
    document_assembler = DocumentAssembler() \
        .setInputCol(column) \
        .setOutputCol(column+"_document") \
        .setCleanupMode("shrink")

    sentence_detector = SentenceDetector() \
        .setInputCols([column+"_document"]) \
        .setOutputCol(column+"_sentence") \
        .setUseAbbreviations(True)

    tokenizer = Tokenizer() \
        .setInputCols([column+"_sentence"]) \
        .setOutputCol(column+"_token")

    # The second argument is the token pattern used when reading the dictionary.
    spell_checker = NorvigSweetingApproach() \
        .setInputCols([column+"_token"]) \
        .setOutputCol(column+"_checked") \
        .setDictionary(dictionary_path, "[a-zA-Z]+")

    normalizer = Normalizer() \
        .setInputCols([column+"_checked"]) \
        .setOutputCol(column+"_normalized")

    lemma = LemmatizerModel.pretrained(lemma_model) \
        .setInputCols([column+"_normalized"]) \
        .setOutputCol(column+"_lemma")

    # Input columns are passed as a list, consistent with every other stage.
    stopwords_cleaner = StopWordsCleaner() \
        .setInputCols([column+"_lemma"]) \
        .setOutputCol(column+"_cleanTokens") \
        .setCaseSensitive(False)

    finisher = Finisher() \
        .setInputCols([column+"_cleanTokens"]) \
        .setOutputCols([column+"_finished"]) \
        .setIncludeMetadata(False) \
        .setCleanAnnotations(True)

    return [document_assembler, sentence_detector, tokenizer, spell_checker,
            normalizer, lemma, stopwords_cleaner, finisher]

def preprocessing_pipeline():
    """Assemble one unfitted Pipeline that cleans both question columns.

    The stages built for "question1" come first, followed by the stages
    built for "question2".
    """
    stages = []
    for question_column in ("question1", "question2"):
        stages += preprocessing_partial_pipeline(question_column)
    return Pipeline(stages=stages)

In [126]:
# Work on a 2000-row sample so the pipelines below run quickly.
# NOTE(review): no ordering precedes limit(), so which rows are kept is up to
# Spark -- confirm that is acceptable for reproducibility.
df_limited = df.limit(num=2000)

In [127]:
# Before preprocessing: fetch the first five raw rows of the sample.
df_limited.take(5)

[Row(id='0', qid1='1', qid2='2', question1='What is the step by step guide to invest in share market in india?', question2='What is the step by step guide to invest in share market?', is_duplicate='0'),
 Row(id='1', qid1='3', qid2='4', question1='What is the story of Kohinoor (Koh-i-Noor) Diamond?', question2='What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?', is_duplicate='0'),
 Row(id='2', qid1='5', qid2='6', question1='How can I increase the speed of my internet connection while using a VPN?', question2='How can Internet speed be increased by hacking through DNS?', is_duplicate='0'),
 Row(id='3', qid1='7', qid2='8', question1='Why am I mentally very lonely? How can I solve it?', question2='Find the remainder when [math]23^{24}[/math] is divided by 24,23?', is_duplicate='0'),
 Row(id='4', qid1='9', qid2='10', question1='Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?', question2='Which fish would survive in salt wat

In [129]:
# Build and fit the cleaning pipeline on the sample.
pre_pipeline = preprocessing_pipeline()
model = pre_pipeline.fit(df_limited)

# Project to the columns that are actually used *before* persisting, so only
# those are cached. The cached frame stays bound to df_processed and can be
# released later with df_processed.unpersist().
df_processed = (
    model.transform(df_limited)
    .select("question1", "question2", "question1_finished", "question2_finished", "is_duplicate")
    .persist()
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [130]:
# After preprocessing: fetch five rows showing the raw questions next to
# their finished token lists.
df_processed.take(5)

[Row(question1='What is the step by step guide to invest in share market in india?', question2='What is the step by step guide to invest in share market?', question1_finished=['step', 'step', 'guide', 'invest', 'share', 'market', 'india'], question2_finished=['step', 'step', 'guide', 'invest', 'share', 'market'], is_duplicate='0'),
 Row(question1='What is the story of Kohinoor (Koh-i-Noor) Diamond?', question2='What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?', question1_finished=['story', 'kohinoor', 'KohiNoor', 'diamond'], question2_finished=['happen', 'indian', 'government', 'steal', 'kohinoor', 'KohiNoor', 'diamond', 'back'], is_duplicate='0'),
 Row(question1='How can I increase the speed of my internet connection while using a VPN?', question2='How can Internet speed be increased by hacking through DNS?', question1_finished=['increase', 'speed', 'internet', 'connection', 'use', 'VPN'], question2_finished=['internet', 'speed', 'increase', 'ha

### Preprocessing + BERT Embedding

In [131]:
def bert_partial(column):
    """Build the cleaning + BERT sentence-embedding stages for one text column.

    The cleaning stages are taken from ``preprocessing_partial_pipeline`` so
    the two pipelines cannot drift apart. Its Finisher is dropped here because
    it cleans up the annotation columns that the embedding stages still need.

    Parameters
    ----------
    column : str
        Name of the raw text column to embed (e.g. ``"question1"``).

    Returns
    -------
    list
        Unfitted stages in execution order: the cleaning stages (document
        assembler through stop-word cleaner), BERT embeddings, sentence
        embeddings (average pooling) and an embeddings finisher that writes
        vectors to ``<column>_finished_sentence_embeddings``.
    """
    # Same seven cleaning stages as the preprocessing pipeline, minus Finisher.
    cleaning_stages = [
        stage for stage in preprocessing_partial_pipeline(column)
        if not isinstance(stage, Finisher)
    ]

    # NOTE(review): a *cased* BERT model is combined with
    # setCaseSensitive(False); behaviour is kept as-is here -- confirm whether
    # an uncased model or case-sensitive matching was intended.
    bert_embeddings = BertEmbeddings \
        .pretrained('bert_base_cased', 'en') \
        .setInputCols([column+"_document", column+"_cleanTokens"]) \
        .setOutputCol(column+"_bert") \
        .setCaseSensitive(False) \
        .setPoolingLayer(0)

    # Pool the token embeddings into one vector per document by averaging.
    embeddingsSentence = SentenceEmbeddings() \
        .setInputCols([column+"_document", column+"_bert"]) \
        .setOutputCol(column+"_sentence_embeddings") \
        .setPoolingStrategy("AVERAGE")

    embeddings_finisher = EmbeddingsFinisher() \
        .setInputCols([column+"_sentence_embeddings"]) \
        .setOutputCols([column+"_finished_sentence_embeddings"]) \
        .setOutputAsVector(True) \
        .setCleanAnnotations(True)

    return cleaning_stages + [bert_embeddings, embeddingsSentence, embeddings_finisher]

def bert_pipeline():
    """Assemble one unfitted Pipeline that embeds both questions and indexes the label.

    Stage order: the BERT stages for "question1", then those for "question2",
    then a StringIndexer that maps "is_duplicate" to a numeric "label" column.
    """
    stages = []
    for question_column in ("question1", "question2"):
        stages.extend(bert_partial(question_column))
    stages.append(StringIndexer(inputCol="is_duplicate", outputCol="label"))
    return Pipeline(stages=stages)

In [132]:
# Build the BERT pipeline (this loads the pretrained models).
nlp_pipeline_bert = bert_pipeline()

# Fit on the sample; this trains the estimator stages (spell checker, label indexer).
nlp_model_bert = nlp_pipeline_bert.fit(df_limited)

# Apply the fitted pipeline to obtain sentence embeddings and the numeric label.
processed_bert = nlp_model_bert.transform(df_limited)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


In [133]:
# Show the first five transformed rows (embedding columns are still arrays of vectors).
processed_bert.show(5)

+---+----+----+--------------------+--------------------+------------+--------------------------------------+--------------------------------------+-----+
| id|qid1|qid2|           question1|           question2|is_duplicate|question1_finished_sentence_embeddings|question2_finished_sentence_embeddings|label|
+---+----+----+--------------------+--------------------+------------+--------------------------------------+--------------------------------------+-----+
|  0|   1|   2|What is the step ...|What is the step ...|           0|                  [[0.1943234652280...|                  [[0.2024581581354...|  0.0|
|  1|   3|   4|What is the story...|What would happen...|           0|                  [[-0.479584932327...|                  [[0.0168425478041...|  0.0|
|  2|   5|   6|How can I increas...|How can Internet ...|           0|                  [[0.3350400924682...|                  [[-0.360031187534...|  0.0|
|  3|   7|   8|Why am I mentally...|Find the remainde...|           0|

In [134]:
from pyspark.sql.functions import explode

# The finished embedding columns are arrays of vectors; explode each one so
# every row carries a plain vector per question.
processed_bert2 = processed_bert.withColumn(
    "q1_features", explode("question1_finished_sentence_embeddings")
)
processed_bert3 = processed_bert2.withColumn(
    "q2_features", explode("question2_finished_sentence_embeddings")
)

# Keep only the raw questions, their feature vectors and the label.
kept_columns = ["question1", "question2", "q1_features", "q2_features", "label"]
bert_final = processed_bert3.select(*kept_columns)

In [135]:
# Show the first five rows of the final feature table.
bert_final.show(5)

+--------------------+--------------------+--------------------+--------------------+-----+
|           question1|           question2|         q1_features|         q2_features|label|
+--------------------+--------------------+--------------------+--------------------+-----+
|What is the step ...|What is the step ...|[0.19432346522808...|[0.20245815813541...|  0.0|
|What is the story...|What would happen...|[-0.4795849323272...|[0.01684254780411...|  0.0|
|How can I increas...|How can Internet ...|[0.33504009246826...|[-0.3600311875343...|  0.0|
|Why am I mentally...|Find the remainde...|[0.08551444858312...|[-0.6504728794097...|  0.0|
|Which one dissolv...|Which fish would ...|[0.15526680648326...|[-0.5414510369300...|  0.0|
+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows

