In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing
import sparknlp # nlp processing
from sklearn.model_selection import train_test_split # splitting data

import matplotlib.pyplot as plt # visualisation
import seaborn as sns # visualisation 
%matplotlib inline

In [3]:
randomState = np.random.RandomState(seed=42) # for creating same randomness in each time

In [4]:
spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.5.0
Apache Spark version:  2.4.5


In [5]:
from pyspark.sql import SQLContext

sql = SQLContext(spark)

df = spark.read.option("header", True).option("inferSchema", True).csv("dataset/clean_data.csv")
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- qid1: string (nullable = true)
 |-- qid2: string (nullable = true)
 |-- question1: string (nullable = true)
 |-- question2: string (nullable = true)
 |-- is_duplicate: string (nullable = true)



In [15]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

### Preprocessing

In [125]:
def preprocessing_partial_pipeline(column):
    document_assembler = DocumentAssembler() \
        .setInputCol(column) \
        .setOutputCol(column+"_document")\
        .setCleanupMode("shrink") 
    
    sentence_detector = SentenceDetector() \
        .setInputCols([column+"_document"]) \
        .setOutputCol(column+"_sentence") \
        .setUseAbbreviations(True)
    
    tokenizer = Tokenizer() \
        .setInputCols([column+"_sentence"]) \
        .setOutputCol(column+"_token")
    
    spell_checker = NorvigSweetingApproach() \
        .setInputCols([column+"_token"]) \
        .setOutputCol(column+"_checked") \
        .setDictionary("./spell/coca2017.txt", "[a-zA-Z]+")
    
    normalizer = Normalizer() \
        .setInputCols([column+"_checked"]) \
        .setOutputCol(column+"_normalized")
    
    lemma = LemmatizerModel.pretrained('lemma_antbnc') \
        .setInputCols([column+"_normalized"]) \
        .setOutputCol(column+"_lemma")
    
    stopwords_cleaner = StopWordsCleaner()\
        .setInputCols(column+"_lemma")\
        .setOutputCol(column+"_cleanTokens")\
        .setCaseSensitive(False)
   
    finisher = Finisher() \
        .setInputCols([column+"_cleanTokens"]) \
        .setOutputCols([column+"_finished"])\
        .setIncludeMetadata(False)\
        .setCleanAnnotations(True)
    
    return [document_assembler, sentence_detector, tokenizer, spell_checker, normalizer, lemma, stopwords_cleaner, finisher]

def preprocessing_pipeline():

    q1_stages = preprocessing_partial_pipeline("question1")
    
    q2_stages = preprocessing_partial_pipeline("question2")
     
    pipeline = Pipeline(stages=q1_stages+q2_stages)
    
    return pipeline

In [126]:
df_limited = df.limit(2000)

In [127]:
# Before Preprocessing 
df_limited.take(5)

[Row(id='0', qid1='1', qid2='2', question1='What is the step by step guide to invest in share market in india?', question2='What is the step by step guide to invest in share market?', is_duplicate='0'),
 Row(id='1', qid1='3', qid2='4', question1='What is the story of Kohinoor (Koh-i-Noor) Diamond?', question2='What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?', is_duplicate='0'),
 Row(id='2', qid1='5', qid2='6', question1='How can I increase the speed of my internet connection while using a VPN?', question2='How can Internet speed be increased by hacking through DNS?', is_duplicate='0'),
 Row(id='3', qid1='7', qid2='8', question1='Why am I mentally very lonely? How can I solve it?', question2='Find the remainder when [math]23^{24}[/math] is divided by 24,23?', is_duplicate='0'),
 Row(id='4', qid1='9', qid2='10', question1='Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?', question2='Which fish would survive in salt wat

In [129]:
pre_pipeline = preprocessing_pipeline()
model = pre_pipeline.fit(df_limited)
df_processed = model.transform(df_limited).persist().select("question1", "question2", "question1_finished", "question2_finished", "is_duplicate")

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [130]:
df_processed.take(5)

[Row(question1='What is the step by step guide to invest in share market in india?', question2='What is the step by step guide to invest in share market?', question1_finished=['step', 'step', 'guide', 'invest', 'share', 'market', 'india'], question2_finished=['step', 'step', 'guide', 'invest', 'share', 'market'], is_duplicate='0'),
 Row(question1='What is the story of Kohinoor (Koh-i-Noor) Diamond?', question2='What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?', question1_finished=['story', 'kohinoor', 'KohiNoor', 'diamond'], question2_finished=['happen', 'indian', 'government', 'steal', 'kohinoor', 'KohiNoor', 'diamond', 'back'], is_duplicate='0'),
 Row(question1='How can I increase the speed of my internet connection while using a VPN?', question2='How can Internet speed be increased by hacking through DNS?', question1_finished=['increase', 'speed', 'internet', 'connection', 'use', 'VPN'], question2_finished=['internet', 'speed', 'increase', 'ha

### Preprocessing + Bert Embedding

In [153]:
def bert_partial(column):
    document_assembler = DocumentAssembler() \
        .setInputCol(column) \
        .setOutputCol(column+"_document")\
        .setCleanupMode("shrink") 
    
    sentence_detector = SentenceDetector() \
        .setInputCols([column+"_document"]) \
        .setOutputCol(column+"_sentence") \
        .setUseAbbreviations(True)
    
    tokenizer = Tokenizer() \
        .setInputCols([column+"_sentence"]) \
        .setOutputCol(column+"_token")
    
    spell_checker = NorvigSweetingApproach() \
        .setInputCols([column+"_token"]) \
        .setOutputCol(column+"_checked") \
        .setDictionary("./spell/coca2017.txt", "[a-zA-Z]+")
    
    normalizer = Normalizer() \
        .setInputCols([column+"_checked"]) \
        .setOutputCol(column+"_normalized")
    
    lemma = LemmatizerModel.pretrained('lemma_antbnc') \
        .setInputCols([column+"_normalized"]) \
        .setOutputCol(column+"_lemma")
   
    stopwords_cleaner = StopWordsCleaner()\
        .setInputCols(column+"_lemma")\
        .setOutputCol(column+"_cleanTokens")\
        .setCaseSensitive(False)
    
    bert_embeddings = BertEmbeddings\
        .pretrained('bert_base_cased', 'en') \
        .setInputCols([column+"_document",column+"_cleanTokens"])\
        .setOutputCol(column+"_bert")\
        .setCaseSensitive(False)\
        .setPoolingLayer(0)

    embeddingsSentence = SentenceEmbeddings() \
          .setInputCols([column+"_document", column+"_bert"]) \
          .setOutputCol(column+"_sentence_embeddings") \
          .setPoolingStrategy("AVERAGE")

    embeddings_finisher = EmbeddingsFinisher() \
        .setInputCols([column+"_sentence_embeddings"]) \
        .setOutputCols([column+"_finished_sentence_embeddings"]) \
        .setOutputAsVector(True)\
        .setCleanAnnotations(False)

    return [document_assembler, sentence_detector, tokenizer, spell_checker, normalizer, lemma, \
            stopwords_cleaner, bert_embeddings, embeddingsSentence, embeddings_finisher]

def bert_pipeline():
     
    q1_stages = bert_partial("question1")
    
    q2_stages = bert_partial("question2")
    
    label_stringIdx = StringIndexer(inputCol = "is_duplicate", outputCol = "label")
    
    classsifierdl = ClassifierDLApproach()\
        .setInputCols(["question1_sentence_embeddings","question2_sentence_embeddings"])\
        .setOutputCol("class")\
        .setLabelColumn("label")\
        .setMaxEpochs(15)\
        .setEnableOutputLogs(True)
    
    pipeline = Pipeline(stages=q1_stages+q2_stages+[label_stringIdx, classsifierdl])
    
    return pipeline

In [154]:
nlp_pipeline_bert = bert_pipeline()

nlp_model_bert = nlp_pipeline_bert.fit(df_limited)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


In [155]:
from sklearn.metrics import classification_report, accuracy_score

df = nlp_model_bert.transform(df_limited).select('label', 'question1', 'question2', 'class.result').toPandas()

In [None]:
#processed_bert = nlp_model_bert.transform(df_limited)

In [157]:
df['result'] = df['result'].apply(lambda x: x[0])

In [160]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   label      2000 non-null   float64
 1   question1  2000 non-null   object 
 2   question2  2000 non-null   object 
 3   result     2000 non-null   object 
dtypes: float64(1), object(3)
memory usage: 62.6+ KB


In [161]:
df['result'] = df['result'].apply(lambda x: float(x))

In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   label      2000 non-null   float64
 1   question1  2000 non-null   object 
 2   question2  2000 non-null   object 
 3   result     2000 non-null   float64
dtypes: float64(2), object(2)
memory usage: 62.6+ KB


In [163]:
df.head()

Unnamed: 0,label,question1,question2,result
0,0.0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0.0
1,0.0,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0.0
2,0.0,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0.0
3,0.0,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0.0
4,0.0,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0.0


In [164]:
print(classification_report(df.label, df.result))
print(accuracy_score(df.label, df.result))

              precision    recall  f1-score   support

         0.0       0.62      1.00      0.77      1250
         1.0       0.00      0.00      0.00       741
         2.0       0.00      0.00      0.00         1
         3.0       0.00      0.00      0.00         1
         4.0       0.00      0.00      0.00         1
         5.0       0.00      0.00      0.00         1
         6.0       0.00      0.00      0.00         1
         7.0       0.00      0.00      0.00         1
         8.0       0.00      0.00      0.00         1
         9.0       0.00      0.00      0.00         1
        10.0       0.00      0.00      0.00         1

    accuracy                           0.62      2000
   macro avg       0.06      0.09      0.07      2000
weighted avg       0.39      0.62      0.48      2000

0.625


  _warn_prf(average, modifier, msg_start, len(result))


In [169]:
df[df.label == 10]

Unnamed: 0,label,question1,question2,result
813,10.0,"""Is there a """"blind trust"""" provision for Amer...","and how is it enforced?""",0.0
