In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import sparknlp # nlp processing
from sklearn.model_selection import train_test_split # splitting data

import matplotlib.pyplot as plt # visualisation
import seaborn as sns # visualisation 
%matplotlib inline

In [2]:
randomState = np.random.RandomState(seed=42) # for creating same randomness in each time

In [3]:
spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.5.0
Apache Spark version:  2.4.5


In [4]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [5]:
from pyspark.sql import SQLContext

sql = SQLContext(spark)

In [5]:
from pyspark.sql import SQLContext

sql = SQLContext(spark)

df = spark.read.option("header", True).option("inferSchema", True).csv("dataset/clean_data.csv")
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- qid1: string (nullable = true)
 |-- qid2: string (nullable = true)
 |-- question1: string (nullable = true)
 |-- question2: string (nullable = true)
 |-- is_duplicate: string (nullable = true)



In [8]:
df.show(5)

+---+----+----+--------------------+--------------------+------------+
| id|qid1|qid2|           question1|           question2|is_duplicate|
+---+----+----+--------------------+--------------------+------------+
|  0|   1|   2|What is the step ...|What is the step ...|           0|
|  1|   3|   4|What is the story...|What would happen...|           0|
|  2|   5|   6|How can I increas...|How can Internet ...|           0|
|  3|   7|   8|Why am I mentally...|Find the remainde...|           0|
|  4|   9|  10|Which one dissolv...|Which fish would ...|           0|
+---+----+----+--------------------+--------------------+------------+
only showing top 5 rows



In [6]:
def bert_partial(column):
    document_assembler = DocumentAssembler() \
        .setInputCol(column) \
        .setOutputCol(column+"_document")\
        .setCleanupMode("shrink") 
    
    sentence_detector = SentenceDetector() \
        .setInputCols([column+"_document"]) \
        .setOutputCol(column+"_sentence") \
        .setUseAbbreviations(True)
    
    tokenizer = Tokenizer() \
        .setInputCols([column+"_sentence"]) \
        .setOutputCol(column+"_token")
    
    spell_checker = NorvigSweetingApproach() \
        .setInputCols([column+"_token"]) \
        .setOutputCol(column+"_checked") \
        .setDictionary("./spell/coca2017.txt", "[a-zA-Z]+")
    
    normalizer = Normalizer() \
        .setInputCols([column+"_checked"]) \
        .setOutputCol(column+"_normalized")
    
    lemma = LemmatizerModel.pretrained('lemma_antbnc') \
        .setInputCols([column+"_normalized"]) \
        .setOutputCol(column+"_lemma")
   
    stopwords_cleaner = StopWordsCleaner()\
        .setInputCols(column+"_lemma")\
        .setOutputCol(column+"_cleanTokens")\
        .setCaseSensitive(False)
    
    bert_embeddings = BertEmbeddings\
        .pretrained('bert_base_cased', 'en') \
        .setInputCols([column+"_document",column+"_cleanTokens"])\
        .setOutputCol(column+"_bert")\
        .setCaseSensitive(False)\
        .setPoolingLayer(0)

    embeddingsSentence = SentenceEmbeddings() \
          .setInputCols([column+"_document", column+"_bert"]) \
          .setOutputCol(column+"_sentence_embeddings") \
          .setPoolingStrategy("AVERAGE")

    embeddings_finisher = EmbeddingsFinisher() \
        .setInputCols([column+"_sentence_embeddings"]) \
        .setOutputCols([column+"_finished_sentence_embeddings"]) \
        .setOutputAsVector(True)\
        .setCleanAnnotations(True)

    return [document_assembler, sentence_detector, tokenizer, spell_checker, normalizer, lemma, \
            stopwords_cleaner, bert_embeddings, embeddingsSentence, embeddings_finisher]

In [7]:
def bert_pipeline():
    
    q1_stages = bert_partial("question1")
    
    q2_stages = bert_partial("question2")
    
    #label_stringIdx = StringIndexer(inputCol = "is_duplicate", outputCol = "label")
    
    pipeline = Pipeline(stages=q1_stages+q2_stages)
    
    return pipeline

In [21]:
df_limited = df.limit(2000)

In [33]:
nlp_pipeline_bert = bert_pipeline()

nlp_model_bert = nlp_pipeline_bert.fit(df_limited)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


TypeError: fit() got an unexpected keyword argument 'verbose'

In [23]:
result = nlp_model_bert.transform(df_limited)

In [31]:
res_df = result.select("id", "qid1", "qid2", "question1_finished_sentence_embeddings", "question2_finished_sentence_embeddings", "is_duplicate").toPandas()

In [32]:
res_df.head()

Unnamed: 0,id,qid1,qid2,question1_finished_sentence_embeddings,question2_finished_sentence_embeddings,is_duplicate
0,0,1,2,"[[0.19432346522808075, 0.49967044591903687, -0...","[[0.20245815813541412, 0.5602966547012329, -0....",0
1,1,3,4,"[[-0.4795849323272705, 0.6140735745429993, -0....","[[0.016842547804117203, 0.36806219816207886, -...",0
2,2,5,6,"[[0.3350400924682617, 0.2731911242008209, 0.21...","[[-0.3600311875343323, 0.07215926796197891, 0....",0
3,3,7,8,"[[0.08551444858312607, 0.016606474295258522, -...","[[-0.65047287940979, -0.283554345369339, -0.37...",0
4,4,9,10,"[[0.15526680648326874, 0.3027419447898865, 0.1...","[[-0.5414510369300842, 0.25272536277770996, -0...",0


In [36]:
chunks = df.repartition(202145)

In [None]:
result_df = pd.DataFrame(columns=["id", "qid1", "qid2", "question1_finished_sentence_embeddings", "question2_finished_sentence_embeddings", "is_duplicate"])

for ix, chunk in enumerate(chunks):
    if ix == 2:
        break
    nlp_model_bert = nlp_pipeline_bert.fit(chunk)
    result = nlp_model_bert.transform(chunk)
    result_df = pd.concat([result_df, result], sort=False)

In [8]:
data = pd.read_csv("dataset/clean_data.csv")
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404287 entries, 0 to 404286
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404287 non-null  int64 
 1   qid1          404287 non-null  int64 
 2   qid2          404287 non-null  int64 
 3   question1     404287 non-null  object
 4   question2     404287 non-null  object
 5   is_duplicate  404287 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [12]:
nlp_pipeline_bert = bert_pipeline()

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


In [13]:
for ix, df_part in enumerate(np.array_split(data, 202)):
    chunk = sql.createDataFrame(df_part)
    nlp_model_bert = nlp_pipeline_bert.fit(chunk)
    
    result = nlp_model_bert.transform(chunk)
    res_df = result.select("id", "qid1", "qid2", "question1_finished_sentence_embeddings", "question2_finished_sentence_embeddings", "is_duplicate").toPandas()
    if ix == 0:
        result_df = res_df
    else:
        result_df = pd.concat([result_df, res_df], sort=False).reset_index(drop=True)
        
    if ix == 2:
        break

In [14]:
result_df.head()

Unnamed: 0,id,qid1,qid2,question1_finished_sentence_embeddings,question2_finished_sentence_embeddings,is_duplicate
0,0,1,2,"[[0.19432346522808075, 0.49967044591903687, -0...","[[0.20245815813541412, 0.5602966547012329, -0....",0
1,1,3,4,"[[-0.4795849323272705, 0.6140735745429993, -0....","[[0.016842547804117203, 0.36806219816207886, -...",0
2,2,5,6,"[[0.3350400924682617, 0.2731911242008209, 0.21...","[[-0.3600311875343323, 0.07215926796197891, 0....",0
3,3,7,8,"[[0.08551444858312607, 0.016606474295258522, -...","[[-0.65047287940979, -0.283554345369339, -0.37...",0
4,4,9,10,"[[0.15526680648326874, 0.3027419447898865, 0.1...","[[-0.5414510369300842, 0.25272536277770996, -0...",0


In [15]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6006 entries, 0 to 6005
Data columns (total 6 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   id                                      6006 non-null   int64 
 1   qid1                                    6006 non-null   int64 
 2   qid2                                    6006 non-null   int64 
 3   question1_finished_sentence_embeddings  6006 non-null   object
 4   question2_finished_sentence_embeddings  6006 non-null   object
 5   is_duplicate                            6006 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 281.7+ KB


In [None]:
#result_df.to_csv("dataset/embeddings_data_string.csv", index=False)

In [19]:
result.printSchema()

root
 |-- id: long (nullable = true)
 |-- qid1: long (nullable = true)
 |-- qid2: long (nullable = true)
 |-- question1: string (nullable = true)
 |-- question2: string (nullable = true)
 |-- is_duplicate: long (nullable = true)
 |-- question1_finished_sentence_embeddings: array (nullable = true)
 |    |-- element: vector (containsNull = true)
 |-- question2_finished_sentence_embeddings: array (nullable = true)
 |    |-- element: vector (containsNull = true)

