In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import sparknlp # nlp processing
from sklearn.model_selection import train_test_split # splitting data

import matplotlib.pyplot as plt # visualisation
import seaborn as sns # visualisation 
%matplotlib inline



In [2]:
randomState = np.random.RandomState(seed=42) # for creating same randomness in each time

In [3]:
data = pd.read_csv("dataset/train.csv")
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404290 non-null  int64 
 1   qid1          404290 non-null  int64 
 2   qid2          404290 non-null  int64 
 3   question1     404289 non-null  object
 4   question2     404288 non-null  object
 5   is_duplicate  404290 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [5]:
data[data.question1.isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0


In [6]:
data[((data['qid1']==493340) | (data['qid2']==493340))]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0


In [7]:
data[data.question2.isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0


In [8]:
data[((data['qid1']==174364) | (data['qid2']==174364))]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0


In [12]:
data = data.dropna().reset_index(drop=True)

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404287 entries, 0 to 404286
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404287 non-null  int64 
 1   qid1          404287 non-null  int64 
 2   qid2          404287 non-null  int64 
 3   question1     404287 non-null  object
 4   question2     404287 non-null  object
 5   is_duplicate  404287 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [14]:
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [16]:
# data.to_csv("dataset/clean_data.csv", index=False)

In [18]:
spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.5.0
Apache Spark version:  2.4.4


In [167]:
from pyspark.sql import SQLContext

sql = SQLContext(spark)

df = spark.read.option("header", True).option("inferSchema", True).csv("dataset/clean_data.csv")
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- qid1: string (nullable = true)
 |-- qid2: string (nullable = true)
 |-- question1: string (nullable = true)
 |-- question2: string (nullable = true)
 |-- is_duplicate: string (nullable = true)



In [168]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [169]:
def glove_partial_pipeline(column):
    document_assembler = DocumentAssembler() \
        .setInputCol(column) \
        .setOutputCol(column+"_document")
    
    sentence_detector = SentenceDetector() \
        .setInputCols([column+"_document"]) \
        .setOutputCol(column+"_sentence") \
        .setUseAbbreviations(True)
    
    tokenizer = Tokenizer() \
        .setInputCols([column+"_sentence"]) \
        .setOutputCol(column+"_token")

    normalizer = Normalizer() \
        .setInputCols([column+"_token"]) \
        .setOutputCol(column+"_normalized")

    stopwords_cleaner = StopWordsCleaner()\
        .setInputCols(column+"_normalized")\
        .setOutputCol(column+"_cleanTokens")\
        .setCaseSensitive(False)

    glove_embeddings = WordEmbeddingsModel().pretrained() \
        .setInputCols([column+"_document",column+'_cleanTokens'])\
        .setOutputCol(column+"_embeddings")\
        .setCaseSensitive(False)

    embeddingsSentence = SentenceEmbeddings() \
        .setInputCols([column+"_document", column+"_embeddings"]) \
        .setOutputCol(column+"_sentence_embeddings") \
        .setPoolingStrategy("AVERAGE")

    return [document_assembler, sentence_detector, tokenizer, normalizer, stopwords_cleaner, glove_embeddings, embeddingsSentence]

def glove_pipeline():

    q1_stages = glove_partial_pipeline("question1")
    
    q2_stages = glove_partial_pipeline("question2")
     
    glove_pipeline = Pipeline(stages=q1_stages+q2_stages)
    
    return glove_pipeline

In [170]:
df = df.limit(2000)

In [171]:
glove_pipeline = glove_pipeline()
glove_nlp_model = glove_pipeline.fit(df)
glove_processed = glove_nlp_model.transform(df).persist()

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [185]:
pddf = glove_processed.select("question1_sentence_embeddings", "question2_sentence_embeddings", "is_duplicate").toPandas()
pddf.head()

Unnamed: 0,question1_sentence_embeddings,question2_sentence_embeddings,is_duplicate
0,"[(sentence_embeddings, 0, 65, What is the step...","[(sentence_embeddings, 0, 56, What is the step...",0
1,"[(sentence_embeddings, 0, 50, What is the stor...","[(sentence_embeddings, 0, 87, What would happe...",0
2,"[(sentence_embeddings, 0, 72, How can I increa...","[(sentence_embeddings, 0, 58, How can Internet...",0
3,"[(sentence_embeddings, 0, 49, Why am I mentall...","[(sentence_embeddings, 0, 64, Find the remaind...",0
4,"[(sentence_embeddings, 0, 75, Which one dissol...","[(sentence_embeddings, 0, 38, Which fish would...",0


AttributeError: 'list' object has no attribute 'asDict'

In [172]:
# set seed for reproducibility
(trainingData, testData) = glove_processed.randomSplit([0.7, 0.3], seed=100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 1400
Test Dataset Count: 600


In [173]:
label_stringIdx = StringIndexer(inputCol = "is_duplicate", outputCol = "label")
assembler = VectorAssembler(inputCols=["question1_sentence_embeddings", "question2_sentence_embeddings"], outputCol='features')
feature_pipeline = Pipeline(stages=[label_stringIdx, assembler])

In [None]:
feature_model = feature_pipeline.fit(trainingData)

train_featurized = feature_model.transform(trainingData).persist()

In [176]:
import tensorflow as tf
from tensorflow import keras

In [177]:
tf.__version__

'2.2.0'

In [178]:
keras.__version__

'2.3.0-tf'

In [None]:
from pyspark.ml.classification import LogisticRegression

trainingData = trainingData.select("question1_finished_sentence_embeddings", "question2_finished_sentence_embeddings")

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(trainingData)

#predictions = lrModel.transform(testData)

In [163]:
def partial_pipeline(column):
    document_assembler = DocumentAssembler() \
        .setInputCol(column) \
        .setOutputCol(column+"_document")
    
    sentence_detector = SentenceDetector() \
        .setInputCols([column+"_document"]) \
        .setOutputCol(column+"_sentence") \
        .setUseAbbreviations(True)
    
    tokenizer = Tokenizer() \
        .setInputCols([column+"_sentence"]) \
        .setOutputCol(column+"_token")

    normalizer = Normalizer() \
        .setInputCols([column+"_token"]) \
        .setOutputCol(column+"_normalized")

    stopwords_cleaner = StopWordsCleaner()\
        .setInputCols(column+"_normalized")\
        .setOutputCol(column+"_cleanTokens")\
        .setCaseSensitive(False)

    glove_embeddings = WordEmbeddingsModel().pretrained() \
        .setInputCols([column+"_document",column+'_cleanTokens'])\
        .setOutputCol(column+"_embeddings")\
        .setCaseSensitive(False)

    embeddingsSentence = SentenceEmbeddings() \
        .setInputCols([column+"_document", column+"_embeddings"]) \
        .setOutputCol(column+"_sentence_embeddings") \
        .setPoolingStrategy("AVERAGE")

    return [document_assembler, sentence_detector, tokenizer, normalizer, stopwords_cleaner, glove_embeddings, embeddingsSentence]

def classifierDL_pipeline():

    q1_stages = glove_partial_pipeline("question1")
    
    q2_stages = glove_partial_pipeline("question2")
    
    classsifierdl = ClassifierDLApproach()\
        .setInputCols(["question1_sentence_embeddings", "question2_sentence_embeddings"])\
        .setOutputCol("class")\
        .setLabelColumn("is_duplicate")\
        .setMaxEpochs(10)\
        .setEnableOutputLogs(True)

    pipeline = Pipeline(stages=q1_stages+q2_stages+[classsifierdl])
    
    return pipeline

In [164]:
classsifierdl_pipeline = classifierDL_pipeline()
classsifierdl_nlp_model = classsifierdl_pipeline.fit(df)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [12]:
import numpy as np

import pandas as pd
arr = "[[2,3,4],[5,6,7]]"

In [10]:
np.fromstring(arr)

  """Entry point for launching an IPython kernel.


ValueError: string size must be a multiple of element size

In [13]:
result_df = pd.DataFrame(columns=["id", "qid1", "qid2", "question1_finished_sentence_embeddings", "question2_finished_sentence_embeddings", "is_duplicate"])

In [15]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 6 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   id                                      0 non-null      object
 1   qid1                                    0 non-null      object
 2   qid2                                    0 non-null      object
 3   question1_finished_sentence_embeddings  0 non-null      object
 4   question2_finished_sentence_embeddings  0 non-null      object
 5   is_duplicate                            0 non-null      object
dtypes: object(6)
memory usage: 0.0+ bytes
