In [1]:
import sparknlp
# Obtain the session object from Spark NLP's start helper; later cells run SQL through it.
spark = sparknlp.start()

In [2]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd

In [3]:
# Pull every row and column of the `train_1_csv` table into a Spark DataFrame.
traindata = spark.sql("SELECT * FROM train_1_csv")

In [4]:
# Quick look at the first five rows as loaded.
traindata.show(n=5)

In [5]:
# Discard every row that has a null in any column.
# NOTE(review): this rebinds `traindata`, so the unfiltered frame is only
# recoverable by re-running the load cell.
traindata = traindata.dropna()

In [6]:
# Preview the first five values of the "text" column.
traindata.select("text").show(n=5)

In [7]:
# Preview the label column. Its name carries a trailing carriage return ("target\r"),
# which is why the literal below includes the escape.
traindata.select("target\r").show(n=5)

In [8]:
#only the text and target columns are useful for analysis

In [9]:
# Rename the label column so its name no longer ends in a carriage return.
traindata = traindata.withColumnRenamed(
    'target\r',
    'target',
)

In [10]:
# Strip every whitespace character from the label values.
# The previous pattern removed only '\r', so a stray space, tab or newline would
# have survived and produced extra label classes (e.g. "1" vs "1 ").
# NOTE(review): this star import shadows Python builtins such as sum/min/max for
# the rest of the notebook; prefer importing only the names that are used.
from pyspark.sql.functions import *
traindata = traindata.withColumn('target', regexp_replace('target', r'\s+', ''))

In [11]:
from pyspark.sql.functions import col

# Class balance check: number of rows per label, largest group first.
(
    traindata
    .groupBy("target")
    .count()
    .orderBy(col("count").desc())
    .show()
)

In [12]:
# List the column names currently on the frame (checked before dropping the unused ones).
traindata.columns

In [13]:
# Columns not used by the classifier; what remains is the text and its label.
unused_columns = ['id', 'keyword', 'location']
traindata = traindata.drop(*unused_columns)

In [14]:
# Confirm the shape of the frame after renaming, cleaning and dropping columns.
traindata.show(n=5)

In [15]:
#creating nlp pipeline using spark-nlp

In [16]:
# First pipeline stage: reads the raw "text" column and writes a "document" column
# that the downstream annotators consume.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)


In [17]:
# Sentence detector reading "document" and writing "sentence".
# NOTE(review): this stage is not listed in either Pipeline defined later in the
# notebook, and no other stage reads the "sentence" column.
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

In [18]:
# Tokenizer working directly on the "document" column; emits "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

In [19]:
# Normalizer stage: consumes "token", produces "normalized" (library default settings).
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

In [20]:
# Stop-word removal over the normalized tokens, with case sensitivity switched off.
# Output column: "cleanTokens".
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

In [21]:
# Pretrained lemmatizer ('lemma_antbnc') applied to the stop-word-filtered tokens.
lemma = (
    LemmatizerModel.pretrained('lemma_antbnc')
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

In [22]:
# Pretrained word embeddings computed over the lemmas, case-insensitively.
# `pretrained` is invoked on the class itself (as is done for LemmatizerModel and
# UniversalSentenceEncoder in this notebook) so that no throwaway
# WordEmbeddingsModel instance is constructed just to reach the loader.
# No model name is passed, so the library's default pretrained model is used.
word_embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(["document", "lemma"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

In [23]:
# Collapse the per-token "embeddings" into one "sentence_embeddings" vector per
# document using the "AVERAGE" pooling strategy.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

In [24]:
# Trainable classifier for the word-embedding pipeline: learns "target" from
# "sentence_embeddings", writes predictions to "class", trains with max epochs
# set to 5 and output logs enabled.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("target")
    .setMaxEpochs(5)
    .setEnableOutputLogs(True)
)

In [25]:
# Word-embedding classification pipeline. Stages are listed in execution order;
# each one reads a column produced by an earlier stage.
clf_stages = [
    document,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    lemma,
    word_embeddings,
    embeddingsSentence,
    classifierdl,
]
clf_pipeline = Pipeline(stages=clf_stages)

In [26]:
# Fit every stage of the word-embedding pipeline on the cleaned training frame.
clf_pipelineModel = clf_pipeline.fit(traindata)

In [27]:
# Run the fitted pipeline back over the training frame to get its predictions.
df = clf_pipelineModel.transform(traindata)

In [28]:
# Keep only the `result` field of the classifier's "class" output column.
result = df.select("class.result")

In [29]:
# Preview the predicted labels (default number of rows).
result.show()

In [30]:
# Inspect the stages held by the fitted pipeline model.
clf_pipelineModel.stages

In [31]:
from sklearn.metrics import classification_report,accuracy_score

# Score the word-embedding pipeline.
# NOTE(review): predictions are made on the same rows the model was fitted on,
# so these figures describe training fit rather than held-out performance.
predictions = clf_pipelineModel.transform(traindata)
df = predictions.select('text', 'target', "class.result").toPandas()
# Each row's "result" is indexable; keep its first element as the predicted label.
df['result'] = df['result'].apply(lambda labels: labels[0])
y_true, y_pred = df.target, df.result
print(classification_report(y_true, y_pred))
print(accuracy_score(y_true, y_pred))

In [32]:
### Let's use UniversalSentenceEncoder

In [33]:
# Pretrained Universal Sentence Encoder: maps "document" straight to
# "sentence_embeddings", with no tokenizer or lemmatizer stages in between.
use = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

In [34]:
# Second trainable classifier, configured the same way as `classifierdl` but kept
# as a separate object for the sentence-encoder pipeline.
# NOTE(review): the name is spelled with three s's; the pipeline cell refers to it
# by this exact spelling.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("target")
    .setMaxEpochs(5)
    .setEnableOutputLogs(True)
)

In [35]:
# Sentence-encoder pipeline: document assembly, encoder, then classifier.
use_stages = [document, use, classsifierdl]
use_clf_pipeline = Pipeline(stages=use_stages)

In [36]:
# Fit the sentence-encoder pipeline on the same cleaned training frame.
use_pipelineModel = use_clf_pipeline.fit(
    traindata
)

In [37]:
from sklearn.metrics import classification_report,accuracy_score

# Evaluate the Universal Sentence Encoder pipeline.
# The predictions must come from `use_pipelineModel`; transforming with
# `clf_pipelineModel` here would only repeat the word-embedding pipeline's
# numbers and leave the encoder model unscored.
# NOTE(review): predictions are made on the same rows the model was fitted on,
# so these figures describe training fit rather than held-out performance.
df = use_pipelineModel.transform(traindata).select('text', 'target', "class.result").toPandas()
# Each row's "result" is indexable; keep its first element as the predicted label.
df['result'] = df['result'].apply(lambda x: x[0])
print(classification_report(df.target, df.result))
print(accuracy_score(df.target, df.result))