In [42]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.5.5

openjdk version "1.8.0_265"
OpenJDK Runtime Environment (build 1.8.0_265-8u265-b01-0ubuntu2~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.265-b01, mixed mode)


Importing Packages

In [43]:
import sparknlp 
spark= sparknlp.start()
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

from pyspark.sql import functions as f
from pyspark.ml import Pipeline

import pandas as pd 
import numpy as np

Load Dataset

In [44]:
test_df= spark.read.text("./test_set_tweets.txt").toDF("text")
test_df.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|22077441	10538487904	Ok today I have to find something to wear for fri cuz I don't think I have time any other day this week.. I'm thinking all black and pearls!	2010-03-15 17:35:58|
|22077441	10536835844	I am glad I'm having this show but I can't wait till it is over so I can rest and stop worrying !!	2010-03-15 16:53:44                                          |
|22077441	10536809086	Honestly I don't even know what's going on anymore	2010-03

In [45]:
test_df.count()

2422182

### Creating Ner Model. 
I use ner_dl because I want to get organization and person.  

In [46]:
document= DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")\

sentence= SentenceDetector()\
.setInputCols(["document"])\
.setOutputCol("sentence")

tokenizer= Tokenizer()\
.setInputCols(["sentence"])\
.setOutputCol("token")

normalizer= Normalizer()\
.setInputCols(["token"])\
.setOutputCol("normalized")

embedding= WordEmbeddingsModel.pretrained("glove_100d")\
.setInputCols(["sentence","normalized"])\
.setOutputCol("embedded")

ner= NerDLModel.pretrained("ner_dl", lang="en")\
.setInputCols(["sentence", "token", "embedded"])\
.setOutputCol("ner")

converter= NerConverter()\
.setInputCols(["sentence", "token", "ner"])\
.setOutputCol("ner_convert")

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


In [47]:
nlpPipeline= Pipeline(stages=[
                           document,
                           sentence,
                           tokenizer,
                           normalizer,
                           embedding,
                           ner,
                           converter
])

model= nlpPipeline.fit(test_df)
result= model.transform(test_df)

In [48]:
result.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [49]:
result.select("ner_convert.result").show(50, truncate=False)

+---------------------------------------------------------+
|result                                                   |
+---------------------------------------------------------+
|[]                                                       |
|[I'm]                                                    |
|[]                                                       |
|[I'm, @Iam_MarkyMark]                                    |
|[]                                                       |
|[]                                                       |
|[I'm, I'm, Lol]                                          |
|[@Iam_MarkyMark, aren't]                                 |
|[]                                                       |
|[Lol]                                                    |
|[Hmmm]                                                   |
|[]                                                       |
|[I'm]                                                    |
|[]                                     

In [50]:
result.select("sentence.result").show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                     |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[22077441	10538487904	Ok today I have to find something to wear for fri cuz I don't think I have time any other day this week., ., I'm thinking all black and pearls!, 2010-03-15 17:35:58]|
|[22077441	10536835844	I am glad I'm having this show but I can't wait till it is over so I can rest and stop worrying !!, 2010-03-15 16:53:44]                                             |
|[22077441	10536809086	Honestly I don't even know 

In [51]:
 from pyspark.sql import functions as F

In [52]:
result.select(F.explode(F.arrays_zip("ner_convert.result")).alias("col"))\
.select(F.expr("col['0']").alias("sentence")).show()

+--------------------+
|            sentence|
+--------------------+
|                 I'm|
|                 I'm|
|      @Iam_MarkyMark|
|                 I'm|
|                 I'm|
|                 Lol|
|      @Iam_MarkyMark|
|              aren't|
|                 Lol|
|                Hmmm|
|                 I'm|
|            @TRenee3|
|                Imma|
|                 Lol|
|   RT @Iam_MarkyMark|
|         Flint&wanna|
|Glam&Glory Fashio...|
|                 I'm|
|               Whooo|
|                I've|
+--------------------+
only showing top 20 rows



### Creating Date Matcher
This is for extracting the dates. 

In [53]:
match_date= PretrainedPipeline("match_datetime", lang="en")

result_date= match_date.transform(test_df)

match_datetime download started this may take some time.
Approx size to download 12.9 KB
[OK!]


In [54]:
result_date.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                date|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|22077441	10538487...|[[document, 0, 18...|[[document, 0, 12...|[[token, 0, 7, 22...|[[date, 24, 28, 2...|
|22077441	10536835...|[[document, 0, 13...|[[document, 0, 11...|[[token, 0, 7, 22...|[[date, 0, 9, 201...|
|22077441	10536809...|[[document, 0, 90...|[[document, 0, 90...|[[token, 0, 7, 22...|[[date, 72, 81, 2...|
|22077441	10534149...|[[document, 0, 15...|[[document, 0, 90...|[[token, 0, 7, 22...|[[date, 4, 32, 74...|
|22077441	10530203...|[[document, 0, 16...|[[document, 0, 62...|[[token, 0, 7, 22...|[[date, 11, 20, 2...|
|22077441	10525388...|[[document, 0, 10...|[[document, 0, 42...|[[token, 0, 7, 22...|[[date, 40, 49, 2...|
|22077441	10524902...|[[document, 0, 

In [55]:
result_date.select("date.result").show(truncate=False)

+------------------------------------------------+
|result                                          |
+------------------------------------------------+
|[2020/09/02, 2010/03/15, 2015/03/01]            |
|[2010/03/15, 2015/03/01]                        |
|[2010/03/15, 2015/03/01, 7441/01/03]            |
|[7441/01/01, 2010/03/15, 2015/03/01, 2010/03/03]|
|[2010/03/15, 2015/03/01]                        |
|[2010/03/15, 2015/03/01]                        |
|[2010/03/15, 2015/03/01]                        |
|[2010/03/15, 2015/03/01, 7441/03/03]            |
|[7441/03/01, 2010/03/15, 2015/03/01]            |
|[2010/03/15, 2015/03/01]                        |
|[2010/03/15, 2015/03/01]                        |
|[2020/09/02, 2010/03/15, 2015/03/01]            |
|[2010/03/14, 2014/03/01]                        |
|[2010/03/14, 2014/03/01]                        |
|[2010/03/14, 2014/03/01, 7441/01/03]            |
|[2010/03/14, 2014/03/01, 7441/03/03]            |
|[2010/03/14, 2014/03/01]      

In [56]:
result_date.select(F.explode(F.arrays_zip("sentence.result", "date.result")).alias("col"))\
.select(F.expr("col['0']").alias("sentence"),
        F.expr("col['1']").alias("date")).show()


+--------------------+----------+
|            sentence|      date|
+--------------------+----------+
|22077441	10538487...|2020/09/02|
|                   .|2010/03/15|
|I'm thinking all ...|2015/03/01|
| 2010-03-15 17:35:58|      null|
|22077441	10536835...|2010/03/15|
| 2010-03-15 16:53:44|2015/03/01|
|22077441	10536809...|2010/03/15|
|                null|2015/03/01|
|                null|7441/01/03|
|22077441	10534149...|7441/01/01|
|                   .|2010/03/15|
|                   .|2015/03/01|
|@Iam_MarkyMark sh...|2010/03/03|
|22077441	10530203...|2010/03/15|
|I don't feel like...|2015/03/01|
|I'm tired and fee...|      null|
|         And bored .|      null|
|                   .|      null|
|And lonely	2010-0...|      null|
|22077441	10525388...|2010/03/15|
+--------------------+----------+
only showing top 20 rows



### Creating Regex Matcher.
I create regex matcher because I want to get tweet ID's. 

In [57]:
rules= '''
    \S*\d+\S*, match any word that contains numbers
'''

with open("regex_rules.txt", "w") as f:
  f.write(rules)

In [58]:
regex_matcher = RegexMatcher()\
    .setInputCols('sentence', 'document')\
    .setStrategy("MATCH_ALL")\
    .setOutputCol("regex_matches")\
    .setExternalRules(path='./regex_rules.txt', delimiter=',')

In [59]:
pipeline= Pipeline(stages=[
                           document,
                           sentence,
                           regex_matcher
])

model_regex= pipeline.fit(test_df)
result_regex= model_regex.transform(test_df)

In [60]:
result_regex.columns

['text', 'document', 'sentence', 'regex_matches']

In [61]:
result_regex.select("sentence.result").show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                     |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[22077441	10538487904	Ok today I have to find something to wear for fri cuz I don't think I have time any other day this week., ., I'm thinking all black and pearls!, 2010-03-15 17:35:58]|
|[22077441	10536835844	I am glad I'm having this show but I can't wait till it is over so I can rest and stop worrying !!, 2010-03-15 16:53:44]                                             |
|[22077441	10536809086	Honestly I don't even know 

In [62]:
result_regex.select("regex_matches.result").show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                  |
+------------------------------------------------------------------------------------------------------------------------+
|[22077441, 10538487904, 2010-03-15, 17:35:58, 22077441, 10538487904, 2010-03-15, 17:35:58]                              |
|[22077441, 10536835844, 2010-03-15, 16:53:44, 22077441, 10536835844, 2010-03-15, 16:53:44]                              |
|[22077441, 10536809086, 2010-03-15, 16:52:59, 22077441, 10536809086, 2010-03-15, 16:52:59]                              |
|[22077441, 10534149786, 2010-03-15, 15:42:07, 22077441, 10534149786, 2010-03-15, 15:42:07]                              |
|[22077441, 10530203659, 2010-03-15, 13:55:22, 22077441, 10530203659, 2010-03-15, 13:55:22]                              |
|[22077441, 1052

In [63]:
regex_df= result_regex.select(F.explode(F.arrays_zip("sentence.result", "regex_matches.result")).alias("col"))\
.select(F.expr("col['0']").alias("sentence"),
        F.expr("col['1']").alias("result")).show()


+--------------------+-----------+
|            sentence|     result|
+--------------------+-----------+
|22077441	10538487...|   22077441|
|                   .|10538487904|
|I'm thinking all ...| 2010-03-15|
| 2010-03-15 17:35:58|   17:35:58|
|                null|   22077441|
|                null|10538487904|
|                null| 2010-03-15|
|                null|   17:35:58|
|22077441	10536835...|   22077441|
| 2010-03-15 16:53:44|10536835844|
|                null| 2010-03-15|
|                null|   16:53:44|
|                null|   22077441|
|                null|10536835844|
|                null| 2010-03-15|
|                null|   16:53:44|
|22077441	10536809...|   22077441|
|                null|10536809086|
|                null| 2010-03-15|
|                null|   16:52:59|
+--------------------+-----------+
only showing top 20 rows



### Creating Sentiment Analysis.
By using analyze_sentimentdl_use_twitter.

In [64]:
pipeline= PretrainedPipeline("analyze_sentimentdl_use_twitter", lang="en")

result_sent= pipeline.transform(test_df)

analyze_sentimentdl_use_twitter download started this may take some time.
Approx size to download 928.3 MB
[OK!]


In [65]:
result_sent.show()

+--------------------+--------------------+--------------------+--------------------+
|                text|            document| sentence_embeddings|           sentiment|
+--------------------+--------------------+--------------------+--------------------+
|22077441	10538487...|[[document, 0, 18...|[[sentence_embedd...|[[category, 0, 18...|
|22077441	10536835...|[[document, 0, 13...|[[sentence_embedd...|[[category, 0, 13...|
|22077441	10536809...|[[document, 0, 90...|[[sentence_embedd...|[[category, 0, 90...|
|22077441	10534149...|[[document, 0, 15...|[[sentence_embedd...|[[category, 0, 15...|
|22077441	10530203...|[[document, 0, 16...|[[sentence_embedd...|[[category, 0, 16...|
|22077441	10525388...|[[document, 0, 10...|[[sentence_embedd...|[[category, 0, 10...|
|22077441	10524902...|[[document, 0, 16...|[[sentence_embedd...|[[category, 0, 16...|
|22077441	10524699...|[[document, 0, 87...|[[sentence_embedd...|[[category, 0, 87...|
|22077441	10524509...|[[document, 0, 68...|[[sentence_

In [66]:
result_sent.select(F.explode(F.arrays_zip("sentence_embeddings.result", "sentiment.result")).alias("col"))\
.select(F.expr("col['0']").alias("sentence"),
        F.expr("col['1']").alias("sentiment")).show()

+--------------------+---------+
|            sentence|sentiment|
+--------------------+---------+
|22077441	10538487...| negative|
|22077441	10536835...| negative|
|22077441	10536809...| negative|
|22077441	10534149...|  neutral|
|22077441	10530203...| negative|
|22077441	10525388...| positive|
|22077441	10524902...| negative|
|22077441	10524699...| negative|
|22077441	10524509...|  neutral|
|22077441	10524432...| negative|
|22077441	10524352...| positive|
|22077441	10523840...| positive|
|22077441	10500366...| negative|
|22077441	10499584...| positive|
|22077441	10499334...| positive|
|22077441	10496093...| positive|
|22077441	10495553...| positive|
|22077441	10494759...| positive|
|22077441	10494254...| positive|
|22077441	10494154...| positive|
+--------------------+---------+
only showing top 20 rows

