In [None]:
# Install PySpark 3.2.3 and Spark NLP 5.0.2 with the John Snow Labs Colab setup script.
# The script is fetched straight from its HTTPS location instead of the plain-HTTP short link
# (which only redirects there), so the code piped into bash cannot be altered in transit.
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.2
# Download and unpack a standalone Spark 3.1.1 / Hadoop 3.2 distribution (HTTPS mirror of the same archive).
# NOTE(review): this tarball is Spark 3.1.1 while the setup script above installs PySpark 3.2.3 - confirm the mismatch is intended.
!wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

--2023-08-19 10:25:16--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2023-08-19 10:25:16--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1191 (1.2K) [text/plain]
Saving to: ‘STDOUT’


2023-08-19 10:25:16 (88.3 MB/s) - written to stdout [1191/1191]

Installing PySpark 3.2.3 and Spark NLP 5.0.2
setup Colab for PySpark 3.2.3 and Spark NLP 5

In [None]:
# Headless Java 8 JDK for Spark; apt output is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Pin both Python packages so a fresh runtime reproduces the same environment.
# An unpinned `pip install pyspark` resolves to whatever release is newest, which may not
# match the PySpark 3.2.3 / Spark NLP 5.0.2 pair this notebook is set up for.
!pip install spark-nlp==5.0.2
!pip install pyspark==3.2.3



In [None]:
import os

# Tell Spark where the JDK and the unpacked Spark distribution live.
# NOTE(review): SPARK_HOME targets the Spark 3.1.1 tarball while PySpark 3.2.3 is pip-installed - confirm these are meant to differ.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [None]:
# Start the Spark session through Spark NLP (John Snow Labs), the library that
# provides the sentiment-analysis annotators used in this notebook.
import sparknlp

spark = sparknlp.start()

In [None]:
# Load the raw tweet export, using the first row as the header and inferring column types.
# Tweet text commonly contains line breaks and quote characters inside the quoted field.
# With the default single-line parser such records are split mid-field, which can shift
# values from other columns into `text`.
#   multiLine=True - keep quoted newlines inside one record
#   escape='"'     - treat a doubled quote ("") as an escaped quote
# NOTE(review): assumes the file uses standard double-quote CSV quoting - confirm against the raw file.
data = spark.read.csv(
    'tokyo_2020_tweets.csv',
    header=True,
    inferSchema=True,
    multiLine=True,
    quote='"',
    escape='"',
)

In [None]:
# Keep only the tweet body and drop rows where it is null, then preview the result.
df = data.select('text').dropna()
df.show()

+--------------------+
|                text|
+--------------------+
| Let the party begin|
|Congratulations #...|
|   Big Breaking Now |
|     Q4: 🇬🇧3-1🇿🇦|
|All I can think o...|
|#Tokyo2020 #Olympics|
|Can't help but ch...|
|@inquirerdotnet @...|
|    Q3 🇨🇦 1-4 🇩🇪|
|Hearty Congratula...|
|                 0.0|
|Gymnastics ❤️ #To...|
|Morning everyone!...|
| #Tokyo2020 #Tennis |
|Up next for Carlo...|
|Congrates @miraba...|
|The wait for a we...|
|#Tokyo2020   #Mir...|
|#Tokyo2020 #Olymp...|
|Well done to #Tea...|
+--------------------+
only showing top 20 rows



In [None]:
# Clean the corpus: remove every character that is not an ASCII letter or whitespace,
# then lower-case what remains. The result is exposed as the `cleaned_text` column.
from pyspark.sql.functions import udf, col, lower, regexp_replace

df_clean = df.select(
    lower(regexp_replace('text', "[^a-zA-Z\\s]", "")).alias('cleaned_text')
)
# Preview the cleaned frame rather than the raw `df`, so the effect of the regex is visible.
df_clean.show()

+--------------------+
|                text|
+--------------------+
| Let the party begin|
|Congratulations #...|
|   Big Breaking Now |
|     Q4: 🇬🇧3-1🇿🇦|
|All I can think o...|
|#Tokyo2020 #Olympics|
|Can't help but ch...|
|@inquirerdotnet @...|
|    Q3 🇨🇦 1-4 🇩🇪|
|Hearty Congratula...|
|                 0.0|
|Gymnastics ❤️ #To...|
|Morning everyone!...|
| #Tokyo2020 #Tennis |
|Up next for Carlo...|
|Congrates @miraba...|
|The wait for a we...|
|#Tokyo2020   #Mir...|
|#Tokyo2020 #Olymp...|
|Well done to #Tea...|
+--------------------+
only showing top 20 rows



In [None]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline, Finisher
from sparknlp.annotator import (SentenceDetector,Tokenizer,Lemmatizer,SentimentDetector)
import pyspark.sql.functions as F
from sparknlp.common import *

In [None]:
# Fetch the lemma and sentiment dictionaries into /tmp.
# -N (timestamping) skips the transfer when the local copy is already up to date.
!wget -N -P /tmp https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/lemma-corpus-small/lemmas_small.txt
!wget -N -P /tmp https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/default-sentiment-dict.txt

--2023-08-19 10:30:21--  https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/lemma-corpus-small/lemmas_small.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.232.24, 52.216.53.216, 54.231.197.240, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.232.24|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 189437 (185K) [text/plain]
Saving to: ‘/tmp/lemmas_small.txt’


2023-08-19 10:30:22 (2.12 MB/s) - ‘/tmp/lemmas_small.txt’ saved [189437/189437]

--2023-08-19 10:30:22--  https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/default-sentiment-dict.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.85.110, 52.216.43.32, 52.217.225.120, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.85.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 289 [text/plain]
Saving to: ‘/tmp/default-sentiment-dict.txt’


2023-08-19 10:30:22 (5.42 MB/s) - ‘/tmp/default-

In [None]:
# Pipeline entry point: turn the "cleaned_text" column into document annotations
# in "assembled_text" for the downstream annotators.
assembler = (
    DocumentAssembler()
    .setInputCol("cleaned_text")
    .setOutputCol("assembled_text")
)

In [None]:
# Sentence detection stage: reads "assembled_text", writes "detected_sent".
sentence_detect = (
    SentenceDetector()
    .setInputCols(["assembled_text"])
    .setOutputCol("detected_sent")
)

In [None]:
# Tokenization stage: reads "detected_sent", writes "tokenized".
tokenizer = (
    Tokenizer()
    .setInputCols(["detected_sent"])
    .setOutputCol("tokenized")
)

In [None]:
# Lemmatization stage: reads "tokenized", writes "lemmatized_text".
# Uses the dictionary at /tmp/lemmas_small.txt with "->" as the key delimiter
# and a tab as the value delimiter.
lemma = (
    Lemmatizer()
    .setInputCols("tokenized")
    .setOutputCol("lemmatized_text")
    .setDictionary(
        "/tmp/lemmas_small.txt",
        key_delimiter="->",
        value_delimiter="\t",
    )
)

In [None]:
# Sentiment detection stage: consumes the lemmatized tokens together with the detected
# sentences and writes to "result", using the comma-delimited dictionary in /tmp.
sentiment_detector = (
    SentimentDetector()
    .setInputCols(["lemmatized_text", "detected_sent"])
    .setOutputCol("result")
    .setDictionary("/tmp/default-sentiment-dict.txt", ",")
)

In [None]:
# Finisher stage: reads the "result" annotations and exposes them as the "sentiments" column.
finit = (
    Finisher()
    .setInputCols(["result"])
    .setOutputCols("sentiments")
)

In [None]:
# Assemble the stages in the order they must run: each stage reads the column
# written by an earlier one.
pipes = Pipeline(
    stages=[
        assembler,
        sentence_detect,
        tokenizer,
        lemma,
        sentiment_detector,
        finit,
    ]
)

In [None]:
# Fit the pipeline on the cleaned corpus, apply it to the same corpus, and pull the
# resulting rows back to the driver as a Python list.
# NOTE(review): collect() materializes every row in driver memory - confirm the dataset is small enough.
predictor = (
    pipes
    .fit(df_clean)
    .transform(df_clean)
    .collect()
)

In [None]:
from pyspark.sql.types import StringType

# Rebuild a DataFrame from the collected prediction rows and keep a handle to it.
# show() only prints and returns None, so it must not be chained into the assignment,
# otherwise `stuvvs` ends up as None instead of the DataFrame.
stuvvs = spark.createDataFrame(predictor)
stuvvs.show(truncate=200)

+-------------------------------------------------------------------------------------------------------------------------+----------+
|                                                                                                             cleaned_text|sentiments|
+-------------------------------------------------------------------------------------------------------------------------+----------+
|                                                                                                      let the party begin|[positive]|
|                                                                                   congratulations tokyo httpstcoofkmsukq|[positive]|
|                                                                                                        big breaking now |[positive]|
|                                                                                                                       q |[positive]|
|  all i can think of every time i watch the rings even