In [None]:
!pip install -q  pyspark==3.4.1 spark-nlp==5.4.0

In [None]:
import sparknlp
# Start a Spark session through Spark NLP's helper; every later cell uses this handle.
spark = sparknlp.start()
# Bare last expression so the notebook renders the session object as the cell output.
spark

In [None]:
# Load the raw dataset: the first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.options(header="true", inferSchema="true").csv("/content/dataset.csv")

In [None]:
# Preview the first five rows to sanity-check the load.
df.show(n=5)

+-------+--------------------+----------------+-----------------+---------+--------+-------------+----------+-----------+-----------+----------------+--------------------+--------------------+------------------+----------+--------+-------------------+---------------+--------------------+--------------------+-------------+---------------+------------------+
|     id|                text|      created_at|         username|  user_id|language|retweet_count|like_count|reply_count|quote_count|impression_count|            hashtags|            mentions|            source|is_retweet|is_reply|in_reply_to_user_id|conversation_id|user_followers_count|user_following_count|user_verified|  user_location|possibly_sensitive|
+-------+--------------------+----------------+-----------------+---------+--------+-------------+----------+-----------+-----------+----------------+--------------------+--------------------+------------------+----------+--------+-------------------+---------------+---------------

In [None]:
from pyspark.sql.functions import lower, regexp_replace, col, trim, length

# Build the cleaned text as a single column expression: lowercase first, then
# strip URLs, @mentions and the '#' symbol (the hashtag word itself is kept).
lowered = lower(col("text"))
no_urls = regexp_replace(lowered, "http\\S+", "")
no_mentions = regexp_replace(no_urls, "@\\w+", "")
cleaned = regexp_replace(no_mentions, "#", "")

# Keep only rows whose cleaned text is non-null and not blank once trimmed.
has_text = col("clean_text").isNotNull() & (length(trim(col("clean_text"))) > 0)

df_clean = df.withColumn("clean_text", cleaned).where(has_text)

df_clean.select("clean_text").show(n=5)

+--------------------+
|          clean_text|
+--------------------+
|agent every devel...|
|night respond red...|
|here grow gas eno...|
|product significa...|
|environment decis...|
+--------------------+
only showing top 5 rows



In [None]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import (
    Tokenizer,
    SentenceDetector,
    UniversalSentenceEncoder,
    SentimentDLModel
)
from pyspark.ml import Pipeline
from pyspark.sql.functions import col


In [None]:
# Wrap each cleaned tweet in a Spark NLP document annotation.
document = DocumentAssembler() \
    .setInputCol("clean_text") \
    .setOutputCol("document")

# Embed the WHOLE tweet rather than each sentence. With a SentenceDetector in
# front, the classifier emits one label per sentence, and code that reads only
# the first element of `sentiment.result` would label a multi-sentence tweet by
# its first sentence alone. Feeding the document directly yields exactly one
# embedding, and therefore one sentiment label, per tweet.
embeddings = UniversalSentenceEncoder.pretrained(
    "tfhub_use",
    lang="en"
).setInputCols(["document"]) \
 .setOutputCol("sentence_embeddings")

# Pretrained Twitter sentiment classifier that consumes the USE embeddings.
sentiment = SentimentDLModel.pretrained(
    "sentimentdl_use_twitter",
    lang="en"
).setInputCols(["sentence_embeddings"]) \
 .setOutputCol("sentiment")

# Stage order matters: each stage reads the output column of the one before it.
pipeline = Pipeline(stages=[
    document,
    embeddings,
    sentiment
])


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[OK!]


In [None]:
# Fit the pipeline on the cleaned tweets to obtain a fitted pipeline model.
model = pipeline.fit(df_clean)
# Apply the fitted pipeline to the same data; `result` adds each stage's output column.
result = model.transform(df_clean)


In [None]:
# Pull the first entry of the `sentiment.result` array out into its own column.
first_label = col("sentiment.result").getItem(0)
final_df = result.withColumn("sentiment_label", first_label)

# Preview the cleaned text next to its predicted label.
final_df.select("clean_text", "sentiment_label").show(n=10)


+--------------------+---------------+
|          clean_text|sentiment_label|
+--------------------+---------------+
|agent every devel...|       positive|
|night respond red...|       positive|
|here grow gas eno...|        neutral|
|product significa...|       positive|
|environment decis...|       positive|
|edge network wall...|       positive|
|patch for credent...|       negative|
|campaign little n...|       positive|
|according remain ...|       positive|
|backup systems en...|       negative|
+--------------------+---------------+
only showing top 10 rows



In [None]:
# `final_df` (including its `sentiment_label` column) is already built by the
# preceding cell; this cell used to re-derive it with identical copy-pasted code.
# Only the preview is kept so the name has a single definition in the notebook.
final_df.select("clean_text", "sentiment_label").show(10)


+--------------------+---------------+
|          clean_text|sentiment_label|
+--------------------+---------------+
|agent every devel...|       positive|
|night respond red...|       positive|
|here grow gas eno...|        neutral|
|product significa...|       positive|
|environment decis...|       positive|
|edge network wall...|       positive|
|patch for credent...|       negative|
|campaign little n...|       positive|
|according remain ...|       positive|
|backup systems en...|       negative|
+--------------------+---------------+
only showing top 10 rows



In [None]:
# Persist text + label as a directory of CSV part files (one per partition),
# replacing any previous run's output.
# Spark escapes embedded quotes with a backslash by default, which standard
# (RFC 4180) CSV readers such as pandas mis-parse. Setting `escape` to the quote
# character makes Spark write doubled quotes ("") instead, so tweets containing
# quotes and commas survive a round trip through other tools.
final_df.select("clean_text", "sentiment_label") \
    .write \
    .mode("overwrite") \
    .option("header", True) \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .csv("/content/sentiment_outputs")


In [None]:
import glob

import pandas as pd

# Collect the part files Spark wrote. glob returns them in arbitrary filesystem
# order, so sort to make the merged CSV deterministic across runs.
files = sorted(glob.glob("/content/sentiment_outputs/part-*.csv"))
if not files:
    # pd.concat on an empty list fails with an opaque "No objects to concatenate".
    raise FileNotFoundError(
        "No part-*.csv files found in /content/sentiment_outputs"
    )

# Use a dedicated name instead of rebinding `df`: `df` is the Spark DataFrame
# loaded at the top of the notebook, and overwriting it with a pandas frame
# breaks any earlier cell that is re-run afterwards.
# ignore_index=True gives one continuous 0..n-1 index instead of each part
# file restarting at 0.
sentiment_pdf = pd.concat(
    (pd.read_csv(path) for path in files),
    ignore_index=True,
)

# Save as a single CSV.
sentiment_pdf.to_csv("/content/sentiment_gold.csv", index=False)

sentiment_pdf.head()


Unnamed: 0,clean_text,sentiment_label
0,patched the vulnerability quickly. team crime ...,negative
1,popular build civil exist close appear within ...,positive
2,television accept them event senior culture ta...,positive
3,pass tend political raise who usually deal. da...,positive
4,during run believe democratic forget american ...,neutral
