In [1]:
!pip3 install pyspark wordcloud matplotlib numpy textblob

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os
# Maven coordinates of the Kafka connectors that Spark must fetch; they are
# handed to spark-submit through the PYSPARK_SUBMIT_ARGS environment variable.
kafka_packages = ",".join([
    "org.apache.spark:spark-streaming-kafka-0-10_2.12:3.1.1",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1",
])
os.environ["PYSPARK_SUBMIT_ARGS"] = f"--packages {kafka_packages} pyspark-shell"

In [3]:
import pyspark
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover


In [4]:
# Create (or reuse) the Spark session connected to the standalone cluster.
spark = SparkSession \
        .builder \
        .master("spark://192.168.56.110:7077") \
        .config("spark.sql.streaming.schemaInference", True) \
        .appName("Sentiment Analysis Spark") \
        .getOrCreate()

# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

# Always show the results of DataFrames and improve the formatted output.
# The key is "eagerEval"; the previous spelling ("eagerVal") is not a Spark
# setting, so eager evaluation was never actually enabled.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/denni/.ivy2/cache
The jars for the packages stored in: /home/denni/.ivy2/jars
org.apache.spark#spark-streaming-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-65d38db4-60b4-4784-9027-c9aaaa65b1df;1.0
	confs: [default]
	found org.apache.spark#spark-streaming-kafka-0-10_2.12;3.1.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.1.1 in central
	found org.apache.kafka#kafka-clients;2.6.0 in central
	found com.github.luben#zstd-jni;1.4.8-1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.2 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.1.1 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 1168ms :: artifact

In [5]:
# Streaming DataFrame over the "twitter-data" Kafka topic, read from the
# earliest available offset and including the record headers.
tweets = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "192.168.56.110:9092",
        "subscribe": "twitter-data",
        "startingOffsets": "earliest",
        "includeHeaders": "true",
    })
    .load()
)

# Show the columns exposed by the Kafka source.
tweets.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)
 |-- headers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: binary (nullable = true)



In [6]:
from textblob import TextBlob
from deep_translator import GoogleTranslator

def preprocessing(tweets):
    """Extract and clean the tweet text carried in Kafka records.

    Args:
        tweets: DataFrame with a ``value`` column holding the raw message.

    Returns:
        DataFrame with a single string column ``text`` in which empty/null
        messages have been dropped and URLs, ``#``, the standalone retweet
        marker ``RT`` and ``:`` have been removed.
    """
    # The Kafka source delivers `value` as binary. Cast it explicitly so the
    # string-based `na.replace` below really sees (and nulls out) empty
    # messages instead of skipping the column for not being a string.
    df = tweets.select(col("value").cast("string").alias("text"))
    df = df.na.replace('', None)
    df = df.na.drop()

    # `\bRT\b` only matches the retweet marker as a whole word; a bare 'RT'
    # would also delete those letters from inside ordinary words.
    for pattern in (r'http\S+', '#', r'\bRT\b', ':'):
        df = df.withColumn('text', regexp_replace('text', pattern, ''))

    return df
def sentiment_analysis(text):
    """Return 1 when TextBlob scores ``text`` with positive polarity, else 0.

    The text is scored exactly as received; no translation is applied.
    """
    polarity = TextBlob(text).sentiment.polarity
    return 1 if polarity > 0 else 0
                             
def text_classification(df):
    """Append a sentiment ``label`` column derived from the ``text`` column.

    Args:
        df: DataFrame with a string ``text`` column.

    Returns:
        The input DataFrame plus an integer ``label`` column (1 = positive
        polarity, 0 = otherwise) produced by ``sentiment_analysis``.
    """
    # sentiment_analysis returns Python ints, so the UDF is declared with
    # IntegerType; declaring StringType forced callers to cast the label back.
    sentiment_analysis_udf = udf(sentiment_analysis, IntegerType())
    return df.withColumn("label", sentiment_analysis_udf("text"))

In [7]:
def forEachBatch(df, df_id):
    """Train and evaluate a sentiment classifier on one streaming micro-batch.

    The batch is cleaned and labelled with ``preprocessing`` and
    ``text_classification``, split 70/30 into training and testing sets,
    converted to hashed term-frequency features, and used to fit and score a
    logistic regression model. Accuracy and the share of positive/negative
    predictions are printed.

    Args:
        df: Micro-batch DataFrame passed in by ``foreachBatch``.
        df_id: Identifier of the micro-batch; only used in the skip message.

    Returns:
        None. All results are reported with ``print``.
    """
    data = text_classification(preprocessing(df)).withColumn(
        "label", col("label").cast(IntegerType())
    )

    # Each count()/fit() below is a separate Spark action. Persisting the
    # labelled batch keeps the Python sentiment UDF from being re-evaluated
    # for every one of them.
    data.persist()
    try:
        training_data, testing_data = [
            part.na.drop() for part in data.randomSplit([0.7, 0.3])
        ]

        train_rows = training_data.count()
        test_rows = testing_data.count()

        # Small batches can leave one side of the split empty: fitting on no
        # rows is meaningless and the percentages below would divide by zero.
        if train_rows == 0 or test_rows == 0:
            print("Skipping batch", df_id, "- train rows:", train_rows,
                  "test rows:", test_rows)
            return

        # Feature pipeline: tokenize -> remove stop words -> hashed term counts.
        tokenizer = Tokenizer(inputCol="text", outputCol="SentimentWords")
        swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                               outputCol="MeaningfulWords")
        hash_tf = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")

        def featurize(rows):
            # Apply the same three stages to either split.
            return hash_tf.transform(
                swr.transform(tokenizer.transform(rows))
            ).select("label", "MeaningfulWords", "features")

        lr = LogisticRegression(labelCol="label", featuresCol="features",
                                maxIter=10, regParam=0.01)
        model = lr.fit(featurize(training_data))
        print("Training is done!")

        prediction_final = model.transform(featurize(testing_data)).select(
            "MeaningfulWords", "prediction", "label"
        )

        # Four counts are taken from the predictions; cache them so the model
        # is applied only once.
        prediction_final.persist()
        try:
            approve_prediction = prediction_final.filter(
                prediction_final['prediction'] == 1
            ).count()

            reprove_prediction = prediction_final.filter(
                prediction_final['prediction'] == 0
            ).count()

            correct_prediction = prediction_final.filter(
                prediction_final['prediction'] == prediction_final['label']
            ).count()
            total_data = prediction_final.count()
        finally:
            prediction_final.unpersist()

        print("correct prediction:", correct_prediction, "total data:", total_data, 
             ", accuracy:", correct_prediction/total_data * 100, "%")

        print("--------------------------------------------------------------")
        print("LULA:")
        print("Aprovacao: ", format(approve_prediction/total_data * 100, ".2f"), "%")
        print("Reprovacao: ", format(reprove_prediction/total_data * 100, ".2f"), "%")
    finally:
        # Release the cached batch whether or not training succeeded.
        data.unpersist()

In [8]:
# Start the streaming query and keep the cell blocked until it is interrupted.
# `start()` returns immediately, so without `awaitTermination()` the
# KeyboardInterrupt handler could never fire and the query was never stopped.
query_stream_memory = None
try:
    query_stream_memory = tweets \
        .writeStream \
        .foreachBatch(forEachBatch) \
        .trigger(processingTime="10 seconds") \
        .outputMode("append") \
        .start()

    query_stream_memory.awaitTermination()

except KeyboardInterrupt:
    print("Finalizando streaming...")
    # Stop the background query so it does not keep consuming from Kafka.
    if query_stream_memory is not None:
        query_stream_memory.stop()

22/09/22 20:42:13 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-4365cc5d-7d2a-4b54-ba75-e692bfab51d3. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/09/22 20:42:33 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/09/22 20:42:33 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

Training is done!


                                                                                

correct prediction: 39 total data: 59 , accuracy: 66.10169491525424 %
--------------------------------------------------------------
LULA:
Aprovacao:  18.64 %
Reprovacao:  81.36 %


22/09/22 20:42:50 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 36804 milliseconds
                                                                                

Training is done!


                                                                                

correct prediction: 5 total data: 12 , accuracy: 41.66666666666667 %
--------------------------------------------------------------
LULA:
Aprovacao:  91.67 %
Reprovacao:  8.33 %


22/09/22 20:43:02 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 11920 milliseconds
                                                                                

Training is done!


22/09/22 20:43:14 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 11697 milliseconds


correct prediction: 2 total data: 6 , accuracy: 33.33333333333333 %
--------------------------------------------------------------
LULA:
Aprovacao:  0.00 %
Reprovacao:  100.00 %


[Stage 76:>                                                         (0 + 1) / 1]                                                                                

Training is done!


[Stage 88:>                                                         (0 + 1) / 1]                                                                                

correct prediction: 4 total data: 7 , accuracy: 57.14285714285714 %
--------------------------------------------------------------
LULA:
Aprovacao:  0.00 %
Reprovacao:  100.00 %


                                                                                

Training is done!


                                                                                

correct prediction: 4 total data: 4 , accuracy: 100.0 %
--------------------------------------------------------------
LULA:
Aprovacao:  25.00 %
Reprovacao:  75.00 %


22/09/22 20:43:35 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 11181 milliseconds
                                                                                

Training is done!


[Stage 138:>                                                        (0 + 1) / 1]                                                                                

correct prediction: 6 total data: 9 , accuracy: 66.66666666666666 %
--------------------------------------------------------------
LULA:
Aprovacao:  33.33 %
Reprovacao:  66.67 %


[Stage 149:>                                                        (0 + 1) / 1]                                                                                

Training is done!


[Stage 160:>                                                        (0 + 1) / 1]                                                                                

correct prediction: 0 total data: 5 , accuracy: 0.0 %
--------------------------------------------------------------
LULA:
Aprovacao:  0.00 %
Reprovacao:  100.00 %


                                                                                

Training is done!
correct prediction: 3 total data: 4 , accuracy: 75.0 %
--------------------------------------------------------------
LULA:
Aprovacao:  0.00 %
Reprovacao:  100.00 %
