In [None]:
# Location of the CSV file loaded by `spark.read.csv` below; the script reads
# its "tweet" and "label" columns. Machine-specific absolute path — adjust
# when running elsewhere.
path = r"/home/mb/college/bda/BDA/datasets/twitter.csv"

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import when, col

# Train and evaluate a TF-IDF + logistic-regression classifier on the CSV at
# `path`: read the data, drop unusable rows, fit the pipeline on an 80/20
# split, then print the test-set predictions and the accuracy.
spark = SparkSession.builder.appName("Twitter Sentiment Analysis").getOrCreate()

# try/finally guarantees the Spark session is released even when reading,
# fitting or evaluating raises.
try:
    df = spark.read.csv(path, inferSchema=True, header=True)

    # Rows lacking either the text or the target cannot be tokenized/trained on.
    df = df.filter(col("tweet").isNotNull() & col("label").isNotNull())

    # RegexTokenizer splits on *runs* of whitespace and discards zero-length
    # tokens. The plain Tokenizer splits on every single whitespace character,
    # so leading/repeated spaces yield empty-string tokens that get hashed as
    # a feature. Lower-casing behaviour is the same for both.
    tokenizer = RegexTokenizer(inputCol="tweet", outputCol="words", pattern=r"\s+")
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    hashingTF = HashingTF(
        inputCol="filtered_words", outputCol="term_freq", numFeatures=5000
    )
    idf = IDF(inputCol="term_freq", outputCol="idf")
    lr = LogisticRegression(featuresCol="idf", labelCol="label")

    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

    # Fixed seed keeps the train/test split reproducible between runs.
    train, test = df.randomSplit([0.8, 0.2], seed=42)

    model = pipeline.fit(train)
    predictions = model.transform(test)
    predictions.show()

    # NOTE(review): accuracy can look inflated if the label distribution is
    # skewed — confirm class balance, and consider metricName="f1" as well.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy"
    )
    accuracy = evaluator.evaluate(predictions)
    print(f"Model Accuracy: {accuracy:.4f}")
finally:
    spark.stop()

25/04/21 23:12:06 WARN Utils: Your hostname, manthan resolves to a loopback address: 127.0.1.1; using 192.168.1.9 instead (on interface wlp0s20f3)
25/04/21 23:12:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/21 23:12:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/21 23:12:13 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


+---+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| id|label|               tweet|               words|      filtered_words|           term_freq|                 idf|       rawPrediction|         probability|prediction|
+---+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  3|    0|  bihday your maj...|[, , bihday, your...|[, , bihday, maje...|(5000,[2449,3372,...|(5000,[2449,3372,...|[179.740073872864...|           [1.0,0.0]|       0.0|
|  7|    0| @user camping to...|[, @user, camping...|[, @user, camping...|(5000,[1375,1883,...|(5000,[1375,1883,...|[104.927817335110...|           [1.0,0.0]|       0.0|
|  9|    0|we won!!! love th...|[we, won!!!, love...|[won!!!, love, la...|(5000,[254,374,82...|(5000,[254,374,82...|[155.529059254670...|           [1