In [1]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, when, rand
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Start Spark session (getOrCreate hands back an existing session if one is
# already active, otherwise builds a new one under this application name).
spark = (
    SparkSession.builder
    .appName("FixedSentimentAnalysis")
    .getOrCreate()
)
# Wall-clock reference for the timing report printed later in the script;
# taken after the session is obtained, so session start-up is not counted.
start_time = time.time()

# Load dataset: read the CSV using its header row and inferred column types,
# then keep only the two columns the rest of the script works with.
df = (
    spark.read
    .csv("twitter.csv", header=True, inferSchema=True)
    .select("tweet", "label")
)

# Clean tweets
# Rows with a null tweet are dropped up front: the SQL string functions below
# pass nulls through unchanged, and a null reaching the tokenizer stage of the
# ML pipeline would abort fitting.
df_clean = df.na.drop(subset=["tweet"])
# Lower-case before pattern matching so the case-sensitive URL pattern also
# catches upper-case variants such as "HTTP://..." or "WWW...".
df_clean = df_clean.withColumn("tweet", lower(col("tweet")))
# Applied in this order: URLs, @mentions, '#' signs (the tag word itself is
# kept), remaining punctuation, digits. Each match is replaced by "".
for pattern in (r"http\S+|www\S+", r"@\w+", r"#", r"[^\w\s]", r"\d+"):
    df_clean = df_clean.withColumn("tweet", regexp_replace("tweet", pattern, ""))

# Handle class imbalance by down-sampling the larger class to roughly the
# size of the smaller one, then shuffling the combined rows.
positive = df_clean.filter(col("label") == 1)
negative = df_clean.filter(col("label") == 0)

# Each count() is a Spark action; run them once and reuse the numbers.
pos_count = positive.count()
neg_count = negative.count()
if pos_count == 0 or neg_count == 0:
    # Without this check an empty label-0 class surfaces as a bare
    # ZeroDivisionError, and an empty label-1 class as an empty training set.
    raise ValueError(
        "Both classes must be present to build a balanced training set "
        f"(label=1 rows: {pos_count}, label=0 rows: {neg_count})"
    )

# sample() without replacement only accepts a fraction in [0, 1], so always
# divide the smaller count by the larger and sample the larger class.
if pos_count <= neg_count:
    pos_sample = positive
    neg_sample = negative.sample(False, pos_count / neg_count, seed=42)
else:
    pos_sample = positive.sample(False, neg_count / pos_count, seed=42)
    neg_sample = negative

# The shuffle is seeded as well, so the sampling seed above is not the only
# reproducible step in building the training set.
df_train = neg_sample.union(pos_sample).orderBy(rand(seed=42))

# Test sentences (one-element tuples so each becomes a single-column row)
test_sentences = [
    ("I hate this product! It's the worst!",),
    ("This is amazing! I love it.",),
    ("Not bad, but could be better.",),
    ("Absolutely terrible experience.",),
    ("Had a fantastic time using this app.",),
]
df_test = spark.createDataFrame(test_sentences, ["tweet"])

# Clean test tweets with the same steps, in the same order, as the training
# tweets: lower-case FIRST, then strip URLs, @mentions, '#', punctuation and
# digits. The URL pattern is case-sensitive, so running it before
# lower-casing would leave "HTTP://..." / "WWW..." in test text only.
df_test = df_test.withColumn("tweet", lower(col("tweet")))
for pattern in [r"http\S+|www\S+", r"@\w+", r"#", r"[^\w\s]", r"\d+"]:
    df_test = df_test.withColumn("tweet", regexp_replace("tweet", pattern, ""))

# ML Pipeline
# Stage 1: split the cleaned tweet text on non-word characters.
tokenizer = RegexTokenizer(
    inputCol="tweet",
    outputCol="words",
    pattern="\\W",
)
# Stage 2: drop stop words from the token list.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)
# Stage 3: term counts over a vocabulary capped at 10000 entries.
vectorizer = CountVectorizer(
    inputCol="filtered_words",
    outputCol="raw_features",
    vocabSize=10000,
)
# Stage 4: re-weight the raw counts with inverse document frequency.
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)
# Stage 5: classifier trained directly on the dataset's own "label" column.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)

stages = [tokenizer, remover, vectorizer, idf, lr]
pipeline = Pipeline(stages=stages)

# Train model: fits every stage, in order, on the balanced training set.
model = pipeline.fit(df_train)

# Predict: run the fitted pipeline over the cleaned test sentences.
predictions = model.transform(df_test)

# Map prediction: class 1.0 is reported as "Negative", anything else as
# "Positive".
sentiment = when(col("prediction") == 1.0, "Negative").otherwise("Positive")
predictions = predictions.withColumn("sentiment", sentiment)

# NOTE(review): the elapsed time is printed before show(); since show() is
# the action that actually evaluates the predictions, the reported figure
# presumably excludes the prediction job itself -- confirm that is intended.
print(f"\nExecution Time: {time.time() - start_time:.2f} seconds")
report = predictions.select("tweet", "prediction", "sentiment")
report.show(truncate=False)

# Stop Spark
spark.stop()



Execution Time: 48.39 seconds
+-----------------------------------+----------+---------+
|tweet                              |prediction|sentiment|
+-----------------------------------+----------+---------+
|i hate this product its the worst  |1.0       |Negative |
|this is amazing i love it          |0.0       |Positive |
|not bad but could be better        |0.0       |Positive |
|absolutely terrible experience     |1.0       |Negative |
|had a fantastic time using this app|0.0       |Positive |
+-----------------------------------+----------+---------+

