In [None]:
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook (or reuse one that already exists).
spark = SparkSession.builder.appName("sentimentanalysis").getOrCreate()

# Read the review CSV: the first row is the header and column types are inferred.
# Rows that contain a null in any column are dropped immediately.
# The frame is bound to `df` because that is the name displayed below and the
# name the train/test split cell reads; binding it to `data` left `df`
# undefined on a fresh kernel.
df = spark.read.csv("/content/Amazon_product_review.csv", header=True, inferSchema=True).na.drop()

# Preview the first 10 rows.
df.show(10)

+--------------------+---------+
|        short_review|Sentiment|
+--------------------+---------+
|       Does not work| negative|
|This is a great w...| positive|
|It works great so...| positive|
|This product was ...| positive|
|it works but it h...| positive|
|Excellent product...| positive|
|This product work...| positive|
|The unit sort of ...| negative|
|I fly in the far ...| positive|
|good sound, looks...| positive|
+--------------------+---------+
only showing top 10 rows



In [None]:
# Build the train and test sets with an 80/20 random split.
# A fixed seed keeps the split (and therefore the counts printed below and any
# metric computed on the test set) repeatable for the same input data.
SPLIT_SEED = 1234
data = df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
train_set, test_set = data

# count() triggers a Spark job for each split.
train_count = train_set.count()
test_count = test_set.count()
print("Train Count:", train_count)
print("Test Count:", test_count)

Train Count: 16034
Test Count: 3966


In [None]:
# Keep only rows that have both a review text and a sentiment label.
# how="any" drops a row as soon as either listed column is null, which is the
# same filter as dropping on each column in turn.
train_set = train_set.na.drop(how="any", subset=["short_review", "Sentiment"])
test_set = test_set.na.drop(how="any", subset=["short_review", "Sentiment"])


In [None]:
# Model imports. Note: the classifier imported here is a multilayer perceptron (MLP), not an LSTM.
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.linalg import Vectors


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, Word2Vec, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Sentiment classifier pipeline:
#   review text -> tokens -> Word2Vec document vector -> indexed label -> MLP.
# NOTE: despite the "lstm_like" variable names, the classifier is a feed-forward
# multilayer perceptron; nothing in this pipeline is recurrent.
#
# Every stage is declared unfitted and trained exactly once by Pipeline.fit on
# train_set. Fitting Word2Vec by hand first and then again inside the pipeline
# trained the embeddings twice without using the first result.

EMBEDDING_DIM = 100  # Word2Vec vector size; must equal the MLP input layer width
MODEL_SEED = 1234    # shared seed so embeddings and MLP initialisation are repeatable

# Split each review on non-word characters.
tokenizer = RegexTokenizer(inputCol="short_review", outputCol="words", pattern="\\W")

# Word2Vec turns each token list into a single vector (Spark averages the word
# vectors of the document). minCount=0 keeps every token in the vocabulary.
word2Vec = Word2Vec(
    vectorSize=EMBEDDING_DIM,
    minCount=0,
    seed=MODEL_SEED,
    inputCol="words",
    outputCol="features",
)

# Map the Sentiment strings to numeric labels. handleInvalid="keep" sends labels
# that were not seen during fitting to an extra index instead of raising.
indexer = StringIndexer(inputCol="Sentiment", outputCol="label", handleInvalid="keep")

# Layer sizes: input (embedding width), two hidden layers, output.
# NOTE(review): an output width of 3 presumably covers two sentiment labels plus
# the extra "keep" index — confirm against the distinct values of Sentiment.
layers = [EMBEDDING_DIM, 64, 32, 3]
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=1000, seed=MODEL_SEED)

pipeline = Pipeline(stages=[tokenizer, word2Vec, indexer, trainer])

# Fit all stages on the training split, then score the held-out split.
model_lstm_like = pipeline.fit(train_set)
predictions_lstm_like = model_lstm_like.transform(test_set)

# Evaluate accuracy on the test predictions.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions_lstm_like)
print("Accuracy of the LSTM-like model: {:.2f}%".format(accuracy * 100))


Accuracy of the LSTM-like model: 83.59%
