In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml.classification import LinearSVC
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [3]:
# Get (or create) the SparkSession used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("TextClassifierwithPySpark")
    .getOrCreate()
)


In [4]:
# Load the reviews from a single parquet file (no merging happens here; the
# file name suggests it is shard 00000 of 00005 of the training split)
df = spark.read.parquet(
    'Apparel_v1_00/amazon_us_reviews-train-00000-of-00005.parquet'
)

In [5]:
# Replace 'star_rating' with a string-typed copy of itself
df = df.withColumn(
    'star_rating',
    df.star_rating.cast('string'),
)

In [6]:
# Replace 'review_body' with a string-typed copy of itself
df = df.withColumn(
    'review_body',
    df.review_body.cast('string'),
)


In [7]:
# Binary target: label is 1 when star_rating compares greater than '3',
# and 0 when it does not.
# NOTE(review): star_rating was cast to string in an earlier cell, so this is
# a string comparison; it matches numeric order only for single-digit ratings.
df = df.withColumn(
    'label',
    (df.star_rating > '3').cast('integer'),
)


In [8]:
# Define the preprocess_text function
def preprocess_text(text):
    """Normalize a piece of review text before tokenization.

    Removes every ASCII punctuation character, lower-cases the result and
    collapses any run of whitespace into a single space (leading and trailing
    whitespace is dropped).

    Args:
        text: The raw text, or None when the source value is null.

    Returns:
        The cleaned string. An empty string is returned when ``text`` is None
        so that a null value does not raise inside the UDF.
    """
    # Local import keeps the function self-contained.
    import string

    # A null value reaches the function as None; str methods would fail on it.
    if text is None:
        return ''

    # Remove punctuation.
    # str.translate deletes each listed character literally. The earlier
    # regex built an unescaped character class from string.punctuation, in
    # which the backslash escaped the following ']' and was therefore never
    # removed itself.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Convert to lowercase
    text = text.lower()

    # Remove extra whitespaces
    return ' '.join(text.split())

In [9]:
# Create a UDF for the preprocess_text function so it can be applied to a
# DataFrame column. No return type is passed to udf(), so PySpark's default
# return type is used.
from pyspark.sql.functions import udf
preprocess_udf = udf(preprocess_text)

In [10]:
# Overwrite 'review_body' with the output of the preprocessing UDF
df = df.withColumn(
    'review_body',
    preprocess_udf(df['review_body']),
)


In [11]:
# Keep only the text column and the target column; everything else is dropped
df = df.select(['review_body', 'label'])

In [12]:
# Randomly partition the rows with weights 0.7 (train) and 0.3 (test),
# using a fixed seed for the sampling
trainDF, testDF = df.randomSplit(weights=[0.7, 0.3], seed=42)

In [13]:
# Pipeline stages, wired together through their input/output column names:
# review_body -> mytokens -> filtered_tokens -> rawFeatures
#             -> vectorizedFeatures -> LinearSVC trained against 'label'
tokenizer = Tokenizer(
    inputCol='review_body',
    outputCol='mytokens',
)
stopwords_remover = StopWordsRemover(
    inputCol='mytokens',
    outputCol='filtered_tokens',
)
vectorizer = CountVectorizer(
    inputCol='filtered_tokens',
    outputCol='rawFeatures',
)
idf = IDF(
    inputCol='rawFeatures',
    outputCol='vectorizedFeatures',
)
svm = LinearSVC(
    featuresCol='vectorizedFeatures',
    labelCol='label',
)

In [14]:
# Chain the stages in execution order: feature extraction first, classifier last
pipeline = Pipeline(
    stages=[
        tokenizer,
        stopwords_remover,
        vectorizer,
        idf,
        svm,
    ]
)


In [15]:
# Train every pipeline stage on the training split; the result is the fitted model
svm_model = pipeline.fit(dataset=trainDF)

In [16]:
# Run the fitted pipeline over the held-out split to obtain predictions
predictions = svm_model.transform(dataset=testDF)


In [17]:
# Score the predictions: accuracy of 'prediction' against 'label'
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(dataset=predictions)

In [18]:
# Report the accuracy as a percentage with two decimal places
print(
    "Accuracy: {:.2f}%".format(accuracy * 100)
)

Accuracy: 87.43%
