### Text Classification with PySpark
#### Binary Text Classification (review sentiment)

#### Task
+ predict whether an Amazon apparel review is positive (star rating above 3) given its review text

#### Pyspark
+ pipenv install pyspark


In [1]:
!pip install pyspark



In [53]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [54]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("TextClassifierwithPySpark")
    .getOrCreate()
)

In [78]:
# Read the first parquet shard of the Apparel reviews into a DataFrame
df1 = spark.read.format('parquet').load(
    'Apparel_v1_00/amazon_us_reviews-train-00000-of-00005.parquet'
)

In [80]:
# Read the second parquet file into its own DataFrame
# NOTE(review): this file name ends in '-of-00004' while the first one ends in
# '-of-00005' -- confirm this is the intended second shard.
df2 = spark.read.format('parquet').load(
    'Apparel_v1_00/amazon_us_reviews-train-00000-of-00004.parquet'
)


In [81]:
# Merge the datasets
# unionByName matches columns by name. Plain union() matches them by position
# and would silently misalign the data if the two files listed their columns
# in a different order.
df = df1.unionByName(df2)

In [82]:
# Preview the first 20 rows of the merged data, with long cell values truncated
df.show(n=20, truncate=True)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   32158956|R1KKOXHNI8MSXU|B01KL6O72Y|      24485154|Easy Tool Stainle...|         Apparel|          4|            0|          0|   0|                1|★ THESE REALLY DO...|These Really Do W...| 2013-01-14|
|         US|    2714559|R26SP2OPDK4HT7|B01ID3ZS5W|     363128556|V28 Women Cowl Ne...|         Apparel|          5|    

In [83]:
# Convert 'review_body' column to string type so the text UDF and the
# Tokenizer always receive string input
df = df.withColumn('review_body', df['review_body'].cast('string'))

# Apply label encoding based on star_rating: 1 when the rating is above 3, else 0.
# The rating is compared as an integer. Comparing it as a string against '3'
# would be a lexicographic comparison, which only happens to work for
# single-digit ratings (for example, '10' sorts before '3').
df = df.withColumn(
    'label',
    (df['star_rating'].cast('integer') > 3).cast('integer'),
)

In [84]:
# Define the preprocess_text function
def preprocess_text(text):
    """Normalise a piece of review text before tokenisation.

    Removes every ASCII punctuation character listed in ``string.punctuation``,
    lower-cases the text, and collapses any run of whitespace into a single
    space (leading and trailing whitespace is dropped).

    Args:
        text: The raw text to clean, or ``None``.

    Returns:
        The cleaned string. ``None`` input yields ``''`` so that a missing
        value does not raise inside the function.
    """
    # Imported locally so the function is self-contained when wrapped as a UDF.
    import string

    # A missing value has nothing to clean; return an empty string.
    if text is None:
        return ''

    # Remove punctuation. str.translate deletes each character literally.
    # Building a regex character class from the raw punctuation string does
    # not work here: the sequence '\]' inside it is read as an escaped ']',
    # so backslashes would be left in the text.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Convert to lowercase
    text = text.lower()

    # Remove extra whitespaces
    text = ' '.join(text.split())

    return text

In [85]:
# Wrap preprocess_text as a Spark UDF that returns a string column
from pyspark.sql.functions import udf
preprocess_udf = udf(f=preprocess_text, returnType='string')

In [86]:
# Replace 'review_body' with its cleaned version produced by the UDF
df = df.withColumn(
    'review_body',
    preprocess_udf(df['review_body']),
)


In [87]:
# Keep only the model input text and the target label
df = df.select(df['review_body'], df['label'])

In [88]:
# Randomly divide the rows into 70% training / 30% testing; the fixed seed
# keeps the split repeatable
trainDF, testDF = df.randomSplit(weights=[0.7, 0.3], seed=42)

In [89]:
# Pipeline stages, listed in the order they are chained: each stage reads the
# column written by the previous one.

# review_body -> mytokens
tokenizer = Tokenizer(
    inputCol='review_body',
    outputCol='mytokens',
)

# mytokens -> filtered_tokens (stop words removed)
stopwords_remover = StopWordsRemover(
    inputCol='mytokens',
    outputCol='filtered_tokens',
)

# filtered_tokens -> rawFeatures (term counts)
vectorizer = CountVectorizer(
    inputCol='filtered_tokens',
    outputCol='rawFeatures',
)

# rawFeatures -> vectorizedFeatures (IDF-weighted counts)
idf = IDF(
    inputCol='rawFeatures',
    outputCol='vectorizedFeatures',
)

# Classifier trained on vectorizedFeatures against the 'label' column
lr = LogisticRegression(
    featuresCol='vectorizedFeatures',
    labelCol='label',
)


In [90]:
# Chain the feature stages and the classifier into a single estimator
pipeline_stages = [tokenizer, stopwords_remover, vectorizer, idf, lr]
pipeline = Pipeline(stages=pipeline_stages)


### Feature Extraction
#### Build Features From Text
+ CountVectorizer
+ TFIDF
+ WordEmbedding
+ HashingTF
+ etc

In [91]:
# Train every pipeline stage on the training split; the result is the fitted model
lr_model = pipeline.fit(dataset=trainDF)


In [92]:
# Run the fitted pipeline over the held-out test split
predictions = lr_model.transform(dataset=testDF)

In [93]:
# Score the test predictions: accuracy of 'prediction' against 'label'
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
    predictionCol='prediction',
)
accuracy = evaluator.evaluate(predictions)

In [94]:
# Report the accuracy as a percentage with two decimal places
accuracy_pct = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_pct))

Accuracy: 87.63%


In [103]:

# Build a one-row DataFrame holding the text to classify
single_prediction = spark.createDataFrame(
    [("I really like that it gives me backache",)],
    schema=["review_body"],
)

# Clean the text with the same UDF that was applied to the training data
single_prediction = single_prediction.withColumn(
    'review_body',
    preprocess_udf(single_prediction['review_body']),
)


In [104]:
# Score the single example with the fitted pipeline.
# Note: 'single_prediction' is rebound to the model output here, so rebuild the
# input DataFrame before running this cell again.
single_prediction = lr_model.transform(dataset=single_prediction)

# Display the cleaned text next to the model's scores, without truncating cells
output_columns = ['review_body', 'rawPrediction', 'probability', 'prediction']
single_prediction.select(*output_columns).show(truncate=False)

+---------------------------------------+----------------------------------------+----------------------------------------+----------+
|review_body                            |rawPrediction                           |probability                             |prediction|
+---------------------------------------+----------------------------------------+----------------------------------------+----------+
|i really like that it gives me backache|[-2.8372515428255287,2.8372515428255287]|[0.05534405463567931,0.9446559453643207]|1.0       |
+---------------------------------------+----------------------------------------+----------------------------------------+----------+

