## Sentiment Analysis of 600,000 Samples

Dataset Source: https://www.kaggle.com/datasets/tariqsays/sentiment-dataset-with-1-million-tweets

##### Import Necessary Libraries

In [0]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, StructType, StructField, DoubleType
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

##### Versions of Libraries, Modules, Frameworks Used in This Project

In [0]:
# Report the library versions in use, with the captions right-aligned to 24 columns.
print(f"{'Spark NLP version: ':>24}", sparknlp.version())
print(f"{'Apache Spark version: ':>24}", spark.version)

     Spark NLP version:  4.3.1
  Apache Spark version:  3.3.0


#### Create Functions Used Throughout Project

##### Function to Ingest Dataset

In [0]:
def ingest_dataset(file_location: str, 
                   schema: StructType, 
                   delimiter: str = ',' 
                  ) -> pyspark.sql.dataframe.DataFrame:
    '''
    Load a CSV file (first row treated as a header) into a dataframe
    using the supplied schema.

    file_location: path of the CSV file to read.
    schema: schema applied to the file; schema inference is switched off.
    delimiter: field separator, a comma by default.
    '''
    reader_options = {
        "inferSchema": "false",
        "header": "true",
        "sep": delimiter,
    }

    reader = spark.read.format("csv").options(**reader_options).schema(schema)
    return reader.load(file_location)

##### Function to Remove Imbalance in Classes (Outputs)

In [0]:
def balance_dataset(dataset, 
                    unique_label_values, 
                    new_schema, 
                    samples = 20000, 
                    label_col = "label" 
                   ) -> pyspark.sql.dataframe.DataFrame:
    '''
    Resample every class so that each one contributes roughly
    'samples' rows, then stack the per-class results into a single
    dataframe.

    dataset: dataframe containing the column named by 'label_col'.
    unique_label_values: iterable of the label values to keep and balance.
    new_schema: schema of the (initially empty) dataframe the per-class
        samples are unioned onto. union() matches columns by position,
        so 'dataset' must have its columns in the same order.
    samples: approximate number of rows wanted per class.
    label_col: name of the label column.

    Classes larger than 'samples' are sampled without replacement and
    classes smaller than 'samples' are sampled with replacement. The
    per-class counts are approximate because sample() does not
    guarantee an exact fraction. A label with no rows in 'dataset' is
    skipped rather than raising ZeroDivisionError.
    '''
    new_df = spark.createDataFrame([], new_schema)

    for ulab in unique_label_values:
        # extract df of only the desired label
        class_df = dataset.where(F.col(label_col).isin(ulab))

        class_count = class_df.count()
        if class_count == 0:
            # nothing to sample from (and the ratio below would divide by zero)
            continue

        ratio = round(samples / class_count, 4)
        if ratio > 1.0:
            # Oversample (with replacement)
            class_df = class_df.sample(True, ratio, seed=42)
        elif ratio < 1.0:
            # Undersample (without replacement)
            class_df = class_df.sample(False, ratio, seed=42)
        # ratio == 1.0: the class already has the desired size, use it as-is

        # concatenate it to the new_df
        new_df = new_df.union(class_df)

    return new_df

##### Function for Evaluating Metrics

In [0]:
def mc_evaluate_with_spark_metrics(dataset: pyspark.sql.dataframe.DataFrame, 
                                metrics: [str], 
                                model_name: str, 
                                label_col: str = "label", 
                                predictionCol: str = "prediction" 
                               ) -> None:
    '''
    Print a boxed text table of multiclass classification metrics.

    dataset: dataframe holding the label and prediction columns.
    metrics: metric names passed, one at a time, to
        MulticlassClassificationEvaluator as 'metricName'.
    model_name: title centred in the table header.
    label_col: name of the label column.
    predictionCol: name of the prediction column.
    '''
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    border = "+---------------------------------------------+"

    def table_row(left: str, right: str) -> str:
        # left cell right-aligned to 20 chars, right cell left-aligned to 14
        return "|   " + left.rjust(20) + "  |  " + right.ljust(14) + "   |"

    print(border)
    print("|  " + model_name.center(41) + "  |")
    print(border)
    print(table_row("Metric", "Value"))
    print(border)

    for metric_name in metrics:
        # one evaluator (and one evaluation pass) per requested metric
        score = MulticlassClassificationEvaluator(
            labelCol=label_col,
            predictionCol=predictionCol,
            metricName=metric_name,
        ).evaluate(dataset)
        print(table_row(metric_name, str(round(score, 6))))
        print(border)

#### Data Ingestion & Preprocessing

##### Data Ingestion

In [0]:
# Path of the raw CSV file
file_location = "/FileStore/tables/dataset.csv"

# All three columns are read as nullable strings
orig_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("text", "language", "label")
])

df = ingest_dataset(file_location, schema=orig_schema, delimiter=',')

# Row count of the raw data, followed by a preview
print(df.count())
display(df)

##### Data Preprocessing

In [0]:
# Keep English rows only, then drop rows containing nulls and duplicate rows
df = (
    df.filter(df["language"] == "en")
      .na.drop()
      .dropDuplicates()
)

print(df.count())
display(df)

##### Remove Unnecessary Feature & Class Imbalance

In [0]:
# Every remaining row is English, so the language column carries no information
df = df.drop("language")

bal_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("text", "label")
])

labels_to_balance = [row["label"] for row in df.select("label").distinct().collect()]

# The per-class sample count is kept low because the dataframe had to be
# down-sampled to fit within the memory restrictions of Databricks'
# Community Edition.
ds = balance_dataset(df, labels_to_balance, bal_schema, samples=150000)

display(ds)

##### Split Dataset into Training & Testing Dataset

In [0]:
# 80/20 split with a fixed seed; both halves are marked for caching
train_ds, test_ds = (
    split.persist() for split in ds.randomSplit([0.80, 0.20], seed=42)
)

# count() is the first action on each split
print(f"There are {train_ds.count()} samples in the training dataset.")
print(f"There are {test_ds.count()} samples in the testing dataset.")

There are 479932 samples in the training dataset.
There are 120292 samples in the testing dataset.


#### Build & Train Model

##### Build Pipeline Stages

In [0]:
# Document Assembler: raw "text" column -> "document" annotation
doc = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

# Universal Sentence Encoder (default pretrained model): "document" -> "sentence_embeddings"
use = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Deep-learning classifier trained on the embeddings against the "label" column
doc_sa = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
    .setLabelColumn("label")
    .setBatchSize(16)
    .setMaxEpochs(10)
    .setLr(5e-3)
)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][OK!]


##### Build Pipeline

In [0]:
# Stages run in order: document assembly -> sentence embeddings -> classifier
sa_pipe = Pipeline(stages=[doc, use, doc_sa])

##### Fit Pipeline to Training Dataset (Train Model)

In [0]:
# Fit the pipeline on the training split; sa_model is the fitted pipeline
# that generates the predictions in the evaluation section.
sa_model = sa_pipe.fit(train_ds)

#### Evaluate Model

##### Generate Predictions Using Testing Dataset

In [0]:
# Training is finished, so release the cached training split.
train_ds = train_ds.unpersist()

# Apply the fitted pipeline to the test split.
predictions = sa_model.transform(test_ds)

##### Prepare Predictions for Metrics Function

In [0]:
# Keep the true label and the first entry of the classifier's
# "sentiment.result" array as the predicted label.
preds = predictions.select(
    F.col("label"),
    F.col("sentiment.result").getItem(0).alias("prediction"),
)

# persist() is lazy: nothing is cached until an action runs. Force one here,
# while test_ds is still cached, so that inference reads the cached test split
# instead of recomputing it from its lineage after the unpersist below.
preds = preds.persist()
preds.count()

test_ds = test_ds.unpersist()

##### Prepare Predictions for Metrics Function (Part 2)

In [0]:
# Sentiment name -> numeric code (still a string at this point; cast to double below)
str_to_dbl_converter = {
    "litigious": "0",
    "negative": "1",
    "uncertainty": "2",
    "positive": "3",
}

# Substitute the codes for the names, then cast both columns to double
# as required by the metrics function.
preds = (
    preds.replace(str_to_dbl_converter)
    .withColumn("label", F.col("label").cast(DoubleType()))
    .withColumn("prediction", F.col("prediction").cast(DoubleType()))
)

display(preds)

label,prediction
3.0,3.0
3.0,3.0
3.0,3.0
3.0,3.0
3.0,3.0
3.0,0.0
3.0,3.0
3.0,3.0
3.0,3.0
3.0,3.0


##### Metrics Evaluation Function

In [0]:
# Metrics reported for the test-set predictions
metrics_to_eval = ["accuracy", "f1", "weightedPrecision", "weightedRecall"]

mc_evaluate_with_spark_metrics(
    dataset=preds,
    metrics=metrics_to_eval,
    model_name="Sentiment Analysis of 600,000 Tweets",
    label_col="label",
    predictionCol="prediction",
)

+---------------------------------------------+
|     Sentiment Analysis of 600,000 Tweets    |
+---------------------------------------------+
|                 Metric  |  Value            |
+---------------------------------------------+
|               accuracy  |  0.748562         |
+---------------------------------------------+
|                     f1  |  0.748031         |
+---------------------------------------------+
|      weightedPrecision  |  0.748284         |
+---------------------------------------------+
|         weightedRecall  |  0.748562         |
+---------------------------------------------+


##### End Spark Session

In [0]:
# Release the predictions dataframe from the cache.
# NOTE(review): 'preds' was reassigned to the result of replace()/withColumn()
# after persist() was called, so this may not be the dataframe that was
# actually cached -- confirm. spark.stop() below ends the session either way.
preds = preds.unpersist()

# Shut down the Spark session; no further Spark work can run after this cell.
spark.stop()

### Notes & Other Takeaways From This Project
****
- 
****
- I used the ClassifierDLApproach instead of the SentimentDLApproach because, in my past experience, the ClassifierDLApproach has returned superior results.
****
- While I wish I could have run the full dataset for this project, I am elated that Databricks has a free Community Edition with which I can improve my skillset. After multiple tests, roughly 600,000 samples (about 150,000 per class) was the maximum number I could process for this dataset within its memory limits.
****