<a href="https://colab.research.google.com/github/Brand-Sentiment-Tracking/dev-sentiment-package/blob/main/johnsnow/Comparison_model_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Class for Sentiment Analysis for News Articles**

## Colab Setup

In [1]:
# Install PySpark and Spark NLP
! pip install -q pyspark==3.1.2 spark-nlp

# Install Spark NLP Display lib
! pip install --upgrade -q spark-nlp-display

In [2]:
import sparknlp
import pandas as pd
import random
import time
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
import pyspark.sql.functions as F
from tabulate import tabulate
import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from sparknlp_display import NerVisualizer

# spark = sparknlp.start(gpu=False)
spark = sparknlp.start(gpu=True)

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  3.4.3
Apache Spark version:  3.1.2


## Define the Brand Identification Class

In [3]:
def get_brand(row_list):
    if not row_list: # If the list is empty
        return "None"

    else:
        # Create a pandas df with entity names and types
        data = [[row.result, row.metadata['entity']] for row in row_list]
        df_pd = pd.DataFrame(data, columns = ['Entity', 'Type'])
      
        # Filter only ORGs
        df_pd = df_pd[df_pd["Type"] == "ORG"]

        # Rank the ORGs by frequencies
        ranked_df = df_pd["Entity"].value_counts() # a Pandas Series object
            
        # If no ORG identified in headline, return None
        if len(ranked_df.index) == 0:
           return "None"

        # If only one ORG appears in headline, return it
        elif len(ranked_df.index) == 1:
           return ranked_df.index[0]

        # If one ORG appear more than the others, return that one 
        elif ranked_df[0] > ranked_df[1]:
            return ranked_df.index[0] 

        else: # If multiple ORGs appear the same time, return randomly (TO BE MODIFIED)
            return random.choice([ranked_df.index[0], ranked_df.index[1]])
            # TO DO: break even - Wikidata for article body #

In [4]:
class BrandIdentification:
    def __init__(self, MODEL_NAME):
        self.MODEL_NAME = MODEL_NAME

        # Define Spark NLP pipeline 
        documentAssembler = DocumentAssembler() \
            .setInputCol('text') \
            .setOutputCol('document')

        tokenizer = Tokenizer() \
            .setInputCols(['document']) \
            .setOutputCol('token')

        # ner_dl and onto_100 model are trained with glove_100d, so the embeddings in the pipeline should match
        if (self.MODEL_NAME == "ner_dl") or (self.MODEL_NAME == "onto_100"):
            embeddings = WordEmbeddingsModel.pretrained('glove_100d') \
                .setInputCols(["document", 'token']) \
                .setOutputCol("embeddings")

        # Bert model uses Bert embeddings
        elif self.MODEL_NAME == "ner_dl_bert":
            embeddings = BertEmbeddings.pretrained(name='bert_base_cased', lang='en') \
                .setInputCols(['document', 'token']) \
                .setOutputCol('embeddings')

        ner_model = NerDLModel.pretrained(MODEL_NAME, 'en') \
            .setInputCols(['document', 'token', 'embeddings']) \
            .setOutputCol('ner')

        ner_converter = NerConverter() \
            .setInputCols(['document', 'token', 'ner']) \
            .setOutputCol('ner_chunk')

        nlp_pipeline = Pipeline(stages=[
            documentAssembler, 
            tokenizer,
            embeddings,
            ner_model,
            ner_converter
        ])
        
        # Create the pipeline model
        empty_df = spark.createDataFrame([['']]).toDF('text') # An empty df with column name "text"
        self.pipeline_model = nlp_pipeline.fit(empty_df)


    def predict_brand(self, text): # text could be a pandas dataframe or a Spark dataframe (with a column "text"), a list of strings
        # Run the pipeline for the text
        if isinstance(text, pd.DataFrame): 
            text_df = spark.createDataFrame(text) # If input a pandas dataframe
        elif isinstance(text, list): 
            text_df = spark.createDataFrame(pd.DataFrame({'text': text})) # If input a list of strings
        elif isinstance(text, str): 
            text_df = spark.createDataFrame(pd.DataFrame({'text': text}, index=[0])) # If input a single string
        else: text_df = text

        df_spark = self.pipeline_model.transform(text_df) 

        # Improve speed of identification using Spark User-defined function
        pred_brand = F.udf(lambda z: get_brand(z), StringType()) # Output a string
        # spark.udf.register("pred_brand", pred_brand)

        df_spark_combined = df_spark.withColumn('Predicted_brand', pred_brand('ner_chunk'))
        df_spark_combined = df_spark_combined.select("text", "Predicted_brand")
        # df_spark_combined.show(100)
        
        # Remove all rows with no brands detected
        df_spark_final=df_spark_combined.filter(df_spark_combined.Predicted_brand != 'None')
        df_spark_final.show(100)

        return df_spark_final


## Define the Senitment Identification Class

In [5]:
class SentimentIdentification:

    def __init__(self, MODEL_NAME):
        """Creates a class for sentiment identication using specified model.

        Args:
          MODEL_NAME: Name of the Spark NLP pretrained pipeline.
        """

        # Create the pipeline instance
        self.MODEL_NAME = MODEL_NAME

        # Define pipeline to train
        if self.MODEL_NAME == "untrained_pipeline":
            document = DocumentAssembler()\
                .setInputCol("text")\
                .setOutputCol("document")

            use = UniversalSentenceEncoder.pretrained() \
            .setInputCols(["document"])\
            .setOutputCol("sentence_embeddings")

            # the classes/labels/categories are in category column
            sentimentdl = SentimentDLApproach()\
              .setInputCols(["sentence_embeddings"])\
              .setOutputCol("class")\
              .setLabelColumn("label")\
              .setMaxEpochs(5)\
              .setEnableOutputLogs(True)

            pipeline = Pipeline(
                stages = [
                    document,
                    use,
                    sentimentdl
                ])

            self.pipeline_model = pipeline.fit(spark.createDataFrame([['']]).toDF("text"))

          # Create a custom pipline if requested
        if self.MODEL_NAME == "custom_pipeline": # https://nlp.johnsnowlabs.com/2021/11/03/bert_sequence_classifier_finbert_en.html
            document_assembler = DocumentAssembler() \
                .setInputCol('text') \
                .setOutputCol('document')

            tokenizer = Tokenizer() \
                .setInputCols(['document']) \
                .setOutputCol('token')

            sequenceClassifier = BertForSequenceClassification \
                  .pretrained('bert_base_sequence_classifier_imdb', 'en') \
                  .setInputCols(['token', 'document']) \
                  .setOutputCol('class') \
                  .setCaseSensitive(True) \
                  .setMaxSentenceLength(512)
            # bert_sequence_classifier_finbert
            pipeline = Pipeline(stages=[
                document_assembler,
                tokenizer,
                sequenceClassifier
            ])

            self.pipeline_model = pipeline.fit(spark.createDataFrame([['']]).toDF("text"))

        else:
            self.pipeline_model = PretrainedPipeline(self.MODEL_NAME, lang = 'en')


    def predict_string_list(self, string_list):
        """Predicts sentiment of the input list of strings.

        Args:
          string_list: List of strings to classify.
        """
 
        # Annotate input text using pretrained model

        if self.MODEL_NAME == "custom_pipeline":
            pipeline_annotator = LightPipeline(self.pipeline_model) # Convert the pipeline to an annotator
        else:
            pipeline_annotator = self.pipeline_model

        annotations =  pipeline_annotator.annotate(string_list)

        if self.MODEL_NAME  == "custom_pipeline" or  self.MODEL_NAME == "classifierdl_bertwiki_finance_sentiment_pipeline":
            return [annotation['class'][0] for annotation in annotations] # Return the sentiment list of strings if using bert pipeline
        else:
            return [annotation['sentiment'][0] for annotation in annotations] # if using imdb model

    def predict_dataframe(self, df):
        """Annotates the input dataframe with the classification results.

        Args:
          df : Pandas or Spark dataframe to classify (must contain a "text" column)
        """

        if isinstance(df, pd.DataFrame):
            # Convert to spark dataframe for faster prediction
            df_spark = spark.createDataFrame(df) 
        else:
            df_spark = df

        # Annotate dataframe with classification results
        df_spark = self.pipeline_model.transform(df_spark)

        # Visusalize schemas
        # df_spark.printSchema()
        # print(df_spark.select(explode(col("sentiment.result"))).collect()[10])
        # print(df_spark.select(explode(col("sentiment.metadata"))).collect()[0])

        #Extract sentiment score
        if self.MODEL_NAME == "custom_pipeline":
          df_spark_scores = df_spark.select(explode(col("class.metadata")).alias("metadata")).select(col("metadata")["Some(positive)"].alias("positive"),
                                                                                            col("metadata")["Some(neutral)"].alias("neutral"),
                                                                                            col("metadata")["Some(negative)"].alias("negative"))
        elif self.MODEL_NAME == "classifierdl_bertwiki_finance_sentiment_pipeline":
          df_spark_scores = df_spark.select(explode(col("class.metadata")).alias("metadata")).select(col("metadata")["positive"].alias("positive"),
                                                                                            col("metadata")["neutral"].alias("neutral"),
                                                                                            col("metadata")["negative"].alias("negative"))
          
        elif self.MODEL_NAME == "analyze_sentimentdl_use_twitter":
          df_spark_scores = df_spark.select(explode(col("sentiment.metadata")).alias("metadata")).select(col("metadata")["positive"].alias("positive"),
                                                                                                  col("metadata")["negative"].alias("negative"))
        else:
          df_spark_scores = df_spark.select(explode(col("sentiment.metadata")).alias("metadata")).select(col("metadata")["pos"].alias("positive"),
                                                                                            col("metadata")["neg"].alias("negative"))

        
        # Extract only target and label columns
        if self.MODEL_NAME == "custom_pipeline" or self.MODEL_NAME == "classifierdl_bertwiki_finance_sentiment_pipeline":
              df_spark = df_spark.select("text", "True_Sentiment", "class.result")
        else:
              df_spark = df_spark.select("text", "True_Sentiment", "sentiment.result")

        # Rename to result column to Predicted Sentiment
        df_spark = df_spark.withColumnRenamed("result", "Predicted_Sentiment")

        # Convert sentiment from a list to a string
        df_spark = df_spark.withColumn("Predicted_Sentiment", array_join("Predicted_Sentiment", ""))

        # Join the predictions dataframe to the scores dataframe
        # Add temporary column index to join
        w = Window.orderBy(monotonically_increasing_id())
        df_spark_with_index =  df_spark.withColumn("columnindex", row_number().over(w))
        df_spark_scores_with_index =  df_spark_scores.withColumn("columnindex", row_number().over(w))

        # Join the predictions and the scores in one dataframe
        df_spark_with_index = df_spark_with_index.join(df_spark_scores_with_index,
                                df_spark_with_index.columnindex == df_spark_scores_with_index.columnindex,
                                'inner').drop(df_spark_scores_with_index.columnindex)

        # Remove the index column
        df_spark_combined = df_spark_with_index.drop(df_spark_with_index.columnindex)

        # Convert to pandas dataframe for postprocessing (https://towardsdatascience.com/text-classification-in-spark-nlp-with-bert-and-universal-sentence-encoders-e644d618ca32)
        df_pandas_postprocessed = df_spark_combined.toPandas()

        return df_pandas_postprocessed


    def compute_accuracy(self, df_pandas_postprocessed):
        """Computes accuracy by comparing labels of input dataframe.

        Args:
          df_pandas_postprocessed: pandas dataframe containing "True_Sentiment" and "Predicted_Sentiment" columns
        """
    
        from sklearn.metrics import classification_report, accuracy_score

        # Compute the accuracy
        accuracy = accuracy_score(df_pandas_postprocessed["True_Sentiment"], df_pandas_postprocessed["Predicted_Sentiment"])
        accuracy *= 100
        classification_report = classification_report(df_pandas_postprocessed["True_Sentiment"], df_pandas_postprocessed["Predicted_Sentiment"])

        # Alternatively if the input is a postprocessed spark dataframe
        # Compute accuracy by comparing each true label with predicted label
        # accuracy = df_spark.filter(df_spark.Predicted_Sentiment == df_spark.True_Sentiment).count()/ num_sentences

        return accuracy, classification_report

## Test the accuracy of sentiment using the Financial News Headline Dataset

## Sentiment

### Load Sentiment Test data Financial Phrase Bank

In [6]:
# Convert Kaggle data to Pandas dataframe and preprocess
import time

sentiment_url = 'https://raw.githubusercontent.com/Brand-Sentiment-Tracking/python-package/main/data/sentiment_test_data.csv' # Financial Phrase Bank

# Store data in a Pandas Dataframe
df_pandas = pd.read_csv(sentiment_url, header=None)


# Change column names (pipelines require a "text" column to predict)
df_pandas.columns = ['True_Sentiment', 'text']

# Shuffle the DataFrame rows
# df_pandas = df_pandas.sample(frac = 1)

# Make dataset smaller for faster runtime
num_sentences = 4846 # Total is 4846
total_num_sentences = df_pandas.shape[0]
df_pandas.drop(df_pandas.index[num_sentences:total_num_sentences], inplace=True)

print(df_pandas.shape)

# Create a preprocessed spark dataframe
from pyspark import SparkFiles
spark.sparkContext.addFile(sentiment_url)

# Read raw dataframe
df_spark = spark.read.csv("file://"+SparkFiles.get("sentiment_test_data.csv"))

# Rename columns
df_spark = df_spark.withColumnRenamed("_c0", "True_Sentiment").withColumnRenamed("_c1", "text")
df_spark = df_spark.limit(num_sentences)

(4846, 2)


## Read Political News Dataset

In [7]:
# Convert Kaggle data to Pandas dataframe and preprocess
import pandas as pd

sentiment_url_2 = 'https://raw.githubusercontent.com/Brand-Sentiment-Tracking/python-package/main/data/SEN_en_R.csv' # SEN data

# Store data in a Pandas Dataframe
df_pandas_2 = pd.read_csv(sentiment_url_2, header=None, skiprows=1)

df_pandas_2.columns = ['idx', 'text', 'Entity', 'True_Sentiment']

# Modify labels to match positive, neutral, negative
df_pandas_2 = df_pandas_2.replace({'True_Sentiment': {'pos' : 'positive', 'neg' : 'negative', 'neutr' : 'neutral'}})


# Make dataset smaller for faster runtime
num_sentences = 1271 # Total is 4846
total_num_sentences = df_pandas_2.shape[0]
df_pandas_2.drop(df_pandas_2.index[num_sentences:total_num_sentences], inplace=True)

# Filter unk labels
df_pandas_2 = df_pandas_2[df_pandas_2['True_Sentiment']!='unk']


print(df_pandas_2.shape)
display(df_pandas_2)


# Create a preprocessed spark dataframe
from pyspark import SparkFiles
spark.sparkContext.addFile(sentiment_url_2)

# Read raw dataframe
df_spark = spark.read.option("header","true").csv("file://"+SparkFiles.get("SEN_en_R.csv"))

# Rename columns
df_spark = df_spark.withColumnRenamed("_c0", "idx").withColumnRenamed("_c1", "text") \
                                                  .withColumnRenamed("_c2", 'Entity') \
                                                  .withColumnRenamed("_c3", 'True_Sentiment')
df_spark = df_spark.limit(num_sentences)

df_spark.show()


(1237, 4)


Unnamed: 0,idx,text,Entity,True_Sentiment
0,0,Boris Johnson Joins Trump in Redefining Conser...,Trump,neutral
1,1,Trump Praises Controversial Hungarian Leader,Trump,negative
2,2,Stung by Trump’s Criticisms of Russian Gas Dea...,Trump,neutral
3,3,What ‘America First’ Means Under Trump Is Comi...,Trump,neutral
4,4,Trump Calls for NATO Expansion Into Middle East,Trump,positive
...,...,...,...,...
1266,1266,Ivanka Trump was friends with former British s...,Trump,negative
1267,1267,How could 63 million people be wrong? The GOP ...,Trump,negative
1268,1268,What Trump claimed about the Russia probe — an...,Trump,neutral
1269,1269,If officials objecting to Trump’s candidacy wa...,Trump,neutral


+---+--------------------+------+--------------+
|idx|            headline|entity|majority_label|
+---+--------------------+------+--------------+
|  0|Boris Johnson Joi...| Trump|         neutr|
|  1|Trump Praises Con...| Trump|           neg|
|  2|Stung by Trump’s ...| Trump|         neutr|
|  3|What ‘America Fir...| Trump|         neutr|
|  4|Trump Calls for N...| Trump|           pos|
|  5|Trump Set to Meet...| Trump|         neutr|
|  6|Trump Clashes Wit...| Trump|         neutr|
|  7|Donald Trump Won’...| Trump|           pos|
|  8|Trump to Attend N...| Trump|         neutr|
|  9|Trump-Erdogan Rap...| Trump|         neutr|
| 10|Trump Says Erdoga...| Trump|         neutr|
| 11|Ukrainian Preside...| Trump|         neutr|
| 12|In Months Before ...| Trump|           neg|
| 13|Ukraine to Review...| Biden|         neutr|
| 14|For Trump, Long O...| Trump|         neutr|
| 15|Taliban Negotiato...| Trump|           neg|
| 16|Trump Administrat...| Trump|           neg|
| 17|U.S. Sanctions 

### Classify using Pandas Dataframe as input

In [9]:
from pyspark.sql.functions import array_join
from pyspark.sql.functions import col, explode, expr, greatest
from pyspark.sql.window import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number

# Create identifier
# identifier_pretrained = SentimentIdentification(MODEL_NAME = "classifierdl_bertwiki_finance_sentiment_pipeline")
# identifier_pretrained = SentimentIdentification(MODEL_NAME = "custom_pipeline")
# identifier_pretrained = SentimentIdentification(MODEL_NAME = "untrained_pipeline")
# identifier_pretrained = SentimentIdentification(MODEL_NAME = "analyze_sentimentdl_glove_imdb") 
# identifier_pretrained = SentimentIdentification(MODEL_NAME = "analyze_sentimentdl_use_imdb")
# identifier_pretrained = SentimentIdentification(MODEL_NAME = "analyze_sentiment")
identifier_pretrained = SentimentIdentification(MODEL_NAME = "analyze_sentimentdl_use_twitter")

# Change dataframe
# df_pandas = df_pandas_2

start = time.time()
df_pandas_postprocessed = identifier_pretrained.predict_dataframe(df_pandas)
end = time.time()

print(f"{end-start} seconds elapsed to classify {num_sentences} sentences.")

# Modify predicted labels to match with true labels
df_pandas_postprocessed = df_pandas_postprocessed.replace({'Predicted_Sentiment': {'pos' : 'positive', 'neg' : 'negative'}})

display(df_pandas_postprocessed)

# Print accuracy metrics
accuracy, report = identifier_pretrained.compute_accuracy(df_pandas_postprocessed)
print(accuracy)
print(report)




# Accuracy report for custom pipeline on financial headlines:

# 88.44407758976476
#               precision    recall  f1-score   support

#     negative       0.82      0.96      0.88       604
#      neutral       0.94      0.87      0.90      2879
#     positive       0.81      0.88      0.85      1363

#     accuracy                           0.88      4846
#    macro avg       0.86      0.90      0.88      4846
# weighted avg       0.89      0.88      0.89      4846


# Accuracy report for classifierdl_bertwiki_finance_sentiment_pipeline pipeline on financial headlines:

# 90.09492364836979
#               precision    recall  f1-score   support

#     negative       0.92      0.86      0.89       604
#      neutral       0.91      0.94      0.93      2879
#     positive       0.87      0.83      0.85      1363

#     accuracy                           0.90      4846
#    macro avg       0.90      0.88      0.89      4846
# weighted avg       0.90      0.90      0.90      4846


# Accuracy report for twitter model on financial headlines:
# 28.951712752785802
#               precision    recall  f1-score   support

#     negative       0.25      0.75      0.38       604
#      neutral       0.54      0.03      0.05      2879
#     positive       0.30      0.64      0.41      1363

#     accuracy                           0.29      4846
#    macro avg       0.37      0.47      0.28      4846
# weighted avg       0.44      0.29      0.19      4846

# Accuracy report for bert_base_sequence_classifier_imbd pipeline on political headlines:



# Accuracy report for custom pipeline on political headlines:

# 49.15117219078415
#               precision    recall  f1-score   support

#     negative       0.56      0.19      0.29       449
#      neutral       0.48      0.87      0.62       571
#     positive       0.42      0.11      0.18       217

#     accuracy                           0.49      1237
#    macro avg       0.49      0.39      0.36      1237
# weighted avg       0.50      0.49      0.42      1237

# Accuracy report for classifierdl_bertwiki_finance_sentiment_pipeline pipeline on political headlines:

# 47.696038803556995
#               precision    recall  f1-score   support

#     negative       0.52      0.12      0.20       449
#      neutral       0.48      0.86      0.62       571
#     positive       0.41      0.19      0.26       217

#     accuracy                           0.48      1237
#    macro avg       0.47      0.39      0.36      1237
# weighted avg       0.48      0.48      0.40      1237

# Accuracy report for analyze_sentimentdl_glove_imdb on political headlines

# 37.10590137429264
#               precision    recall  f1-score   support

#     negative       0.46      0.66      0.54       449
#      neutral       0.52      0.03      0.05       571
#     positive       0.26      0.67      0.38       217

#     accuracy                           0.37      1237
#    macro avg       0.41      0.45      0.32      1237
# weighted avg       0.45      0.37      0.29      1237

# Accuracy report for analyze_sentimentdl_use_imdb on political headlines

# 29.426030719482622
#               precision    recall  f1-score   support

#     negative       0.44      0.44      0.44       449
#      neutral       0.62      0.02      0.03       571
#     positive       0.20      0.72      0.32       217

#     accuracy                           0.29      1237
#    macro avg       0.42      0.39      0.26      1237
# weighted avg       0.48      0.29      0.23      1237

# Accuracy report for analyze_sentiment

# 28.37510105092967
#                                   precision    recall  f1-score   support

#                               na       0.00      0.00      0.00         0
#                         negative       0.38      0.59      0.46       449
#                 negativenegative       0.00      0.00      0.00         0
#         negativenegativenegative       0.00      0.00      0.00         0
#                 negativepositive       0.00      0.00      0.00         0
#         negativepositivenegative       0.00      0.00      0.00         0
# negativepositivenegativenegative       0.00      0.00      0.00         0
#         negativepositivepositive       0.00      0.00      0.00         0
#                          neutral       0.00      0.00      0.00       571
#                         positive       0.21      0.40      0.27       217
#                       positivena       0.00      0.00      0.00         0
#                 positivenegative       0.00      0.00      0.00         0
#                 positivepositive       0.00      0.00      0.00         0
#         positivepositivepositive       0.00      0.00      0.00         0

#                         accuracy                           0.28      1237
#                        macro avg       0.04      0.07      0.05      1237
#                     weighted avg       0.17      0.28      0.21      1237


# Accuracy report for analyze_sentimentdl_use_twitter

# 35.89329021827001
#               precision    recall  f1-score   support

#     negative       0.43      0.67      0.53       449
#      neutral       0.49      0.04      0.07       571
#     positive       0.24      0.55      0.34       217

#     accuracy                           0.36      1237
#    macro avg       0.39      0.42      0.31      1237
# weighted avg       0.42      0.36      0.28      1237

analyze_sentimentdl_use_twitter download started this may take some time.
Approx size to download 935.1 MB
[OK!]
74.93413925170898 seconds elapsed to classify 1271 sentences.


Unnamed: 0,text,True_Sentiment,Predicted_Sentiment,positive,negative
0,"According to Gran , the company has no plans t...",neutral,negative,0.049131747,0.95086825
1,Technopolis plans to develop in stages an area...,neutral,positive,0.9986187,0.0013813168
2,The international electronic industry company ...,negative,negative,9.0321495E-15,1.0
3,With the new production plant the company woul...,positive,positive,0.9408187,0.059181254
4,According to the company 's updated strategy f...,positive,positive,1.0,5.2342273E-9
...,...,...,...,...,...
4841,LONDON MarketWatch -- Share prices ended lower...,negative,negative,0.0,1.0
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...,neutral,negative,0.18757151,0.81242853
4843,Operating profit fell to EUR 35.4 mn from EUR ...,negative,negative,1.7573632E-18,1.0
4844,Net sales of the Paper segment decreased to EU...,negative,negative,0.1898243,0.8101757


28.951712752785802
              precision    recall  f1-score   support

    negative       0.25      0.75      0.38       604
     neutral       0.54      0.03      0.05      2879
    positive       0.30      0.64      0.41      1363

    accuracy                           0.29      4846
   macro avg       0.37      0.47      0.28      4846
weighted avg       0.44      0.29      0.19      4846



# Predict using Spark Dataframe Input

In [10]:
# Create identifier
# identifier_pretrained = SentimentIdentification(MODEL_NAME = "classifierdl_bertwiki_finance_sentiment_pipeline")
identifier_pretrained = SentimentIdentification(MODEL_NAME = "custom_pipeline")

start = time.time()
# df_pandas_postprocessed = identifier_pretrained.predict_sp_dataframe(df_spark)
df_pandas_postprocessed = identifier_pretrained.predict_dataframe(df_spark)
end = time.time()

print(f"{end-start} seconds elapsed to classify {num_sentences} sentences.")

display(df_pandas_postprocessed)

bert_base_sequence_classifier_imdb download started this may take some time.
Approximate size to download 387.6 MB
[OK!]


IllegalArgumentException: ignored

### Identify the sentiment in each sentence one by one

In [10]:
# Create the identifier object
# identifier = SentimentIdentification(MODEL_NAME = "custom_pipeline") # 90.2% accuracy on 500 sentences 89.8% on 1000 sentences
# identifier = SentimentIdentification(MODEL_NAME =  "classifierdl_bertwiki_finance_sentiment_pipeline") # Alternative pretrained pipeline 90.0% accuracy on 500 sentences
# identifier = SentimentIdentification(MODEL_NAME =  "classifierdl_bertwiki_finance_sentiment_pipeline") # Alternative pretrained pipeline 90.0% accuracy on 500 sentences
# identifier = SentimentIdentification(MODEL_NAME = "analyze_sentimentdl_glove_imdb") 
# identifier = SentimentIdentification(MODEL_NAME = "analyze_sentimentdl_use_imdb")
# identifier = SentimentIdentification(MODEL_NAME = "analyze_sentiment")
identifier = SentimentIdentification(MODEL_NAME = "analyze_sentimentdl_use_twitter")

df_pandas = df_pandas_2

# sentence = ['Trump Clashes With Macron on NATO, Trade and Islamic State']
# print(identifier.predict_string_list(sentence))

preds = []
target = []
ignored_idxs = []
sentiment_to_ignore = "neutral" # e.g. neutral

# Measure how long it takes
start = time.time()

# Collect predicted sentiment for each headline - take three minutes to run
for idx, hl in enumerate(df_pandas['text']):

    # Only append the sentiment if it is not the sentiment to ignore (e.g. neutral)
    target_sentiment = df_pandas["True_Sentiment"][df_pandas.index[idx]]

    if target_sentiment != sentiment_to_ignore:
      preds.append(identifier.predict_string_list([hl])[0])
    else:
      ignored_idxs.append(idx)

    # Print progress
    if idx % 50 == 0:
      print(f"Classification {100*idx/num_sentences}% done.")

# Remove all ignored entries from dataset
df_pandas_postprocessed = df_pandas.drop(df_pandas.index[ignored_idxs], inplace=False)

df_pandas_postprocessed['Predicted_Sentiment'] = preds

# Measure how long it takes
end = time.time()
print(f"{end-start} seconds elapsed to classify {num_sentences} sentences.")

# Modify predicted labels to match with true labels
df_pandas_postprocessed = df_pandas_postprocessed.replace({'Predicted_Sentiment': {'pos' : 'positive', 'neg' : 'negative'}})


accuracy, report = identifier.compute_accuracy(df_pandas_postprocessed)
print(accuracy)
print(report)

# Accuracy report for analyze_sentimentdl_glove_imdb

# 61.0574478901881
#               precision    recall  f1-score   support

#     negative       0.44      0.80      0.57       604
#      neutral       0.00      0.00      0.00         0
#     positive       0.87      0.52      0.66      1363

#     accuracy                           0.61      1967
#    macro avg       0.44      0.44      0.41      1967
# weighted avg       0.74      0.61      0.63      1967


# Accuracy report for analyze_sentimentdl_use_imdb

# 71.42857142857143
#               precision    recall  f1-score   support

#     negative       0.61      0.22      0.32       604
#      neutral       0.00      0.00      0.00         0
#     positive       0.73      0.93      0.82      1363

#     accuracy                           0.71      1967
#    macro avg       0.45      0.38      0.38      1967
# weighted avg       0.69      0.71      0.67      1967


# Accuracy report for analyze_sentiment

# 44.941535332994405
#               precision    recall  f1-score   support

#     negative       0.31      0.63      0.41       604
#     positive       0.69      0.37      0.48      1363

#     accuracy                           0.45      1967
#    macro avg       0.50      0.50      0.45      1967
# weighted avg       0.57      0.45      0.46      1967

# Accuracy report for twitter pipeline on financial data:
# 67.56481952211489
#               precision    recall  f1-score   support

#     negative       0.51      0.75      0.61       604
#      neutral       0.00      0.00      0.00         0
#     positive       0.87      0.64      0.74      1363

#     accuracy                           0.68      1967
#    macro avg       0.46      0.46      0.45      1967
# weighted avg       0.76      0.68      0.70      1967


# Accuracy report for twitter pipeline on political data: 

# 63.36336336336337
#               precision    recall  f1-score   support

#     negative       0.77      0.67      0.72       449
#      neutral       0.00      0.00      0.00         0
#     positive       0.48      0.55      0.52       217

#     accuracy                           0.63       666
#    macro avg       0.42      0.41      0.41       666
# weighted avg       0.67      0.63      0.65       666

# Accuracy report for analyze_sentimentdl_glove_imdb on political data: 

# 66.66666666666666
#               precision    recall  f1-score   support

#     negative       0.81      0.66      0.73       449
#      neutral       0.00      0.00      0.00         0
#     positive       0.51      0.67      0.58       217

#     accuracy                           0.67       666
#    macro avg       0.44      0.45      0.44       666
# weighted avg       0.71      0.67      0.68       666

# Accuracy report for analyze_sentimentdl_use_imdb on political data: 

# 53.153153153153156
#               precision    recall  f1-score   support

#     negative       0.76      0.44      0.56       449
#      neutral       0.00      0.00      0.00         0
#     positive       0.39      0.72      0.50       217

#     accuracy                           0.53       666
#    macro avg       0.38      0.39      0.35       666
# weighted avg       0.64      0.53      0.54       666

analyze_sentimentdl_use_twitter download started this may take some time.
Approx size to download 935.1 MB
[OK!]
Classification 0.0% done.
Classification 3.933910306845004% done.
Classification 7.867820613690008% done.
Classification 11.801730920535011% done.
Classification 15.735641227380016% done.
Classification 19.66955153422502% done.
Classification 23.603461841070022% done.
Classification 27.53737214791503% done.
Classification 31.471282454760033% done.
Classification 35.405192761605036% done.
Classification 39.33910306845004% done.
Classification 43.273013375295044% done.
Classification 47.206923682140044% done.
Classification 51.14083398898505% done.
Classification 55.07474429583006% done.
Classification 59.00865460267506% done.
Classification 62.942564909520065% done.
Classification 66.87647521636507% done.
Classification 70.81038552321007% done.
Classification 74.74429583005508% done.
Classification 78.67820613690007% done.
Classification 82.61211644374508% done.
Classificatio

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
df_pandas_postprocessed

### Measure the Accuracy

In [None]:
from sklearn.metrics import classification_report

y_true = df_pandas_postprocessed['True_Sentiment'].to_numpy()
y_pred = df_pandas_postprocessed['Predicted_Sentiment'].to_numpy()


print(f"The accuracy is {100* sum(y_true==y_pred)/len(y_true)}%. \n")

target_names = ['positive', 'neutral', 'negative']

# Compute classification metrics - poor accuracy
print(classification_report(y_true, y_pred))#, target_names=target_names))