In [1]:
#Importing packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import dill

In [2]:
#Loading the dataset
with open("df.pkl", "rb") as file:
    df = dill.load(file)
df.head()

Unnamed: 0,label,text,polarity,lemma_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",-0.044875,@switchfoot http://twitpic.com/2y1zl Awww bumm...
1,0,is upset that he can't update his Facebook by ...,-0.20688,upset update Facebook texte cry result Schoo...
2,0,@Kenichan I dived many times for the ball. Man...,0.24695,@Kenichan dive time ball manage save 50 rest...
3,0,my whole body feels itchy and like its on fire,-0.25,body feel itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....",-0.123475,@nationwideclass behave mad


In [3]:
#Importing azure credentials and creating a text analytics client instance
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient
import cred

credential = AzureKeyCredential(cred.cognitive_credential)
endpoint=cred.cognitive_endpoint

text_analytics_client = TextAnalyticsClient(endpoint, credential)

text_analytics_client

<azure.ai.textanalytics._text_analytics_client.TextAnalyticsClient at 0x7f686490e650>

In [15]:
#Calculating the sentiment of a sample of 5 reviews
sample_df = df.head(5)

documents = sample_df["text"]

response = text_analytics_client.analyze_sentiment(documents.to_list(), language="en")
result = [doc for doc in response]

for doc in result:
    print(f"Overall sentiment: {doc.sentiment}")
    print(
        f"Scores: positive={doc.confidence_scores.positive}; "
        f"neutral={doc.confidence_scores.neutral}; "
        f"negative={doc.confidence_scores.negative}\n"
    )

Overall sentiment: negative
Scores: positive=0.0; neutral=0.0; negative=1.0

Overall sentiment: negative
Scores: positive=0.04; neutral=0.18; negative=0.78

Overall sentiment: positive
Scores: positive=0.98; neutral=0.01; negative=0.01

Overall sentiment: neutral
Scores: positive=0.05; neutral=0.6; negative=0.34

Overall sentiment: negative
Scores: positive=0.0; neutral=0.02; negative=0.98



In [16]:
sample_df.head()

Unnamed: 0,label,text,polarity,lemma_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",-0.044875,@switchfoot http://twitpic.com/2y1zl Awww bumm...
1,0,is upset that he can't update his Facebook by ...,-0.20688,upset update Facebook texte cry result Schoo...
2,0,@Kenichan I dived many times for the ball. Man...,0.24695,@Kenichan dive time ball manage save 50 rest...
3,0,my whole body feels itchy and like its on fire,-0.25,body feel itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....",-0.123475,@nationwideclass behave mad


In [18]:
label = [1 if doc.confidence_scores.positive > doc.confidence_scores.negative else 0 for doc in result]

label

[0, 0, 1, 0, 0]

In [14]:
#Splitting our dataset and keeping only 500 rows in the test set
from sklearn.model_selection import train_test_split
X = df["text"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y , stratify = y, shuffle=True, test_size = 500)

sample_df = pd.DataFrame({'label': y_test, 'text': X_test})

sample_df

Unnamed: 0,label,text
1549331,1,@zaftigvegan I never picked one up and I am Te...
1100380,1,@AstridNicole You know someone on Twitter can ...
654250,0,@Slic_Vic up under the dryer the second time a...
1411213,1,I'm still tired. I think i might go take a nap
105027,0,I think I have ran outta music to play lmao ....
...,...,...
731181,0,@Hadassah_Levy Wish I could have been your he...
1214486,1,HAPPY CHILDREN'S DAY
88650,0,Sad to be stuck in work on such a nice day!
1195649,1,Excited to see the New Moon trailer!! I love ...


In [15]:
#Calculating our labels for the first 500 rows of our dataset
results = []
labels=[]

for i in range(1,51,1):
    k = 10*(i-1)
    l = 10*i
    subset = sample_df[k:l]
    
    documents = subset["text"]

    response = text_analytics_client.analyze_sentiment(documents.to_list(), language="en")
    result = [doc for doc in response]

    results.append(result)

    azure_label = [1 if doc.confidence_scores.positive > doc.confidence_scores.negative else 0 for doc in result]

    labels.append(azure_label)

In [16]:
flat_labels = list(np.concatenate(labels).flat)

sample_df["azure_label"] = flat_labels

sample_df.head()

Unnamed: 0,label,text,azure_label
1549331,1,@zaftigvegan I never picked one up and I am Te...,1
1100380,1,@AstridNicole You know someone on Twitter can ...,1
654250,0,@Slic_Vic up under the dryer the second time a...,0
1411213,1,I'm still tired. I think i might go take a nap,0
105027,0,I think I have ran outta music to play lmao ....,1


In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

accuracy = accuracy_score(sample_df["label"], sample_df["azure_label"])
roc = roc_auc_score(sample_df["label"], sample_df["azure_label"])
f1 = f1_score(sample_df["label"], sample_df["azure_label"])
precision = precision_score(sample_df["label"], sample_df["azure_label"])
recall = accuracy_score(sample_df["label"], sample_df["azure_label"])

print("Accuracy : ", accuracy)
print("ROC AUC SCORE : ", roc)
print("F1 Score : ", f1)
print("Precision : ", precision)
print("Recall : ", recall)

Accuracy :  0.758
ROC AUC SCORE :  0.7579999999999999
F1 Score :  0.7771639042357273
Precision :  0.7201365187713311
Recall :  0.758


We have reached a **good accuracy at 0.76** with this API, although it does not surpass our BERT models.