# 0. Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import warnings

# Filter out the specific UserWarnings
warnings.filterwarnings("ignore", category=UserWarning, message="A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy")
warnings.filterwarnings("ignore", category=UserWarning, message="unable to load libtensorflow_io_plugins.so")
warnings.filterwarnings("ignore", category=UserWarning, message="file system plugins are not loaded")

In [3]:
# Hugging Face library
from transformers import AutoTokenizer, TFAutoModel, TFAutoModelForSequenceClassification

In [4]:
# Hugging Face library
from datasets import Dataset, DatasetDict

In [5]:
# Accuracy metrics and model selection from Scikit-Learn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

In [6]:
import nltk
import spacy
import re

# 1. Load Dataset

In [7]:
# Create a function to import the data from xlxs format

def load_data(file_path):
    return pd.read_excel(file_path, header=None,names = ['ID','DATE','CHANNEL','text','sentiment','emotion','info'], skiprows=1)

df_path = '/kaggle/input/peaks-sentiment-emotion/Peaks_sentiment_emotion.xlsx'

df = load_data(df_path)

In [8]:
pd.set_option('display.max_colwidth', 150)
df.head()

Unnamed: 0,ID,DATE,CHANNEL,text,sentiment,emotion,info
0,cdec5aa6fae29c87d2f736322cfe70c2,2020-06-11T00:00:00Z,FEED,"Ministro Speranza: ""l'alienazione parentale è un problema relazionale Genitore-figlio come come ci Ministro della Salute Roberto Speranza.X CLOSEM...",-,-,-
1,fc3f6327b8e2840a1bf798e5c89ddf93,2020-06-11T00:00:00Z,FEED,"""Esce con l'amante"". Uccide l'ex moglie ma lei andava a curarsi - IlGiornale.it ""Esce con l'amante"". Uccide l'ex moglie ma lei andava a curarsiPri...",-,-,-
2,d4d2b0342f900b9092651d5662fdabe6,2020-06-11T00:00:00Z,FEED,"Civita di Bagnoregio, ingresso gratuito per medici e infermieri | Viterbo Post Home Tempo libero Civita di Bagnoregio, ingresso gratuito per medic...",-,-,-
3,c57ccff390c3035163513431b48f1c7b,2020-06-11T00:00:00Z,FEED,"PIPER CLUB apre virtualmente con Mix Video Show – Dea Notizie 11 Giugno 2020Non la classica diretta improntata su un singolo artista, quella ideat...",-,-,-
4,d4d71c15c8056f482ad54743842005a6,2020-06-11T00:00:00Z,FEED,"Cei: Omofobia, non serve una legge . Zan: Critiche a un testo su cui stiamo ancora lavorando - GAYNEWS Dopo l'intervento di ieri del vescovo di...",-,-,-


In [9]:
df = df[~((df['sentiment'] == 'UNPREDICTABLE') | (df['emotion'] == 'UNPREDICTABLE'))]

df = df[(df['sentiment'] != ' - ') & (df['emotion'] != ' - ') & (df['info'] != ' - ')]

df.reset_index(drop=True, inplace=True)

In [10]:
# Remove rows with NaN values
df = df.dropna()

In [11]:
# Check whether there are duplicates in the dataframe

print(df.shape)

duplicates = df.duplicated()

duplicate_count = duplicates.value_counts()
print(duplicate_count)

duplicate_lines = df[duplicates].index
for line in duplicate_lines:
    print(f"Duplicate row at line {line}:")
    print(df.loc[line])

df = df.drop_duplicates()

print(df.shape)

(26687, 7)
False    26687
Name: count, dtype: int64
(26687, 7)


In [12]:
df.head()

Unnamed: 0,ID,DATE,CHANNEL,text,sentiment,emotion,info
0,2329826420178307312,1591956974000,INSTAGRAM,"VIERNES A LAS 18:30La esperada desescalada del confinamiento por fin está llegando, probablemente la mayoría de las personas no pensaron que el es...",NEG,TRISTEZZA,UNPREDICTABLE
1,2330062820655731480,1591985155000,INSTAGRAM,Worth the rain #fishing #carpfishing #carp #stalking #parklakes #parklakefishing #urbanfishing #angling #commoncarp,NEU,RABBIA,INFO
2,2329983078223164538,1591975649000,INSTAGRAM,#esprit migrateur #chevreuils #eatmeat #chasse #chassejusquaubout #hunt #hunter #hawke #bbq #approche #stalking #jaitoutmangé,POS,RABBIA,INFO
3,2329949009885948705,1591971587000,INSTAGRAM,Morning briefing then on the bus! #fieldsportsphotographer #fieldsportsphotography #fieldsports #inthefield #shootingtimes #shooting #hunting #sta...,POS,RABBIA,INFO
4,2330154666368548671,1591996104000,INSTAGRAM,-sii sempre te stessa! 🌊✨.....ᴄᴏᴍᴇ ᴀᴠᴇʀᴇ ʟᴀ ᴠᴏꜱᴛʀᴀ ᴅᴇᴅɪᴄᴀ ᴘᴇʀꜱᴏɴᴀʟɪᴢᴢᴀᴛᴀ:1.seguite questa page2.scriveteci in direct il tipo di dedica che desider...,POS,GIOIA,NOINFO


## 1.1 Distribution of the main features (EMOTION, SENTIMENT, INFO)

In [13]:
emotion_count = (df['emotion'].value_counts())
print(emotion_count)

emotion
NEUTRA       9296
RABBIA       7097
TRISTEZZA    6372
GIOIA        1337
SORPRESA     1335
AMORE        1184
PAURA          66
Name: count, dtype: int64


In [14]:
sentiment_count = (df['sentiment'].value_counts())
print(sentiment_count)

sentiment
NEG    12552
POS     7756
NEU     6379
Name: count, dtype: int64


In [15]:
info_count = (df['info'].value_counts(normalize=True)*100).round(2)
print(info_count)

info
NOINFO           47.94
INFO             42.32
UNPREDICTABLE     9.74
Name: proportion, dtype: float64


## 1.3 Train, Validation, Test split

In [16]:
df_train, df_testval = train_test_split(df, test_size=0.25)

df_test, df_val = train_test_split(df_testval, test_size=0.2)

In [17]:
print("Dimension of train dataset is:", df_train.shape)
print("Dimension of validation dataset is:", df_val.shape)
print("Dimension of test dataset is:", df_test.shape)

Dimension of train dataset is: (20015, 7)
Dimension of validation dataset is: (1335, 7)
Dimension of test dataset is: (5337, 7)


In [18]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)
val_dataset = Dataset.from_pandas(df_val)

# Create the DatasetDict
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset, 'validation': val_dataset})

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'DATE', 'CHANNEL', 'text', 'sentiment', 'emotion', 'info', '__index_level_0__'],
        num_rows: 20015
    })
    test: Dataset({
        features: ['ID', 'DATE', 'CHANNEL', 'text', 'sentiment', 'emotion', 'info', '__index_level_0__'],
        num_rows: 5337
    })
    validation: Dataset({
        features: ['ID', 'DATE', 'CHANNEL', 'text', 'sentiment', 'emotion', 'info', '__index_level_0__'],
        num_rows: 1335
    })
})


# 2. Spacy Preprocessing

In [19]:
!python -m spacy download it_core_news_lg

Collecting it-core-news-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_lg-3.6.0/it_core_news_lg-3.6.0-py3-none-any.whl (567.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.9/567.9 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: it-core-news-lg
Successfully installed it-core-news-lg-3.6.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_lg')


In [20]:
# Load the Italian language model
nlp = spacy.load('it_core_news_lg')
italian_stopwords = nlp.Defaults.stop_words


# Define a function to preprocess text
def preprocess_text(text):
    # Remove punctuation, URLs, and user mentions
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    
    # Analyze the text to create a Doc object
    doc = nlp(text)
    
    # Prerocess the text
    text = [token.lemma_ for token in doc if token.lemma_ not in italian_stopwords]
    
    return text



def preprocess_dataset(dataset):
    dataset['text'] = preprocess_text(dataset['text'])
    return dataset

dataset = dataset.map(preprocess_dataset)

  0%|          | 0/20015 [00:00<?, ?ex/s]

  0%|          | 0/5337 [00:00<?, ?ex/s]

  0%|          | 0/1335 [00:00<?, ?ex/s]

# 3. Feature Extraction

## 3.1 TF-IDF

In [21]:
X_train = np.array(dataset['train']['text'])

Y_train_emotion = np.array(dataset['train']['emotion'])
Y_train_sentiment = np.array(dataset['train']['sentiment'])
Y_train_info = np.array(dataset['train']['info'])



X_test = np.array(dataset['test']['text'])

Y_test_emotion = np.array(dataset['test']['emotion'])
Y_test_sentiment = np.array(dataset['test']['sentiment'])
Y_test_info = np.array(dataset['test']['info'])

  X_train = np.array(dataset['train']['text'])
  X_test = np.array(dataset['test']['text'])


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Flatten the list of sentences into a list of strings
X_train_flat = [' '.join(sentence) for sentence in X_train]
X_test_flat = [' '.join(sentence) for sentence in X_test]

vectorizer = TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train_flat)
tf_x_test = vectorizer.transform(X_test_flat)

# 4. Support Vector Machine

In [23]:
# LinearSVC
from sklearn.svm import SVC
svm_emotion = SVC(random_state=0)
svm_sentiment = SVC(random_state=0)
svm_info = SVC(random_state=0)

In [24]:
svm_emotion.fit(tf_x_train,Y_train_emotion)
y_test_svm_emotion=svm_emotion.predict(tf_x_test)

svm_sentiment.fit(tf_x_train,Y_train_sentiment)
y_test_svm_sentiment=svm_sentiment.predict(tf_x_test)

svm_info.fit(tf_x_train,Y_train_info)
y_test_svm_info=svm_info.predict(tf_x_test)

# 5. Naive Bayes

In [25]:
from sklearn.naive_bayes import MultinomialNB
nb_emotion = MultinomialNB()
nb_sentiment = MultinomialNB()
nb_info = MultinomialNB()

In [26]:
nb_emotion.fit(tf_x_train,Y_train_emotion)
y_test_nb_emotion=nb_emotion.predict(tf_x_test)

nb_sentiment.fit(tf_x_train,Y_train_sentiment)
y_test_nb_sentiment=nb_sentiment.predict(tf_x_test)

nb_info.fit(tf_x_train,Y_train_info)
y_test_nb_info=nb_info.predict(tf_x_test)

# 6. Metrics

## 6.0 Classification Report

In [27]:
report_emotion_svm = classification_report(Y_test_emotion, y_test_svm_emotion)
report_emotion_nb = classification_report(Y_test_emotion, y_test_nb_emotion)

# Print the reports
print("Emotion Classification Report SVM:")
print(report_emotion_svm)

print("\nEmotion Classification Report Naive Bayes:")
print(report_emotion_nb)

Emotion Classification Report SVM:
              precision    recall  f1-score   support

       AMORE       0.94      0.75      0.84       220
       GIOIA       0.82      0.57      0.67       267
      NEUTRA       0.77      0.95      0.85      1797
       PAURA       1.00      0.27      0.43        11
      RABBIA       0.97      0.90      0.93      1469
    SORPRESA       0.95      0.64      0.76       265
   TRISTEZZA       0.96      0.87      0.91      1308

    accuracy                           0.87      5337
   macro avg       0.91      0.71      0.77      5337
weighted avg       0.89      0.87      0.87      5337


Emotion Classification Report Naive Bayes:
              precision    recall  f1-score   support

       AMORE       0.97      0.67      0.79       220
       GIOIA       0.74      0.47      0.57       267
      NEUTRA       0.73      0.93      0.82      1797
       PAURA       0.00      0.00      0.00        11
      RABBIA       0.97      0.87      0.92      1469

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
report_sentiment_svm = classification_report(Y_test_sentiment, y_test_svm_sentiment)
report_sentiment_nb = classification_report(Y_test_sentiment, y_test_nb_sentiment)

# Print the reports
print("Sentiment Classification Report SVM:")
print(report_sentiment_svm)

print("\nSentiment Classification Report Naive Bayes:")
print(report_sentiment_nb)

Sentiment Classification Report SVM:
              precision    recall  f1-score   support

         NEG       0.97      0.93      0.95      2591
         NEU       0.81      0.76      0.78      1250
         POS       0.77      0.87      0.81      1496

    accuracy                           0.87      5337
   macro avg       0.85      0.85      0.85      5337
weighted avg       0.88      0.87      0.87      5337


Sentiment Classification Report Naive Bayes:
              precision    recall  f1-score   support

         NEG       0.88      0.94      0.91      2591
         NEU       0.79      0.68      0.73      1250
         POS       0.80      0.81      0.80      1496

    accuracy                           0.84      5337
   macro avg       0.82      0.81      0.81      5337
weighted avg       0.84      0.84      0.84      5337



In [42]:
report_info_svm = classification_report(Y_test_info, y_test_svm_info)
report_info_nb = classification_report(Y_test_info, y_test_nb_info)

# Print the reports
print("Info Classification Report SVM:")
print(report_info_svm)

print("\nInfo Classification Report Naive Bayes:")
print(report_info_nb)

Info Classification Report SVM:
               precision    recall  f1-score   support

         INFO       0.92      0.89      0.90      2305
       NOINFO       0.84      0.95      0.89      2542
UNPREDICTABLE       0.86      0.42      0.57       490

     accuracy                           0.87      5337
    macro avg       0.87      0.75      0.79      5337
 weighted avg       0.88      0.87      0.87      5337


Info Classification Report Naive Bayes:
               precision    recall  f1-score   support

         INFO       0.92      0.88      0.90      2305
       NOINFO       0.83      0.95      0.89      2542
UNPREDICTABLE       0.91      0.37      0.53       490

     accuracy                           0.87      5337
    macro avg       0.89      0.73      0.77      5337
 weighted avg       0.88      0.87      0.86      5337



## 6.1 Emotion

In [30]:
accuracy_svm_emotion = accuracy_score(Y_test_emotion, y_test_svm_emotion) # (TP+TN)/P+N i.e total number of corrected classified tweet over total number of tweets

accuracy_nb_emotion = accuracy_score(Y_test_emotion, y_test_nb_emotion)

print("Support Vector Machine accuracy:", accuracy_svm_emotion)
print("Naive Bayes accuracy:", accuracy_nb_emotion)

Support Vector Machine accuracy: 0.8744613078508525
Naive Bayes accuracy: 0.8388607832115421


In [31]:
precision_svm_emotion = precision_score(Y_test_emotion, y_test_svm_emotion,average=None, labels=['TRISTEZZA','GIOIA','AMORE','RABBIA','PAURA','SORPRESA','NEUTRA']) # TP/(TP+FP) i.e if predicted a certain class, which is the probability of being really that class?

precision_nb_emotion = precision_score(Y_test_emotion, y_test_nb_emotion,average=None, labels=['TRISTEZZA','GIOIA','AMORE','RABBIA','PAURA','SORPRESA','NEUTRA'])

print("Support Vector Machine precision:", precision_svm_emotion)
print("Naive Bayes precision:", precision_nb_emotion)

Support Vector Machine precision: [0.95568562 0.81818182 0.93785311 0.96985294 1.         0.9494382
 0.76654741]
Naive Bayes precision: [0.89494787 0.74404762 0.96710526 0.9667171  0.         0.99242424
 0.7253886 ]


  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
recall_svm_emotion = recall_score(Y_test_emotion, y_test_svm_emotion,average=None, labels=['TRISTEZZA','GIOIA','AMORE','RABBIA','PAURA','SORPRESA','NEUTRA']) # TP/(TP+FN) i.e the ability of the estimator to predict all the tweets of a given class

recall_nb_emotion = recall_score(Y_test_emotion, y_test_nb_emotion,average=None, labels=['TRISTEZZA','GIOIA','AMORE','RABBIA','PAURA','SORPRESA','NEUTRA'])


print("Support Vector Machine recall:", recall_svm_emotion)
print("Naive Bayes recall:", recall_nb_emotion)

Support Vector Machine recall: [0.87385321 0.57303371 0.75454545 0.89788972 0.27272727 0.63773585
 0.95381191]
Naive Bayes recall: [0.85321101 0.46816479 0.66818182 0.86997958 0.         0.49433962
 0.93489149]


In [33]:
f1score_svm_emotion = f1_score(Y_test_emotion, y_test_svm_emotion,average=None, labels=['TRISTEZZA','GIOIA','AMORE','RABBIA','PAURA','SORPRESA','NEUTRA']) # 2*(precision*recall)/(precision+recall)

f1score_nb_emotion = f1_score(Y_test_emotion, y_test_nb_emotion,average=None, labels=['TRISTEZZA','GIOIA','AMORE','RABBIA','PAURA','SORPRESA','NEUTRA'])


print("Support Vector Machine f1-score:", f1score_svm_emotion)
print("Naive Bayes f1-score:", f1score_nb_emotion)

Support Vector Machine f1-score: [0.9129393  0.67400881 0.83627204 0.93248498 0.42857143 0.76297968
 0.8499876 ]
Naive Bayes f1-score: [0.87358121 0.57471264 0.79032258 0.91580079 0.         0.65994962
 0.81692195]


## 6.2 Sentiment

In [34]:
accuracy_svm_sentiment = accuracy_score(Y_test_sentiment, y_test_svm_sentiment) 

accuracy_nb_sentiment = accuracy_score(Y_test_sentiment, y_test_nb_sentiment)

print("Support Vector Machine accuracy:", accuracy_svm_sentiment)
print("Naive Bayes accuracy:", accuracy_nb_sentiment)

Support Vector Machine accuracy: 0.8697770282930485
Naive Bayes accuracy: 0.8397976391231029


In [35]:
precision_svm_sentiment = precision_score(Y_test_sentiment, y_test_svm_sentiment,average=None, labels=['NEG','NEU','POS']) 

precision_nb_sentiment = precision_score(Y_test_sentiment, y_test_nb_sentiment,average=None, labels=['NEG','NEU','POS'])

print("Support Vector Machine precision:", precision_svm_sentiment)
print("Naive Bayes precision:", precision_nb_sentiment)

Support Vector Machine precision: [0.9704573  0.8087105  0.76519174]
Naive Bayes precision: [0.88239564 0.79248826 0.79564931]


In [36]:
recall_svm_sentiment = recall_score(Y_test_sentiment, y_test_svm_sentiment,average=None, labels=['NEG','NEU','POS'])

recall_nb_sentiment = recall_score(Y_test_sentiment, y_test_nb_sentiment,average=None, labels=['NEG','NEU','POS'])

print("Support Vector Machine recall:", recall_svm_sentiment)
print("Naive Bayes recall:", recall_nb_sentiment)

Support Vector Machine recall: [0.92551139 0.7576     0.86697861]
Naive Bayes recall: [0.93824778 0.6752     0.80681818]


In [37]:
f1score_svm_sentiment = f1_score(Y_test_sentiment, y_test_svm_sentiment,average=None, labels=['NEG','NEU','POS']) # 2*(precision*recall)/(precision+recall)

f1score_nb_sentiment = f1_score(Y_test_sentiment, y_test_nb_sentiment,average=None, labels=['NEG','NEU','POS'])


print("Support Vector Machine f1-score:", f1score_svm_sentiment)
print("Naive Bayes f1-score:", f1score_nb_sentiment)

Support Vector Machine f1-score: [0.9474516  0.78232135 0.81291131]
Naive Bayes f1-score: [0.90946502 0.72915767 0.80119482]


## 6.3 Info

In [38]:
accuracy_svm_info = accuracy_score(Y_test_info, y_test_svm_info)

accuracy_nb_info = accuracy_score(Y_test_info, y_test_nb_info)

print("Support Vector Machine accuracy:", accuracy_svm_info)
print("Naive Bayes accuracy:", accuracy_nb_info)

Support Vector Machine accuracy: 0.8742739366685404
Naive Bayes accuracy: 0.8690275435637999


In [39]:
precision_svm_info = precision_score(Y_test_info, y_test_svm_info,average=None, labels=['INFO','NOINFO','UNPREDICTABLE']) 

precision_nb_info = precision_score(Y_test_info, y_test_nb_info,average=None, labels=['INFO','NOINFO','UNPREDICTABLE'])

print("Support Vector Machine precision:", precision_svm_info)
print("Naive Bayes precision:", precision_nb_info)

Support Vector Machine precision: [0.92110009 0.83976364 0.8553719 ]
Naive Bayes precision: [0.91565178 0.83059548 0.91414141]


In [40]:
recall_svm_info = recall_score(Y_test_info, y_test_svm_info,average=None, labels=['INFO','NOINFO','UNPREDICTABLE']) 

recall_nb_info = recall_score(Y_test_info, y_test_nb_info,average=None, labels=['INFO','NOINFO','UNPREDICTABLE'])

print("Support Vector Machine recall:", recall_svm_info)
print("Naive Bayes recall:", recall_nb_info)

Support Vector Machine recall: [0.88633406 0.95043273 0.42244898]
Naive Bayes recall: [0.88069414 0.95476003 0.36938776]


In [41]:
f1score_svm_info = f1_score(Y_test_info, y_test_svm_info,average=None, labels=['INFO','NOINFO','UNPREDICTABLE']) # 2*(precision*recall)/(precision+recall)

f1score_nb_info = f1_score(Y_test_info, y_test_nb_info,average=None, labels=['INFO','NOINFO','UNPREDICTABLE'])


print("Support Vector Machine f1-score:", f1score_svm_info)
print("Naive Bayes f1-score:", f1score_nb_info)

Support Vector Machine f1-score: [0.90338271 0.89167743 0.56557377]
Naive Bayes f1-score: [0.89783282 0.88836018 0.52616279]
