# 0. Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import warnings

# Filter out the specific UserWarnings
warnings.filterwarnings("ignore", category=UserWarning, message="A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy")
warnings.filterwarnings("ignore", category=UserWarning, message="unable to load libtensorflow_io_plugins.so")
warnings.filterwarnings("ignore", category=UserWarning, message="file system plugins are not loaded")

In [3]:
# Accuracy metrics from Scikit-Learn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

In [4]:
# Hugging Face library
from datasets import Dataset, DatasetDict

In [5]:
# Text analysis libraries
import nltk
import spacy
import re

# 1. Load Dataset

In [6]:
# Create a function to import the data from xlxs format

def load_data(file_path):
    return pd.read_excel(file_path, header=None,names = ['ID','DATE','CHANNEL','text','sentiment','emotion','info'], skiprows=1)

df_path = '/kaggle/input/peaks-sentiment-emotion/Peaks_sentiment_emotion.xlsx'

df = load_data(df_path)

In [7]:
# To get an idea of the data
pd.set_option('display.max_colwidth', 150)
df.head()

Unnamed: 0,ID,DATE,CHANNEL,text,sentiment,emotion,info
0,cdec5aa6fae29c87d2f736322cfe70c2,2020-06-11T00:00:00Z,FEED,"Ministro Speranza: ""l'alienazione parentale è un problema relazionale Genitore-figlio come come ci Ministro della Salute Roberto Speranza.X CLOSEM...",-,-,-
1,fc3f6327b8e2840a1bf798e5c89ddf93,2020-06-11T00:00:00Z,FEED,"""Esce con l'amante"". Uccide l'ex moglie ma lei andava a curarsi - IlGiornale.it ""Esce con l'amante"". Uccide l'ex moglie ma lei andava a curarsiPri...",-,-,-
2,d4d2b0342f900b9092651d5662fdabe6,2020-06-11T00:00:00Z,FEED,"Civita di Bagnoregio, ingresso gratuito per medici e infermieri | Viterbo Post Home Tempo libero Civita di Bagnoregio, ingresso gratuito per medic...",-,-,-
3,c57ccff390c3035163513431b48f1c7b,2020-06-11T00:00:00Z,FEED,"PIPER CLUB apre virtualmente con Mix Video Show – Dea Notizie 11 Giugno 2020Non la classica diretta improntata su un singolo artista, quella ideat...",-,-,-
4,d4d71c15c8056f482ad54743842005a6,2020-06-11T00:00:00Z,FEED,"Cei: Omofobia, non serve una legge . Zan: Critiche a un testo su cui stiamo ancora lavorando - GAYNEWS Dopo l'intervento di ieri del vescovo di...",-,-,-


In [8]:
df = df[~((df['sentiment'] == 'UNPREDICTABLE') | (df['emotion'] == 'UNPREDICTABLE'))]

df = df[(df['sentiment'] != ' - ') & (df['emotion'] != ' - ') & (df['info'] != ' - ')]

df.reset_index(drop=True, inplace=True)

In [9]:
# Remove rows with NaN values
df = df.dropna()

In [10]:
# Check whether there are duplicates in the dataframe

print(df.shape)

duplicates = df.duplicated()

duplicate_count = duplicates.value_counts()
print(duplicate_count)

duplicate_lines = df[duplicates].index
for line in duplicate_lines:
    print(f"Duplicate row at line {line}:")
    print(df.loc[line])

df = df.drop_duplicates()

print(df.shape)

(26687, 7)
False    26687
Name: count, dtype: int64
(26687, 7)


In [11]:
df = df[['text', 'sentiment']]

In [12]:
def converter(df): 
    mapping = {'NEG':'negative', 'NEU':'neutral', 'POS':'positive'} 
    df['sentiment'] = df['sentiment'].replace(mapping) 
    return df

df = converter(df)

In [13]:
df.head()

Unnamed: 0,text,sentiment
0,"VIERNES A LAS 18:30La esperada desescalada del confinamiento por fin está llegando, probablemente la mayoría de las personas no pensaron que el es...",negative
1,Worth the rain #fishing #carpfishing #carp #stalking #parklakes #parklakefishing #urbanfishing #angling #commoncarp,neutral
2,#esprit migrateur #chevreuils #eatmeat #chasse #chassejusquaubout #hunt #hunter #hawke #bbq #approche #stalking #jaitoutmangé,positive
3,Morning briefing then on the bus! #fieldsportsphotographer #fieldsportsphotography #fieldsports #inthefield #shootingtimes #shooting #hunting #sta...,positive
4,-sii sempre te stessa! 🌊✨.....ᴄᴏᴍᴇ ᴀᴠᴇʀᴇ ʟᴀ ᴠᴏꜱᴛʀᴀ ᴅᴇᴅɪᴄᴀ ᴘᴇʀꜱᴏɴᴀʟɪᴢᴢᴀᴛᴀ:1.seguite questa page2.scriveteci in direct il tipo di dedica che desider...,positive


In [14]:
dataset = Dataset.from_pandas(df)

print(dataset)

Dataset({
    features: ['text', 'sentiment', '__index_level_0__'],
    num_rows: 26687
})


# 2. Spacy Prerocessing

In [15]:
!python -m spacy download it_core_news_lg

Collecting it-core-news-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_lg-3.6.0/it_core_news_lg-3.6.0-py3-none-any.whl (567.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.9/567.9 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_lg')


In [16]:
# Load the Italian language model
nlp = spacy.load('it_core_news_lg')
italian_stopwords = nlp.Defaults.stop_words


# Define a function to preprocess text
def preprocess_text(text):
    # Remove punctuation, URLs, and user mentions
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    
    # Analyze the text to create a Doc object
    doc = nlp(text)
    
    # Prerocess the text
    text = [token.lemma_ for token in doc if token.lemma_ not in italian_stopwords]
    
    return text



def preprocess_dataset(dataset):
    dataset['text'] = preprocess_text(dataset['text'])
    return dataset

dataset = dataset.map(preprocess_dataset)

  0%|          | 0/26687 [00:00<?, ?ex/s]

# 3. Sentix

In [17]:
sentix = pd.read_csv('/kaggle/input/sentix/sentix', sep='\t', names=['lemma','POS','Wordnet synset ID','positive score','negative score','polarity','intensity'])

In [18]:
sentix.head()

Unnamed: 0,lemma,POS,Wordnet synset ID,positive score,negative score,polarity,intensity
0,abile,a,1740,0.125,0.0,1.0,0.125
1,intelligente,a,1740,0.125,0.0,1.0,0.125
2,valente,a,1740,0.125,0.0,1.0,0.125
3,capace,a,1740,0.125,0.0,1.0,0.125
4,incapace,a,2098,0.0,0.75,-1.0,0.75


In [19]:
# Create a dictionary from the DataFrame
italian_lexicon = dict(zip(sentix['lemma'], sentix['polarity']))

In [20]:
def listofstrings(dataset):
    dataset['text'] = ' '.join(dataset['text'])
    return dataset

dataset = dataset.map(listofstrings)

  0%|          | 0/26687 [00:00<?, ?ex/s]

In [21]:
dataset['text'][0:2]

['VIERNES LAS 1830La esperado desescalada di il confinamiento por está llegando probablemente Mayoría de las personas pensaron que el estado de Alarma iba durar Algunas Menos ingenuas intuíamos precisamente contrario precisamente nos preguntamos cómo es que el procendere de pandemia pillado en tan buenas Fechas pensándolo en términos turísticosfeminismo feminist feminismoinclusivo Feminist feminista historiadelfeminismo Mujereshistoricas machismo patriarcado misoginia machismocotidiano capitale capitalismo desescalada coronavirusespaña COVID19',
 'Worth the rain fishing carpfishing carp stalking Parklakes parklakefishing urbanfishing angling commoncarp']

# 4. VADER 

In [22]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
sia.lexicon.update(italian_lexicon)



In [23]:
# Analyze Italian text
italian_text = "Questo è un esempio di classificazione."
sentiment_scores = sia.polarity_scores(italian_text)

# The sentiment_scores will contain the sentiment polarity (positive, negative, neutral) and a compound score.
print(sentiment_scores)

{'neg': 0.286, 'neu': 0.429, 'pos': 0.286, 'compound': 0.0}


In [24]:
def format_output(output_dict):
    polarity = "neutral"
    
    if(output_dict['compound']>= 0.05):
        polarity = "positive"
    
    elif(output_dict['compound']<= -0.05):
        polarity = "negative"
    
    return polarity

In [25]:
def vader(dataset):
    dataset['vader_prediction'] = sia.polarity_scores(dataset['text'])
    dataset['vader_prediction'] = format_output(dataset['vader_prediction'])
    return dataset

dataset = dataset.map(vader)

  0%|          | 0/26687 [00:00<?, ?ex/s]

# 5. Metrics

In [26]:
predicted_labels = dataset['vader_prediction']

label_test = dataset['sentiment']

In [27]:
print(classification_report(label_test,predicted_labels))

              precision    recall  f1-score   support

    negative       0.38      0.25      0.30     12552
     neutral       0.38      0.12      0.18      6379
    positive       0.27      0.56      0.36      7756

    accuracy                           0.31     26687
   macro avg       0.34      0.31      0.28     26687
weighted avg       0.34      0.31      0.29     26687



In [28]:
accuracy = accuracy_score(label_test, predicted_labels) # (TP+TN)/P+N i.e total number of corrected classified tweet over total number of tweets

print(accuracy)

0.31015100985498556


In [29]:
precision = precision_score(label_test, predicted_labels,average=None, labels=['negative','neutral','positive']) # TP/(TP+FP) i.e if predicted a certain class, which is the probability of being really that class?

print(precision)

[0.37542582 0.37943262 0.26740741]


In [30]:
recall = recall_score(label_test, predicted_labels,average=None, labels=['negative','neutral','positive']) # TP/(TP+FN) i.e the ability of the estimator to predict all the tweets of a given class

print(recall)

[0.25462078 0.11741652 0.55853533]


In [31]:
f1score = f1_score(label_test, predicted_labels,average=None, labels=['negative','neutral','positive']) # 2*(precision*recall)/(precision+recall)

print(f1score)

[0.30344173 0.17933677 0.36166305]
