# 0. Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import warnings

# Filter out the specific UserWarnings
warnings.filterwarnings("ignore", category=UserWarning, message="A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy")
warnings.filterwarnings("ignore", category=UserWarning, message="unable to load libtensorflow_io_plugins.so")
warnings.filterwarnings("ignore", category=UserWarning, message="file system plugins are not loaded")

In [None]:
# Hugging Face library
from transformers import AutoTokenizer, TFAutoModel

In [None]:
# Hugging Face library
from datasets import Dataset, DatasetDict

In [None]:
import re
import nltk

# 1. Load Datasets

In [None]:
# Create a function to import the data from csv format
def load_data(file_path):
    """Read a headerless, tab-separated file into a two-column DataFrame.

    Args:
        file_path: Location of the file, passed straight to ``pandas.read_csv``.

    Returns:
        A DataFrame whose columns are named 'emotion' and 'text', in that order.
    """
    column_names = ['emotion', 'text']
    frame = pd.read_csv(file_path, sep='\t', header=None, names=column_names)
    return frame


# Locations of the three TSV splits
train_path = '/kaggle/input/emotion/train-emotion-all.tsv'
test_path = '/kaggle/input/emotion/test-emotion-all.tsv'
val_path = '/kaggle/input/emotion/valid-emotion-all.tsv'

# Load every split with load_data, in train / test / validation order
df_train, df_test, df_val = (
    load_data(path) for path in (train_path, test_path, val_path)
)

In [None]:
# Raise the displayed column width to 150 characters so longer tweet texts
# are shown in full up to that length
pd.set_option('display.max_colwidth', 150)
# Preview the first rows of the training split
df_train.head()

In [None]:
# Wrap each pandas split in a Hugging Face Dataset
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_test, df_val)
)

# Group the three splits under their conventional names
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': val_dataset,
})

print(dataset)

## 1.5 Check Duplicates

In [None]:
def _first_occurrence_indices(texts):
    """Return the row positions of the first occurrence of each distinct text.

    Args:
        texts: Iterable of hashable values (the 'text' column of one split).

    Returns:
        List of integer positions, in ascending order, one per distinct value.
    """
    seen = set()
    keep = []
    for position, text in enumerate(texts):
        if text not in seen:
            seen.add(text)
            keep.append(position)
    return keep


# De-duplicated version of every split, keyed by split name
updated_datasets = {}

# Check for and remove duplicates in each split
for split in dataset.keys():
    split_data = dataset[split]
    text_column = split_data['text']

    # Select the rows that are actually unique. Selecting range(n_unique)
    # would only keep the first n_unique rows: duplicates among them would
    # survive and distinct rows at the end of the split would be dropped.
    keep_indices = _first_occurrence_indices(text_column)
    updated_datasets[split] = split_data.select(keep_indices)

    # Print the number of removed duplicates
    duplicate_count = len(text_column) - len(keep_indices)
    print(f"Duplicates removed in {split} split: {duplicate_count}\n")

# Update the dataset dictionary with the filtered datasets
dataset.update(updated_datasets)

# Print the updated dataset information
for split in dataset.keys():
    split_data = dataset[split]
    print(f"{split}: {len(split_data)} rows")

print(dataset)

# 2. Emotion Distribution

In [None]:
from collections import Counter

# Per-split label counts and the list of labels seen in each split
emotion_counts = {}
emotion_categories = {}
for split in dataset.keys():
    emotion_count = Counter(dataset[split]['emotion'])
    emotion_counts[split] = emotion_count
    emotion_categories[split] = list(emotion_count.keys())

# Union of the labels across all splits, sorted so every subplot shares the
# same bar order
all_emotion_categories = sorted(set().union(*emotion_categories.values()))

# One subplot per split. The count follows the dataset instead of being
# hard-coded; squeeze=False keeps `axs` indexable even with a single split.
n_splits = len(emotion_counts)
fig, axs = plt.subplots(1, n_splits, figsize=(5 * n_splits, 5), squeeze=False)
axs = axs.ravel()

# Plot the label counts of each split; labels missing from a split count as 0
for i, split in enumerate(emotion_counts):
    y = [emotion_counts[split].get(category, 0) for category in all_emotion_categories]
    axs[i].bar(all_emotion_categories, y)

    axs[i].set_title(f"Emotion Distribution ({split} split)")
    axs[i].set_xlabel("Emotion category")
    axs[i].set_ylabel("Number of tweets")

    # Smaller tick labels so the category names fit under the bars
    axs[i].tick_params(axis='x', labelsize=7)

plt.tight_layout()
plt.show()

# 3. WordCloud

In [None]:
!python -m spacy download it_core_news_md

In [None]:
import spacy

# Load the Italian spaCy pipeline fetched by the download command above;
# `nlp` is used further down for POS tagging and lemmatization
nlp = spacy.load("it_core_news_md")

In [None]:
from nltk.corpus import stopwords
# Italian stopword list, stored as a set for the membership test done in
# preprocess_text.
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus to be
# available locally; no nltk.download call is visible in this notebook, so
# confirm the runtime environment already provides it.
italian_stopwords = set(stopwords.words('italian'))

from nltk.tokenize import word_tokenize 

# Define a function to preprocess text
def preprocess_text(text):
    """Reduce a tweet to its lemmatized, non-stopword nouns.

    URLs and @mentions are removed before punctuation. Stripping punctuation
    first deletes the '@' marker, so the mention pattern can no longer match
    and usernames would leak into the output as ordinary words.

    Args:
        text: Tweet text as a string.

    Returns:
        A single space-separated string with the lemmas of the tokens that
        the module-level ``nlp`` pipeline tags as NOUN, excluding lemmas
        whose lowercase form is in ``italian_stopwords``.
    """
    # Drop URLs and user mentions while their markers are still intact
    # ('http\S+' already covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)

    # Remove the remaining punctuation (anything that is not a word
    # character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # Tag and lemmatize with spaCy
    doc = nlp(text)

    # Keep noun lemmas only, skipping stopwords
    lemmatized_nouns = [
        token.lemma_
        for token in doc
        if token.pos_ == 'NOUN' and token.lemma_.lower() not in italian_stopwords
    ]

    return ' '.join(lemmatized_nouns)





def preprocess_dataset(dataset):
    """Replace the 'text' field of one example with its preprocessed form.

    Despite the parameter name, this is called once per example by
    ``dataset.map``, so ``dataset`` here is a single mapping with a 'text'
    entry. The mapping is modified in place and also returned.

    Args:
        dataset: Mapping holding at least a 'text' string.

    Returns:
        The same mapping, with 'text' set to ``preprocess_text`` of its
        previous value.
    """
    cleaned = preprocess_text(dataset['text'])
    dataset['text'] = cleaned
    return dataset

# Run preprocess_dataset over the examples of the dataset.
# NOTE(review): this rebinds `dataset`, so the original tweet text is no
# longer reachable through that name, and re-running this cell preprocesses
# text that has already been preprocessed.
dataset = dataset.map(preprocess_dataset)

In [None]:
from wordcloud import WordCloud

In [None]:
# Build a word cloud from the text of every split

# Gather the texts of the 'train', 'test' and 'validation' splits, in that order
combined_text = []
for split in ('train', 'test', 'validation'):
    combined_text += list(dataset[split]['text'])

# WordCloud.generate expects one string, so join the texts with spaces
text = ' '.join(combined_text)

wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(text)

# Render the cloud as an image, without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# 4. Checking word distribution among classes

In [None]:
# One list of tweet texts per emotion label
tristezza_text, gioia_text, amore_text = [], [], []
rabbia_text, paura_text, sorpresa_text, neutra_text = [], [], [], []

# Dispatch table: emotion label -> the list collecting its texts
_text_bucket_for = {
    'TRISTEZZA': tristezza_text,
    'GIOIA': gioia_text,
    'AMORE': amore_text,
    'RABBIA': rabbia_text,
    'PAURA': paura_text,
    'SORPRESA': sorpresa_text,
    'NEUTRA': neutra_text,
}

for split in dataset.keys():
    for text, emotion in zip(dataset[split]['text'], dataset[split]['emotion']):
        # Rows with a label outside the table are skipped
        bucket = _text_bucket_for.get(emotion)
        if bucket is not None:
            bucket.append(text)


In [None]:
# Nouns found in the tweets of each emotion label
tristezza_nouns, gioia_nouns, amore_nouns = [], [], []
rabbia_nouns, paura_nouns, sorpresa_nouns, neutra_nouns = [], [], [], []

# Dispatch table: emotion label -> the list collecting its nouns
_noun_bucket_for = {
    'TRISTEZZA': tristezza_nouns,
    'GIOIA': gioia_nouns,
    'AMORE': amore_nouns,
    'RABBIA': rabbia_nouns,
    'PAURA': paura_nouns,
    'SORPRESA': sorpresa_nouns,
    'NEUTRA': neutra_nouns,
}

# Tag every text with spaCy and file its nouns under the row's emotion
for split in dataset.keys():
    for text, emotion in zip(dataset[split]['text'], dataset[split]['emotion']):
        doc = nlp(text)
        nouns = [token.text for token in doc if token.pos_ == 'NOUN']

        # Rows with a label outside the table are skipped
        bucket = _noun_bucket_for.get(emotion)
        if bucket is not None:
            bucket.extend(nouns)



In [None]:
from nltk.probability import FreqDist


# Frequency distribution of the nouns collected for each emotion
(
    fdist_tristezza,
    fdist_gioia,
    fdist_amore,
    fdist_rabbia,
    fdist_paura,
    fdist_sorpresa,
    fdist_neutra,
) = (
    FreqDist(nouns)
    for nouns in (
        tristezza_nouns,
        gioia_nouns,
        amore_nouns,
        rabbia_nouns,
        paura_nouns,
        sorpresa_nouns,
        neutra_nouns,
    )
)

In [None]:
# Number of most frequent nouns kept per emotion
TOP_N = 15


def _unzip_top_words(fdist, n=TOP_N):
    """Split the ``n`` most common (word, count) pairs of ``fdist`` in two.

    Args:
        fdist: Object exposing ``most_common(n)`` that returns
            (word, count) pairs.
        n: Maximum number of pairs to take.

    Returns:
        A ``(words, counts)`` pair of tuples. Both are empty when ``fdist``
        has no entries; a bare ``zip(*pairs)`` unpacking would raise
        ValueError in that case.
    """
    pairs = fdist.most_common(n)
    if not pairs:
        return (), ()
    words, counts = zip(*pairs)
    return words, counts


# Most common words and their frequencies for each emotion category
top_words_tristezza, frequencies_tristezza = _unzip_top_words(fdist_tristezza)
top_words_gioia, frequencies_gioia = _unzip_top_words(fdist_gioia)
top_words_amore, frequencies_amore = _unzip_top_words(fdist_amore)
top_words_rabbia, frequencies_rabbia = _unzip_top_words(fdist_rabbia)
top_words_paura, frequencies_paura = _unzip_top_words(fdist_paura)
top_words_sorpresa, frequencies_sorpresa = _unzip_top_words(fdist_sorpresa)
top_words_neutra, frequencies_neutra = _unzip_top_words(fdist_neutra)

# (title label, words, frequencies) for each emotion, in plotting order
emotion_top_words = [
    ('Tristezza', top_words_tristezza, frequencies_tristezza),
    ('Gioia', top_words_gioia, frequencies_gioia),
    ('Amore', top_words_amore, frequencies_amore),
    ('Rabbia', top_words_rabbia, frequencies_rabbia),
    ('Paura', top_words_paura, frequencies_paura),
    ('Sorpresa', top_words_sorpresa, frequencies_sorpresa),
    ('Neutra', top_words_neutra, frequencies_neutra),
]

# One horizontal bar chart per emotion, stacked vertically with a shared x-axis
fig, axes = plt.subplots(len(emotion_top_words), 1, figsize=(8, 30), sharex=True)

for ax, (label, words, frequencies) in zip(axes, emotion_top_words):
    ax.barh(words, frequencies)
    ax.set_xlabel('Frequency')
    ax.set_ylabel('Words')
    ax.set_title(f'Top 15 {label} Words')
    # Flip the y-axis so the first (most frequent) word is drawn at the top
    ax.invert_yaxis()


plt.show()


# 5. Similarity Index

In [None]:
from itertools import combinations

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Tweets grouped by emotion label, taken from the per-emotion text lists built
# in section 4. The names `negative` / `neutral` / `positive` used previously
# are not defined in this notebook. Emotions without any tweet are left out
# so that no empty block is averaged.
emotion_texts = {
    label: texts
    for label, texts in (
        ('TRISTEZZA', tristezza_text),
        ('GIOIA', gioia_text),
        ('AMORE', amore_text),
        ('RABBIA', rabbia_text),
        ('PAURA', paura_text),
        ('SORPRESA', sorpresa_text),
        ('NEUTRA', neutra_text),
    )
    if texts
}

# Create a TF-IDF vectorizer and fit it on all tweets
tfidf_vectorizer = TfidfVectorizer()
all_tweets = [tweet for texts in emotion_texts.values() for tweet in texts]
tfidf_matrix = tfidf_vectorizer.fit_transform(all_tweets)

# Rows of tfidf_matrix follow the order of all_tweets, so each emotion owns
# one contiguous slice of rows
emotion_rows = {}
offset = 0
for label, texts in emotion_texts.items():
    emotion_rows[label] = tfidf_matrix[offset:offset + len(texts)]
    offset += len(texts)

# Mean of the pairwise cosine similarities between the tweets of two emotions,
# for every unordered pair of emotions
avg_cosine = {
    (first, second): np.mean(cosine_similarity(emotion_rows[first], emotion_rows[second]))
    for first, second in combinations(emotion_rows, 2)
}

print("Average Cosine Similarity:")
for (first, second), value in avg_cosine.items():
    print(f"{first} vs. {second}:", value)