# 0. Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import warnings

# Filter out the specific UserWarnings
warnings.filterwarnings("ignore", category=UserWarning, message="A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy")
warnings.filterwarnings("ignore", category=UserWarning, message="unable to load libtensorflow_io_plugins.so")
warnings.filterwarnings("ignore", category=UserWarning, message="file system plugins are not loaded")

In [None]:
# Hugging Face library
from transformers import AutoTokenizer, TFAutoModel

In [None]:
# Hugging Face library
from datasets import Dataset, DatasetDict

In [None]:
import re
import nltk

# 1. Load Datasets

In [None]:
# Loader shared by the train / test / validation splits
def load_data(file_path):
    """Read a header-less, tab-separated file into a two-column DataFrame.

    Args:
        file_path: Location of the TSV file to read.

    Returns:
        pandas.DataFrame with the columns 'sentiment' and 'text', in that order.
    """
    column_names = ['sentiment', 'text']
    frame = pd.read_csv(file_path, sep='\t', header=None, names=column_names)
    return frame


# Kaggle input locations of the train / test / validation TSV files
train_path = '/kaggle/input/sentiment/train_bal_vdg_27_11.tsv'
test_path = '/kaggle/input/sentiment/test_bal_vdg_27_11.tsv'
val_path = '/kaggle/input/sentiment/valid_bal_vdg_27_11.tsv'

# Read the three splits with the shared loader, in the same order as the paths above
df_train, df_test, df_val = [
    load_data(split_path) for split_path in (train_path, test_path, val_path)
]

In [None]:
# Allow up to 150 characters per cell so the preview shows most of each text,
# then display the first rows of the training split
pd.options.display.max_colwidth = 150
df_train.head()

In [None]:
# Convert each pandas split into a Hugging Face Dataset
train_dataset, test_dataset, val_dataset = [
    Dataset.from_pandas(frame) for frame in (df_train, df_test, df_val)
]

# Group the three splits under their names in a single DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': val_dataset,
})

print(dataset)

## 1.5 Check and Remove Duplicates

In [None]:
# Remove rows whose 'text' repeats an earlier row of the same split.
# Duplicates are detected within each split only (the seen-set is reset per split),
# and the first occurrence of every text is the one that is kept.
updated_datasets = {}

for split in dataset.keys():
    split_data = dataset[split]

    # Pull the 'text' column once so it is not re-read for every row
    text_column = split_data['text']

    seen_texts = set()
    keep_indices = []

    # Record the row index of each first occurrence. Selecting by these indices
    # (rather than by range(number_of_unique_texts)) is what actually drops the
    # duplicated rows instead of simply truncating the split.
    for row_index, text in enumerate(text_column):
        if text not in seen_texts:
            seen_texts.add(text)
            keep_indices.append(row_index)

    # Build the de-duplicated split; row order is preserved
    updated_datasets[split] = split_data.select(keep_indices)

    # Print the number of removed duplicates
    duplicate_count = len(text_column) - len(keep_indices)
    print(f"Duplicates removed in {split} split: {duplicate_count}\n")

# Replace every split of the DatasetDict with its de-duplicated version
dataset.update(updated_datasets)

# Report the resulting size of each split
for split in dataset.keys():
    split_data = dataset[split]
    print(f"{split}: {len(split_data['text'])} rows")

print(dataset)

# 2. Sentiment Distribution

In [None]:
from collections import Counter

# Label frequencies and the list of observed labels, keyed by split name
sentiment_counts = {}
sentiment_categories = {}

for split in dataset.keys():
    label_frequencies = Counter(dataset[split]['sentiment'])
    sentiment_counts[split] = label_frequencies
    sentiment_categories[split] = list(label_frequencies.keys())

# Sorted union of the labels seen in any split; shared x-axis for all panels
all_sentiment_categories = sorted(set().union(*sentiment_categories.values()))

# One bar chart per split, laid out side by side
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
x = np.array(all_sentiment_categories)

for i, split in enumerate(dataset.keys()):
    counts_for_split = sentiment_counts[split]
    # A label missing from this split is plotted with a count of 0
    y = np.array([counts_for_split.get(category, 0) for category in x])

    panel = axs[i]
    panel.bar(x, y)
    panel.set_title(f"Sentiment Distribution ({split} split)")
    panel.set_xlabel("Sentiment category")
    panel.set_ylabel("Number of tweets")

plt.tight_layout()
plt.show()

# 3. WordCloud

In [None]:
!python -m spacy download it_core_news_md

In [None]:
import spacy

# Load the Italian spaCy pipeline "it_core_news_md" (fetched by the download cell above).
# The resulting `nlp` object is the tagger/lemmatizer used by preprocess_text and by the
# per-class noun extraction further down.
nlp = spacy.load("it_core_news_md")

In [None]:
from nltk.corpus import stopwords
# Italian stopwords from NLTK, held as a set; preprocess_text checks lowercased lemmas against it.
# NOTE(review): no nltk.download('stopwords') call is visible, so this presumably relies on the
# corpus already being installed in the environment - confirm.
italian_stopwords = set(stopwords.words('italian'))

from nltk.tokenize import word_tokenize 

# Define a function to preprocess text
def preprocess_text(text):
    """Reduce a tweet to its lemmatized, stopword-free nouns.

    URLs and @mentions are removed first, while the punctuation that identifies
    them ('@', '://', '.') is still present; only afterwards is the remaining
    punctuation stripped. Stripping punctuation first would turn '@user' into
    'user', which the mention pattern can no longer match.

    Args:
        text: Raw tweet text. ``None`` is treated as an empty tweet.

    Returns:
        str: Space-separated lemmas of the tokens tagged NOUN by ``nlp``,
        excluding lemmas whose lowercase form is in ``italian_stopwords``.
    """
    if text is None:
        return ''

    # 1) URLs and user mentions, while their marker characters still exist
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)

    # 2) Everything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)

    # Tag and lemmatize with spaCy
    doc = nlp(text)

    # Keep noun lemmas only, dropping stopwords in the same pass
    lemmatized_nouns = [
        token.lemma_
        for token in doc
        if token.pos_ == 'NOUN' and token.lemma_.lower() not in italian_stopwords
    ]

    return ' '.join(lemmatized_nouns)





def preprocess_dataset(dataset):
    """Preprocess the 'text' field of a single example in place.

    Despite the parameter name, the argument is one mapping-like example
    (it is indexed with the 'text' key), as passed by ``map``.

    Args:
        dataset: Example exposing a 'text' entry.

    Returns:
        The same object, with 'text' replaced by ``preprocess_text`` of its
        previous value.
    """
    cleaned_text = preprocess_text(dataset['text'])
    dataset['text'] = cleaned_text
    return dataset

# Apply preprocess_dataset through DatasetDict.map and rebind `dataset` to the result.
# NOTE(review): this overwrites the raw tweets, so every later cell sees the noun-only
# text, and re-running this cell preprocesses already-preprocessed text.
dataset = dataset.map(preprocess_dataset)

In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud over the text of all three splits

# Gather the 'text' entries in train / test / validation order
combined_text = [
    tweet
    for split_name in ('train', 'test', 'validation')
    for tweet in dataset[split_name]['text']
]

# WordCloud.generate takes one string, so join everything with spaces
text = ' '.join(combined_text)

cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

# Render the cloud with matplotlib, without axis ticks or labels
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# 4. Checking word distribution among classes

In [None]:
# One list of texts per sentiment label
neg_text, neu_text, pos_text = [], [], []

# Label -> destination list; rows with any other label are skipped
texts_by_label = {'NEG': neg_text, 'NEU': neu_text, 'POS': pos_text}

for split in dataset.keys():
    for text, label in zip(dataset[split]['text'], dataset[split]['sentiment']):
        bucket = texts_by_label.get(label)
        if bucket is not None:
            bucket.append(text)

In [None]:
def _extract_nouns(sentences):
    """Return the surface form of every token tagged NOUN, across all `sentences`.

    The texts are streamed through ``nlp.pipe`` so spaCy can process them in
    batches instead of one call per sentence. Order follows the input order.

    Args:
        sentences: Iterable of strings to analyse.

    Returns:
        list[str]: ``token.text`` of each NOUN token.
    """
    return [
        token.text
        for doc in nlp.pipe(sentences)
        for token in doc
        if token.pos_ == 'NOUN'
    ]


# All nouns contained in the tweets of each category
negative = _extract_nouns(neg_text)
neutral = _extract_nouns(neu_text)
positive = _extract_nouns(pos_text)


In [None]:
from nltk.probability import FreqDist


# One frequency distribution per sentiment class, built from that class's noun list
fdist_neg, fdist_neu, fdist_pos = [
    FreqDist(class_words) for class_words in (negative, neutral, positive)
]

In [None]:
# Number of most frequent words shown per class; also used in the panel titles
# so the title can no longer disagree with what is plotted.
TOP_N = 15


def _top_words(fdist, n):
    """Split the `n` most common entries of `fdist` into (words, frequencies).

    Returns two empty tuples when the distribution has no entries, where a bare
    ``zip(*...)`` unpacking would raise ValueError.
    """
    pairs = fdist.most_common(n)
    if not pairs:
        return (), ()
    words, frequencies = zip(*pairs)
    return words, frequencies


# Most common words and their frequencies for each category
top_words_neg, frequencies_neg = _top_words(fdist_neg, TOP_N)
top_words_neu, frequencies_neu = _top_words(fdist_neu, TOP_N)
top_words_pos, frequencies_pos = _top_words(fdist_pos, TOP_N)

# One horizontal bar chart per class, side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

panels = [
    ('Negative', top_words_neg, frequencies_neg),
    ('Neutral', top_words_neu, frequencies_neu),
    ('Positive', top_words_pos, frequencies_pos),
]

for ax, (class_label, words, frequencies) in zip(axes, panels):
    ax.barh(words, frequencies)
    ax.set_xlabel('Frequency')
    ax.set_ylabel('Words')
    ax.set_title(f'Top {TOP_N} {class_label} Words')
    # Put the most frequent word at the top of the panel
    ax.invert_yaxis()

plt.tight_layout()
plt.show()

# 5. Similarity Index

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create a TF-IDF vectorizer and fit it on all tweets.
# The per-class tweet lists (neg_text / neu_text / pos_text) are used as documents.
# The flattened noun lists (negative / neutral / positive) hold one word per entry,
# so vectorizing them would compare single words rather than tweets and would
# produce words-by-words similarity matrices.
tfidf_vectorizer = TfidfVectorizer()
all_tweets = neg_text + neu_text + pos_text
tfidf_matrix = tfidf_vectorizer.fit_transform(all_tweets)

# Row boundaries of each class inside tfidf_matrix (same order as all_tweets)
neg_end = len(neg_text)
neu_end = neg_end + len(neu_text)

tfidf_neg = tfidf_matrix[:neg_end]
tfidf_neu = tfidf_matrix[neg_end:neu_end]
tfidf_pos = tfidf_matrix[neu_end:]

# Pairwise cosine similarity between the tweets of each pair of classes
cosine_neg_neu = cosine_similarity(tfidf_neg, tfidf_neu)
cosine_neg_pos = cosine_similarity(tfidf_neg, tfidf_pos)
cosine_neu_pos = cosine_similarity(tfidf_neu, tfidf_pos)

# Average over all tweet pairs of the two classes
avg_cosine_neg_neu = np.mean(cosine_neg_neu)
avg_cosine_neg_pos = np.mean(cosine_neg_pos)
avg_cosine_neu_pos = np.mean(cosine_neu_pos)

print("Average Cosine Similarity:")
print("NEG vs. NEU:", avg_cosine_neg_neu)
print("NEG vs. POS:", avg_cosine_neg_pos)
print("NEU vs. POS:", avg_cosine_neu_pos)