# 0. Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import warnings

# Silence a fixed set of known, harmless UserWarnings.
# Each entry is passed as the `message` argument of warnings.filterwarnings.
for _ignored_message in (
    "A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy",
    "unable to load libtensorflow_io_plugins.so",
    "file system plugins are not loaded",
):
    warnings.filterwarnings("ignore", category=UserWarning, message=_ignored_message)

In [None]:
# Accuracy metrics from Scikit-Learn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [None]:
# Hugging Face library
from datasets import Dataset, DatasetDict

In [None]:
# NLP libraries
import re
import nltk
import simplemma

from simplemma import text_lemmatizer
from nltk.corpus import stopwords

# 1. Load Dataset

In [None]:
# Loader shared by the train / test / validation splits
def load_data(file_path):
    """Read a header-less, tab-separated file into a two-column DataFrame.

    Args:
        file_path: Location of the file, forwarded to ``pd.read_csv``.

    Returns:
        A DataFrame whose columns are named 'sentiment' and 'text',
        in that order.
    """
    column_names = ['sentiment', 'text']
    return pd.read_csv(file_path, sep='\t', header=None, names=column_names)


# Kaggle input locations of the three tab-separated splits
train_path = '/kaggle/input/sentiment/train_bal_vdg_27_11.tsv'
test_path = '/kaggle/input/sentiment/test_bal_vdg_27_11.tsv'
val_path = '/kaggle/input/sentiment/valid_bal_vdg_27_11.tsv'

# Read every split with the shared loader (train, then test, then validation)
df_train, df_test, df_val = (
    load_data(split_path) for split_path in (train_path, test_path, val_path)
)

In [None]:
def converter(df):
    """Expand the short sentiment codes of ``df`` into full label names.

    'NEG', 'NEU' and 'POS' in the 'sentiment' column become 'negative',
    'neutral' and 'positive'; any other value is left untouched.

    Args:
        df: DataFrame with a 'sentiment' column. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    label_names = dict(NEG='negative', NEU='neutral', POS='positive')
    relabelled = df['sentiment'].replace(label_names)
    df['sentiment'] = relabelled
    return df

# Relabel the sentiment column of every split (train, validation, test)
df_train, df_val, df_test = (
    converter(frame) for frame in (df_train, df_val, df_test)
)

In [None]:
# Show up to 150 characters per column, then preview the first training rows
pd.options.display.max_colwidth = 150
df_train.head()

In [None]:
# Remove user mentions here (this could not be done inside the preprocess function).
# The pattern is a raw string so that `\s` reaches the regex engine instead of being
# treated as an (invalid) string escape. The character class includes `_`, which is
# allowed in handles; without it "@mario_rossi" would leave "_rossi" in the text.
_mention_pattern = r'@[A-Za-z0-9_]+\s?'
for _frame in (df_train, df_val, df_test):
    # Column assignment mutates the frame object itself, so the df_* names see the change
    _frame['text'] = _frame['text'].str.replace(_mention_pattern, '', regex=True)

In [None]:
# Wrap each pandas DataFrame in a Hugging Face Dataset (train, test, validation)
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_test, df_val)
)

# Group the three splits in a single DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': val_dataset,
})

print(dataset)

In [None]:
# Removing duplicates
#
# For every split, keep only the first row carrying each distinct text.
# The row indices of those first occurrences are recorded and passed to
# `select`, so the surviving rows are exactly the unique ones. Selecting
# `range(number_of_unique_texts)` instead would just keep the first N rows,
# duplicates included.

# Filtered version of every split, keyed by split name
updated_datasets = {}

for split, split_data in dataset.items():
    # All texts of this split, in row order
    text_column = split_data['text']

    seen_texts = set()   # texts already encountered in this split
    keep_indices = []    # row index of the first occurrence of each text

    for row_index, text in enumerate(text_column):
        if text not in seen_texts:
            seen_texts.add(text)
            keep_indices.append(row_index)

    # Keep only the rows whose text had not been seen before
    updated_datasets[split] = split_data.select(keep_indices)

    # Print the number of removed duplicates
    duplicate_count = len(text_column) - len(keep_indices)
    print(f"Duplicates removed in {split} split: {duplicate_count}\n")

# Update the dataset dictionary with the filtered datasets
dataset.update(updated_datasets)

# Print the updated dataset information
for split, split_data in dataset.items():
    print(f"{split}: {len(split_data['text'])} rows")

print(dataset)

# 2. Data Preprocessing

In [None]:
# Italian stopword list from NLTK, stored as a set for fast membership tests
italian_stopwords = set(stopwords.words('italian'))


def preprocess_text(text):
    """Lemmatize ``text`` and strip punctuation, URL remnants and stopwords.

    Steps, in order:
      1. lemmatize with simplemma's ``text_lemmatizer`` (Italian) and join
         the resulting tokens with single spaces;
      2. delete every character that is not a word character, whitespace
         or an apostrophe;
      3. delete any run of non-space characters starting with 'http' or 'www';
      4. drop words whose lowercase form is in ``italian_stopwords``.

    Args:
        text: The raw text to clean (presumably a single string - confirm
            against what ``text_lemmatizer`` accepts).

    Returns:
        The remaining words joined by single spaces.
    """
    lemmas = text_lemmatizer(text, lang='it')
    cleaned = ' '.join(lemmas)

    # Punctuation is removed first, so the URL pattern below only sees what
    # is left of a URL after its ':' '/' '.' characters are gone.
    cleaned = re.sub(r'[^\w\s\']', '', cleaned)
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)

    kept_words = [
        word for word in cleaned.split()
        if word.lower() not in italian_stopwords
    ]
    return ' '.join(kept_words)




def preprocess_dataset(dataset):
    """Clean the 'text' entry of one record and return that record.

    Used as the callback of the ``map`` call below, which is invoked without
    any batching argument, so ``dataset`` here is presumably a single example
    rather than the whole DatasetDict - confirm.

    Args:
        dataset: Mapping with a 'text' entry; it is updated in place.

    Returns:
        The same mapping, with 'text' replaced by ``preprocess_text`` output.
    """
    cleaned_text = preprocess_text(dataset['text'])
    dataset['text'] = cleaned_text
    return dataset


# Apply the cleaning to every split of the DatasetDict
dataset = dataset.map(preprocess_dataset)

In [None]:
# Display the first four preprocessed training texts as a sanity check
dataset['train']['text'][0:4]

# 3. Feature Extraction

## 3.1 TF-IDF 

In [None]:
# Convert each split to NumPy arrays so it is ready for vectorization
def _split_to_arrays(split_name):
    """Return (texts, sentiments) of the named split as two NumPy arrays."""
    split = dataset[split_name]
    return np.array(split['text']), np.array(split['sentiment'])


X_train, Y_train = _split_to_arrays('train')
X_val, Y_val = _split_to_arrays('validation')
X_test, Y_test = _split_to_arrays('test')


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
# NOTE(review): X_val is not vectorized anywhere in the visible code - confirm
# whether the validation split is meant to be used.
vectorizer = TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)


# 4. Support Vector Machine

In [None]:
# Support vector classifier: the class used here is SVC (not LinearSVC),
# constructed with only random_state set.
from sklearn.svm import SVC
svm = SVC(random_state=0)

In [None]:
# Train the SVM on the TF-IDF training matrix and its sentiment labels
svm.fit(tf_x_train,Y_train)

# Predict a sentiment label for every row of the TF-IDF test matrix
y_test_svm=svm.predict(tf_x_test)

# 5. Naive Bayes

In [None]:
# Multinomial Naive Bayes classifier, constructed with its default arguments
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
# Train Naive Bayes on the same TF-IDF training matrix and labels as the SVM
nb.fit(tf_x_train,Y_train)

# Predict a sentiment label for every row of the TF-IDF test matrix
y_test_nb=nb.predict(tf_x_test)

# 6. Metrics

In [None]:
# Build the text classification report of each model on the test split
report_svm, report_nb = (
    classification_report(Y_test, predictions)
    for predictions in (y_test_svm, y_test_nb)
)

# Print each title followed by its report on the next line
print("Support Vector Machine Classification Report:", report_svm, sep="\n")
print("\nNaive Bayes Classification Report:", report_nb, sep="\n")

In [None]:
# Accuracy = (TP + TN) / (P + N): the number of correctly classified tweets
# divided by the total number of tweets.
accuracy_svm, accuracy_nb = (
    accuracy_score(Y_test, predictions)
    for predictions in (y_test_svm, y_test_nb)
)

print("Support Vector Machine accuracy:", accuracy_svm)
print("Naive Bayes accuracy:", accuracy_nb)

In [None]:
# Precision = TP / (TP + FP): when a class is predicted, how likely the tweet
# really belongs to that class. Scores are requested per class (average=None)
# for the labels negative, neutral, positive.
precision_svm, precision_nb = (
    precision_score(Y_test, predictions, average=None, labels=['negative', 'neutral', 'positive'])
    for predictions in (y_test_svm, y_test_nb)
)

print("Support Vector Machine precision:", precision_svm)
print("Naive Bayes precision:", precision_nb)

In [None]:
# Recall = TP / (TP + FN): the ability of the estimator to find all the
# tweets of a given class. Scores are requested per class (average=None)
# for the labels negative, neutral, positive.
recall_svm, recall_nb = (
    recall_score(Y_test, predictions, average=None, labels=['negative', 'neutral', 'positive'])
    for predictions in (y_test_svm, y_test_nb)
)

print("Support Vector Machine recall:", recall_svm)
print("Naive Bayes recall:", recall_nb)

In [None]:
# F1-score = 2 * (precision * recall) / (precision + recall).
# Scores are requested per class (average=None) for the labels
# negative, neutral, positive.
f1score_svm, f1score_nb = (
    f1_score(Y_test, predictions, average=None, labels=['negative', 'neutral', 'positive'])
    for predictions in (y_test_svm, y_test_nb)
)

print("Support Vector Machine f1-score:", f1score_svm)
print("Naive Bayes f1-score:", f1score_nb)