<a href="https://colab.research.google.com/github/Felipeitb/2024-bracciano-iris/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create dataset

In [None]:
!pip install transformers datasets

In [3]:
from datasets import load_dataset

In [5]:
# CSV files that make up the v01 corpus, listed explicitly so the exact
# inputs of this run are visible in the notebook.
v01_files = [
    "v01/m00 - m00-1.csv",
    "v01/m00 - m00-2.csv",
    "v01/m00 - m00-3.csv",
    "v01/m00 - m00-4.csv",
    "v01/m01 - m01-1.csv",
    "v01/m02 - m02-1.csv",
    "v01/m03 - m03-1.csv",
    "v01/m04 - m04-1.csv",
]

# Read every CSV with the generic "csv" builder; the rows of all files end up
# together in one "train" split.
dataset = load_dataset("csv", data_files=v01_files)

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
# Display the loaded DatasetDict: available splits, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'sentiment'],
        num_rows: 349
    })
})

In [8]:
# Quick sanity check of the raw rows: shuffle with a fixed seed so the preview
# is reproducible, keep ten rows, and print the first three of them.
shuffled_train = dataset["train"].shuffle(seed=200)
sample = shuffled_train.select(range(10))
print(sample[:3])

{'text': ['Possono aiutare nella ricerca di informazioni specifiche per ricerche o cose del genere, ma talvolta le informazioni ricercate dall’intelligenza artificiale non sono sempre giuste', 'gita alla basilica di San Paolo a Roma', 'Quando i professori impongono con forza ritmi quasi impossibili '], 'sentiment': ['neutral', 'positive', 'negative']}


In [9]:
# Split the single "train" split into train/test parts (80% / 20%).
# The seed is fixed so the same rows land in each part on every run.
dataset_clean = dataset["train"].train_test_split(
    train_size=0.8,
    seed=200,
)

In [11]:
# Display the resulting train/test DatasetDict to check the split sizes.
dataset_clean

DatasetDict({
    train: Dataset({
        features: ['text', 'sentiment'],
        num_rows: 279
    })
    test: Dataset({
        features: ['text', 'sentiment'],
        num_rows: 70
    })
})

In [10]:
# Write both splits to the local directory 'v01/dataset_clean' so they can be
# reloaded later with `load_from_disk` without re-reading the CSV files.
dataset_clean.save_to_disk('v01/dataset_clean')

Saving the dataset (0/1 shards):   0%|          | 0/279 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/70 [00:00<?, ? examples/s]

# Upload dataset

In [12]:
from huggingface_hub import notebook_login
# Render the Hugging Face login widget in the notebook. This is an interactive
# step: an access token must be entered by hand, so it cannot run unattended.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
# Upload the train/test splits to the Hugging Face Hub dataset repository
# named "sentiment-analysis-test"; the returned CommitInfo is shown as the cell output.
dataset_clean.push_to_hub("sentiment-analysis-test")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/427 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/Felipeit/sentiment-analysis-test/commit/ba95239371940a703766ff0534b89444189b73d7', commit_message='Upload dataset', commit_description='', oid='ba95239371940a703766ff0534b89444189b73d7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Felipeit/sentiment-analysis-test', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Felipeit/sentiment-analysis-test'), pr_revision=None, pr_num=None)

# Tokenize it!

In [15]:
from transformers import AutoTokenizer
from datasets import load_from_disk

# Tokenizer that belongs to the Italian sentiment checkpoint used in this notebook.
checkpoint = "MilaNLProc/feel-it-italian-sentiment"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Reload the train/test splits from disk.
# NOTE: this rebinds `dataset`, which earlier in the notebook referred to the
# DatasetDict built directly from the CSV files.
dataset = load_from_disk("v01/dataset_clean")



tokenizer_config.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/794k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/299 [00:00<?, ?B/s]

In [16]:
def tokenize_function(examples, max_length: int = 128):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    Intended to be passed to ``Dataset.map``; with ``batched=True`` the
    ``examples`` argument holds a batch of rows rather than a single one.

    Args:
        examples: Mapping with a ``"text"`` entry containing the text(s) to encode.
        max_length: Maximum number of tokens kept per text; longer inputs are
            truncated to this length. Defaults to 128, the value this notebook
            has always used, so existing calls behave exactly as before.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the given texts
        (for the tokenizer loaded in this notebook: ``input_ids`` and
        ``attention_mask``).
    """
    # Only truncation is requested; no padding is applied here, so every
    # encoded sequence keeps its own length.
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

In [17]:
# Run the tokenizer over every split in batches; the columns it produces are
# added next to the existing ones.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
)
print(tokenized_dataset)

Map:   0%|          | 0/279 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 279
    })
    test: Dataset({
        features: ['text', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 70
    })
})


In [20]:
# Inspect a few tokenized training rows: shuffle with a fixed seed, keep ten
# rows, and display the first three as the cell output.
shuffled_tokenized = tokenized_dataset["train"].shuffle(seed=200)
samples = shuffled_tokenized.select(range(10))
samples[:3]

{'text': ['Una verifica andata bene a latino per la quale ho lavorato sodo',
  'Quando si ottengono risultati positivi con gli studenti',
  "tanta ansia e l'eccessivo studio che a volte non viene riconosciuto "],
 'sentiment': ['positive', 'positive', 'negative'],
 'input_ids': [[5,
   1018,
   3602,
   6183,
   741,
   13,
   10851,
   50,
   51,
   814,
   481,
   6799,
   29648,
   6],
  [5, 2027, 98, 21170, 2139, 9784, 49, 267, 2918, 6],
  [5,
   6262,
   12772,
   26,
   80,
   31932,
   23290,
   124,
   1651,
   58,
   13,
   1334,
   126,
   858,
   7783,
   31897,
   6]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}