# <h1 align="center">🤗<font color="yellow">Few-Shot Text Classification with SetFit ⚙️</font>🤗</h1>

<font color="yellow">Data Scientist: Dr. Eddy Giusepe Chirinos Isidro</font>

O treinamento foi feito no Colab 🤗.

# Importamos as nossas Bibliotecas

In [1]:
import numpy as np
import pandas as pd

from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss

from setfit import SetFitModel, SetFitTrainer, sample_dataset

  from .autonotebook import tqdm as notebook_tqdm


# Datasets

## Dataset de `train`

In [2]:
# The data can be downloaded here --> https://github.com/thisislohith6/Sentiment-Analysis-of-Movie-review-dataset

# Load the tab-separated training file into a DataFrame and preview its first rows.
train = pd.read_csv('./train.tsv', sep='\t')
train.head()


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
# Dimensions (rows, columns) of the raw training DataFrame loaded from train.tsv.
train.shape

(156060, 4)

In [4]:
# Count how many phrases carry each Sentiment code to see the class balance.
train['Sentiment'].value_counts()

Sentiment
2    79582
3    32927
1    27273
4     9206
0     7072
Name: count, dtype: int64

In [5]:
# Work on an independent copy so that columns added to `train_form` later on
# do not also show up on the raw `train` frame (plain assignment only aliases it).
train_form = train.copy()

In [6]:
# Add a human-readable label column next to the numeric Sentiment code.
# One dict-based replace performs all five substitutions in a single pass, and
# the resulting Series is assigned directly (no intermediate DataFrame wrapper).
train_form['sent_full'] = train_form['Sentiment'].replace({
    0: 'Negative',
    1: 'Somewhat negative',
    2: 'Neutral',
    3: 'Somewhat positive',
    4: 'Positive',
})


In [7]:
# Preview the frame to check the `sent_full` label column.
train_form.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,sent_full
0,1,1,A series of escapades demonstrating the adage ...,1,Somewhat negative
1,2,1,A series of escapades demonstrating the adage ...,2,Neutral
2,3,1,A series,2,Neutral
3,4,1,A,2,Neutral
4,5,1,series,2,Neutral


In [8]:
# Keep only the rows labelled 'Positive' or 'Negative'.
# reset_index() renumbers the rows from 0 and keeps the previous row labels
# in a new `index` column.
train_pos_neg = (
    train_form[train_form['sent_full'].isin(['Positive', 'Negative'])]
    .reset_index()
)


In [9]:
# Preview the rows kept after filtering to the 'Positive' and 'Negative' labels.
train_pos_neg.head()


Unnamed: 0,index,PhraseId,SentenceId,Phrase,Sentiment,sent_full
0,63,64,2,"This quiet , introspective and entertaining in...",4,Positive
1,66,67,2,"quiet , introspective and entertaining indepen...",4,Positive
2,74,75,2,entertaining,4,Positive
3,77,78,2,is worth seeking,4,Positive
4,101,102,3,would have a hard time sitting through this one,0,Negative


In [10]:
# Dimensions (rows, columns) of the Positive/Negative subset.
train_pos_neg.shape

(16278, 6)

In [11]:
# Distinct Sentiment codes present in the subset.
train_pos_neg['Sentiment'].unique()

array([4, 0])

In [12]:
# Recode the value 4 as 1 (both stand for the Positive sentiment), leaving a
# binary label: 1 = Positive, 0 = Negative.
# The replaced Series is assigned directly; wrapping it in a DataFrame first is unnecessary.
train_pos_neg['Sentiment'] = train_pos_neg['Sentiment'].replace(4, 1)


In [13]:
# Keep only the text column and its label, then preview the result.
df_train = train_pos_neg[['Phrase', 'Sentiment']]
df_train.head()

Unnamed: 0,Phrase,Sentiment
0,"This quiet , introspective and entertaining in...",1
1,"quiet , introspective and entertaining indepen...",1
2,entertaining,1
3,is worth seeking,1
4,would have a hard time sitting through this one,0


In [14]:
# Dimensions (rows, columns) of the two-column training frame.
df_train.shape

(16278, 2)

In [15]:
# Distinct label values present in df_train.
df_train['Sentiment'].unique()

array([1, 0])

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 25% of df_train for evaluation; the fixed random_state makes the shuffle repeatable.
# NOTE(review): this rebinds `train` (previously the raw DataFrame read from train.tsv),
# and the name `eval` shadows Python's built-in eval() for the rest of the notebook.
train, eval = train_test_split(df_train, test_size=0.25, random_state=42, shuffle=True) 


In [17]:
from datasets import Dataset


# Wrap both pandas splits as Hugging Face Dataset objects.
# preserve_index=False is passed so the pandas index is not carried over as an extra column.
train_ds = Dataset.from_pandas(train, split="train", preserve_index=False)
eval_ds = Dataset.from_pandas(eval, split="eval", preserve_index=False)


In [18]:
# Show the training Dataset as a pandas DataFrame.
# Dataset.to_pandas() converts the underlying table directly instead of having
# the DataFrame constructor iterate over the Dataset row by row.
train_ds.to_pandas()

Unnamed: 0,Phrase,Sentiment
0,little doubt that Kidman has become one of our...,1
1,Michael Moore has perfected the art of highly ...,1
2,"with terrific computer graphics , inventive ac...",1
3,narrative filmmaking with a visually masterful...,1
4,found myself growing more and more frustrated ...,0
...,...,...
12203,fits the bill perfectly,1
12204,"A graceful , moving tribute",1
12205,Anyone who suffers through this film,0
12206,brings this unknown slice of history affecting...,1


In [19]:
# Show the evaluation Dataset as a pandas DataFrame.
# Dataset.to_pandas() converts the underlying table directly instead of having
# the DataFrame constructor iterate over the Dataset row by row.
eval_ds.to_pandas()

Unnamed: 0,Phrase,Sentiment
0,essentially ruined --,0
1,critic-proof,1
2,is the script 's endless assault of embarrassi...,0
3,At 78 minutes it just zings along with vibranc...,1
4,My Sweet has so many flaws it would be easy fo...,0
...,...,...
4065,Scott delivers a terrific performance in this ...,1
4066,I have n't laughed that hard in years !,1
4067,A terrifically entertaining specimen of Spielb...,1
4068,make it an above-average thriller,1


In [20]:
# Display the training Dataset summary (feature names and row count).
train_ds

Dataset({
    features: ['Phrase', 'Sentiment'],
    num_rows: 12208
})

In [21]:
# Display the evaluation Dataset summary (feature names and row count).
eval_ds

Dataset({
    features: ['Phrase', 'Sentiment'],
    num_rows: 4070
})

# Carregando o Modelo

In [22]:
# Load a SetFit model from the Hub.
# The log printed by this cell reports that no model_head.pkl was found for this
# checkpoint, so the classification head is initialised with random weights.

model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


# Trainer

In [28]:
# Build the SetFit trainer.
# NOTE(review): `train_ds` is the full 12,208-row training split, and the imported
# `sample_dataset` helper is not called in the visible cells — confirm whether a
# few-shot subset was intended here.
trainer = SetFitTrainer(
    model=model,
    # Data: map our column names onto the text/label names the trainer expects.
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    column_mapping={"Phrase": "text", "Sentiment": "label"},
    # Contrastive-learning settings.
    loss_class=CosineSimilarityLoss,
    num_iterations=20,  # number of text pairs to generate for contrastive learning
    num_epochs=10,  # number of epochs to use for contrastive learning
    batch_size=16,
    # Evaluation metric.
    metric="accuracy",
)


# Treinamos e Avaliamos

In [None]:
# Train the model, then evaluate it
trainer.train()

# Keep the evaluation result so it can be inspected afterwards.
metrics = trainer.evaluate()

In [None]:
# Display the result returned by trainer.evaluate().
metrics


In [None]:
# Persist the trained model to ./output/ through the public `save_pretrained`
# API rather than the underscore-prefixed `_save_pretrained` hook.
trainer.model.save_pretrained(save_directory="./output/")


## Dataset de `test`

In [12]:
# Load the tab-separated test file into a DataFrame and preview its first rows.
test = pd.read_csv('./test.tsv', sep='\t')
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [13]:
# Dimensions (rows, columns) of the test DataFrame.
test.shape

(66292, 3)