In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

# Charger  données
data = pd.read_parquet('G:\Mon Drive\data\human_labeled\labeled_part_1.parquet')

In [9]:
datah = data.head(100)
datah.rename(columns={'Sentiment': 'label'}, inplace=True)
datah

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datah.rename(columns={'Sentiment': 'label'}, inplace=True)


Unnamed: 0,Date,text,label
0,2019-05-27,È appena uscito un nuovo video! LES CRYPTOMONN...,Positive
1,2019-05-27,Cardano: Digitize Currencies; EOS https://t.co...,Positive
2,2019-05-27,Another Test tweet that wasn't caught in the s...,Positive
3,2019-05-27,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...,Positive
4,2019-05-27,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...,Positive
...,...,...,...
95,2019-05-27,Belensay ÇİN de yaşamıyor..\n\nÇİN deki GÜNDEM...,Positive
96,2019-05-27,Win + ❤️ https://t.co/3vfzboSaad,Positive
97,2019-05-27,Bitcoin prices hit $150 billion market cap for...,Positive
98,2019-05-27,焼けた笑\nさすがBTC頭おかしい…笑,Positive


In [13]:
data[('Sentiment')].unique()

array(['Positive', 'Negative'], dtype=object)

In [14]:

# Convert 'Positive'/'Negative' labels to numerical labels
datah['label'] = datah['label'].map({'Positive': 1, 'Negative': 0})

# Dividing the data into training, validation, and test sets
train_data, test_data = train_test_split(datah, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

# Converting DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_data[['text', 'label']])
val_dataset = Dataset.from_pandas(val_data[['text', 'label']])
test_dataset = Dataset.from_pandas(test_data[['text', 'label']])

# Tokenizing the data
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples['text'], padding='max_length', truncation=True)
    tokenized_inputs['labels'] = examples['label']
    return tokenized_inputs

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Formatting the datasets to include labels
def format_dataset(dataset):
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return dataset

tokenized_train_dataset = format_dataset(tokenized_train_dataset)
tokenized_val_dataset = format_dataset(tokenized_val_dataset)
tokenized_test_dataset = format_dataset(tokenized_test_dataset)

# Loading the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english', num_labels=2)

# Setting up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initializing the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset
)

# Training the model
trainer.train()

# Evaluating the model on the validation set
val_results = trainer.evaluate()
print("Validation results:", val_results)

# Evaluating the model on the test set
test_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)
print("Test results:", test_results)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datah['label'] = datah['label'].map({'Positive': 1, 'Negative': 0})


Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Step,Training Loss


Validation results: {'eval_loss': 1.7370539903640747, 'eval_runtime': 5.4603, 'eval_samples_per_second': 1.465, 'eval_steps_per_second': 0.183, 'epoch': 3.0}
Test results: {'eval_loss': 2.866410493850708, 'eval_runtime': 12.8377, 'eval_samples_per_second': 1.558, 'eval_steps_per_second': 0.156, 'epoch': 3.0}


In [16]:

model.save_pretrained('G:\\Mon Drive\\data\\human_labeled\\model1000_FinanceDistilBERT')
tokenizer.save_pretrained('G:\\Mon Drive\\data\\human_labeled\\tokenizer1000_FinanceDistilBERT')

('G:\\Mon Drive\\data\\human_labeled\\tokenizer1000_FinanceDistilBERT\\tokenizer_config.json',
 'G:\\Mon Drive\\data\\human_labeled\\tokenizer1000_FinanceDistilBERT\\special_tokens_map.json',
 'G:\\Mon Drive\\data\\human_labeled\\tokenizer1000_FinanceDistilBERT\\vocab.txt',
 'G:\\Mon Drive\\data\\human_labeled\\tokenizer1000_FinanceDistilBERT\\added_tokens.json')