In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset
import pandas as pd

In [None]:
# Download every split of the tweet sentiment dataset from the Hugging Face Hub.
dataset = load_dataset("mteb/tweet_sentiment_extraction")

# Convert the train split with the dataset's native converter instead of
# letting pandas iterate over the rows one Python dict at a time.
df = dataset['train'].to_pandas()

In [None]:
# Preview the first rows. A bare last expression is displayed by the notebook
# itself, so no print() wrapper is needed.
df.head()

           id                                               text  label  \
0  cb774db0d1                I`d have responded, if I were going      1   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!      0   
2  088c60f138                          my boss is bullying me...      0   
3  9642c003ef                     what interview! leave me alone      0   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...      0   

  label_text  
0    neutral  
1   negative  
2   negative  
3   negative  
4   negative  


In [None]:
from transformers import AutoModelForCausalLM

# Fetch the pretrained "openai-gpt" checkpoint through the causal-LM auto class.
model = AutoModelForCausalLM.from_pretrained(
    "openai-gpt",
)

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the "openai-gpt" checkpoint loaded for `model`.
tokenizer = AutoTokenizer.from_pretrained("openai-gpt")

# Register "[PAD]" as the padding token so that padded batches can be built.
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Resize the model's token embeddings to the tokenizer's current vocabulary
# size. Kept as the last expression so the resulting embedding is displayed.
model.resize_token_embeddings(len(tokenizer))

Embedding(40479, 768)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; only its
            ``"text"`` entry is read.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    # NOTE(review): no "labels" column is produced here; confirm that the
    # training setup supplies labels some other way before calling train().
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )


# Apply the tokenizer to every split, one batch of rows at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
from transformers import Trainer, TrainingArguments

# One example per device per step, with gradients accumulated over 4 steps
# before each optimizer update. Trainer output is written to "test_trainer".
training_args = TrainingArguments(
    output_dir="test_trainer",
    gradient_accumulation_steps=4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
)

In [None]:
# Wire the model, the training configuration and the tokenized train/test
# splits into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_datasets["test"],
    train_dataset=tokenized_datasets["train"],
)

PREPARING DATA FOR RLHF

In [None]:
from datasets import load_dataset

# Prompts used for the RLHF stage (train split only).
prompt_data = load_dataset("center-for-humans-and-machines/rlhf-hackathon-prompts", split="train")

# Show the first prompt. Indexing the row first reads a single record instead
# of materialising the entire "prompt" column just to take its first element.
prompt_data[0]['prompt']

'How important is climate change?'

In [None]:
# Preference dataset (train split); later cells read its "chosen" conversations.
preference_data = load_dataset(
    "trl-internal-testing/hh-rlhf-helpful-base-trl-style",
    split="train",
)

Processing the Preference Dataset

In [None]:
def extract_prompt(text):
    """Return the ``"content"`` of the first message in a conversation.

    Args:
        text: An indexable sequence of message mappings, each with a
            ``"content"`` key.

    Returns:
        The ``"content"`` value of the message at position 0.
    """
    first_message = text[0]
    return first_message["content"]

In [None]:
def _with_prompt(example):
    """Build the new ``"prompt"`` column from an example's ``"chosen"`` conversation."""
    return {"prompt": extract_prompt(example["chosen"])}


# Add a "prompt" column holding the first message of each chosen conversation.
preference_data_with_prompt = preference_data.map(_with_prompt)


In [None]:
# Take a one-row subset to inspect the result of the prompt extraction.
first_row_only = range(1)
sample = preference_data_with_prompt.select(first_row_only)
sample["prompt"]

['Hi, I want to learn to play horseshoes. Can you teach me?']

In [None]:
# Full "chosen" conversation of the sampled row, for comparison with the
# extracted prompt.
sample["chosen"]

[[{'content': 'Hi, I want to learn to play horseshoes. Can you teach me?',
   'role': 'user'},
  {'content': 'I can, but maybe I should begin by telling you that a typical game consists of 2 players and 6 or 8 horseshoes.',
   'role': 'assistant'},
  {'content': 'Okay. What else is needed to play, and what are the rules?',
   'role': 'user'},
  {'content': 'A horseshoe is usually made out of metal and is about 3 to 3.5 inches long and around 1 inch thick. The horseshoe should also have a 2 inch by 3 inch flat at the bottom where the rubber meets the metal. We also need two stakes and six horseshoes.',
   'role': 'assistant'}]]

Active Learning

In [None]:
!pip install git+https://github.com/modAL-python/modAL.git

Collecting git+https://github.com/modAL-python/modAL.git
  Cloning https://github.com/modAL-python/modAL.git to /tmp/pip-req-build-up_5mkwp
  Running command git clone --filter=blob:none --quiet https://github.com/modAL-python/modAL.git /tmp/pip-req-build-up_5mkwp
  Resolved https://github.com/modAL-python/modAL.git to commit bba6f6fd00dbb862b1e09259b78caf6cffa2e755
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import modAL
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer

# Materialise the columns as plain Python lists so the tokenizer and
# train_test_split always receive a list, whatever column type the installed
# `datasets` version returns.
X = list(dataset['train']['text'])
y = list(dataset['train']['label'])

# NOTE: this rebinds the notebook-level `tokenizer` (previously the
# "openai-gpt" tokenizer) to the BERT tokenizer. Re-running earlier
# tokenization cells after this one would therefore use BERT's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# The padded token-id matrix is used directly as the feature matrix.
# No pad token has to be registered: this tokenizer already pads (with id 0).
# NOTE(review): raw token ids are a weak feature representation for a linear
# model; consider a bag-of-words or embedding representation instead.
X_tokenized = tokenizer(X, padding=True, truncation=True, return_tensors="np")["input_ids"]

# Half of the data seeds the learner, the other half is the "unlabelled" pool.
# A fixed random_state makes the split reproducible across runs.
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(
    X_tokenized, y, test_size=0.5, random_state=42
)
print("Etiketli veri:", X_labeled[:5])
print("Etiketsiz veri:", X_unlabeled[:5])

# Standardise the features and allow more solver iterations; the unscaled
# inputs with the default iteration limit trigger scikit-learn's
# "TOTAL NO. of ITERATIONS REACHED LIMIT" convergence warning.
learner = ActiveLearner(
    estimator=make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    query_strategy=uncertainty_sampling,
    X_training=X_labeled,
    y_training=y_labeled,
)

# Ask the learner which pool sample it is least certain about.
query_idx, query_inst = learner.query(X_unlabeled)
print("Modelin seçtiği etiketsiz örnek:", query_inst)

Etiketli veri: [[  101 21862  2015  1010  1045  2572  3048  1006  2066  2073  1045  1036
   1049  3048  2205  1010  2021  1996  5025  3048  1010  1057  5603  1007
   4299  1045  2071  2175  2205   999   102     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0]
 [  101  8299  1024  1013  1013  4714  3126  2140  1012  4012  1013  4185
  18153  2099  2581  1024  1024  2013  8823 11928  2721  1051 26468  2080
   1997  1996  1012 10098 14854  2594  1012  2033  1004  1004 18823  1011
   4223  1012  5658   102     0     0     0     0     0     0     0     0
      0 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
import numpy as np

# Position of the queried sample inside the unlabelled pool.
selected = query_idx[0]

# In a real workflow a domain expert supplies this label. Here the held-out
# label of the queried sample simulates that expert, the same way the query
# loop below does, instead of a hard-coded placeholder value.
new_label = y_unlabeled[selected]

# Add the newly labelled sample to the labelled set, keeping X as a 2-D array.
X_labeled = np.vstack([X_labeled, X_unlabeled[selected]])
y_labeled = np.append(y_labeled, new_label)

# Remove the sample from the pool. Features and labels must be removed
# together, otherwise every later lookup y_unlabeled[i] would point at the
# label of a different row than X_unlabeled[i].
X_unlabeled = np.delete(X_unlabeled, selected, axis=0)
y_unlabeled = np.delete(y_unlabeled, selected, axis=0)

In [None]:
for _ in range(10):
    # Refit on exactly the current labelled set. `teach` would append the
    # whole of X_labeled to the data the learner already stores, so the
    # training set would contain more and more duplicates on every pass.
    learner.fit(X_labeled, y_labeled)

    # The model picks one sample from the unlabelled pool.
    query_idx, query_inst = learner.query(X_unlabeled)

    # query_idx is a numpy array, so take its single value.
    query_idx = query_idx[0]

    # Add the new sample, with its held-out label, to the labelled set.
    X_labeled = np.vstack([X_labeled, X_unlabeled[query_idx]])
    y_labeled = np.append(y_labeled, y_unlabeled[query_idx])

    # Remove the selected sample from the unlabelled pool (features and label).
    X_unlabeled = np.delete(X_unlabeled, query_idx, axis=0)
    y_unlabeled = np.delete(y_unlabeled, query_idx, axis=0)

# Final refit so the sample queried in the last iteration is also learned.
learner.fit(X_labeled, y_labeled)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt