In [None]:
%pip install accelerate peft transformers trl evaluate torch
%pip install -U bitsandbytes

Collecting trl
  Downloading trl-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.21.0 (from trl)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.13.0-py3-none-any.whl (293 kB)
[2K   [90m━━

In [None]:
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    set_seed,
    TrainingArguments,
    PreTrainedTokenizerBase,
    BitsAndBytesConfig,
    TrainerCallback,
    pipeline,
    EarlyStoppingCallback,
    Trainer
)
from trl import (
    RewardTrainer,
    RewardConfig
)
from dataclasses import dataclass
from typing import (
    Any,
    Optional,
    Union
)
from datasets import (
    load_dataset,
    DatasetDict,
)
import numpy as np
import evaluate
import torch
from transformers.utils import PaddingStrategy
from sklearn.model_selection import KFold
import csv
import pandas as pd

In [None]:
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

In [None]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
def tokenize_function(batch):
    """Tokenize a batch from a reward modelling dataset."""
    new_examples = {
        "input_ids_chosen": [],
        "attention_mask_chosen": [],
        "input_ids_rejected": [],
        "attention_mask_rejected": [],
    }

    for prompt, chosen, rejected in zip(batch["prompt"], batch["chosen"], batch["rejected"]):
        # tokenized_chosen = tokenizer("Source: " + prompt + "\n\nChosen: " + chosen, truncation=True)
        # tokenized_rejected = tokenizer("Source: " + prompt + "\n\Chosen: " + rejected, truncation=True)
        kwargs = {"padding": "max_length", "truncation": True, "max_length": 512, "return_tensors": "pt"}
        chosen_prompt = prompt + '\n' + chosen
        rejected_prompt = prompt + '\n' + rejected
        tokenized_chosen = tokenizer(chosen_prompt, **kwargs)
        tokenized_rejected = tokenizer(rejected_prompt, **kwargs)

        new_examples["input_ids_chosen"].append(tokenized_chosen["input_ids"].squeeze(0))
        new_examples["attention_mask_chosen"].append(tokenized_chosen["attention_mask"].squeeze(0))
        new_examples["input_ids_rejected"].append(tokenized_rejected["input_ids"].squeeze(0))
        new_examples["attention_mask_rejected"].append(tokenized_rejected["attention_mask"].squeeze(0))

    return new_examples

In [None]:
@dataclass
class RewardDataCollatorWithPadding:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
        features_chosen = []
        features_rejected = []
        for feature in features:
            features_chosen.append(
                {
                    "input_ids": feature["input_ids_chosen"],
                    "attention_mask": feature["attention_mask_chosen"],
                }
            )
            features_rejected.append(
                {
                    "input_ids": feature["input_ids_rejected"],
                    "attention_mask": feature["attention_mask_rejected"],
                }
            )
        batch_chosen = self.tokenizer.pad(
            features_chosen,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
        batch_rejected = self.tokenizer.pad(
            features_rejected,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
        batch = {
            "input_ids_chosen": batch_chosen["input_ids"],
            "attention_mask_chosen": batch_chosen["attention_mask"],
            "input_ids_rejected": batch_rejected["input_ids"],
            "attention_mask_rejected": batch_rejected["attention_mask"],
            "return_loss": True,
        }
        return batch

In [None]:
model_name = "google-bert/bert-base-uncased"
dataset_name = "reward_data.csv"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
)

In [None]:
  quantization_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16
  )

In [None]:
def model_init():
    if torch.cuda.is_available():
        model = AutoModelForSequenceClassification.from_pretrained(model_name, device_map=device, quantization_config=quantization_config)
        print('loaded GPU')
    else:
        model = AutoModelForSequenceClassification.from_pretrained(model_name)

    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    return model

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
dataset = load_dataset("csv", data_files=dataset_name)

ds_train_devtest = dataset['train'].train_test_split(test_size=50, seed=42)

dataset_splits = DatasetDict({
    'train': ds_train_devtest['train'],
    'test': ds_train_devtest['test'],
})

print("Before:\n", dataset)
print("After:\n", dataset_splits)

dataset_splits['test'].to_csv('reward_eval_dataset.csv')

Generating train split: 0 examples [00:00, ? examples/s]

Before:
 DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 450
    })
})
After:
 DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 400
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 50
    })
})


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

4811

In [None]:
    dataset_splits['train'] = dataset_splits['train'].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset_splits['train'].column_names,
    )

    dataset_splits['test'] = dataset_splits['test'].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset_splits['test'].column_names,
    )

    dataset = dataset_splits["train"]
    test_dataset = dataset_splits["test"]

    print(dataset)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
    num_rows: 400
})


In [None]:
def compute_metrics(eval_pred):

  accuracy = evaluate.load("accuracy") # Load the accuracy metric here

  preds = np.argmax(eval_pred.predictions, axis=-1)
  labels = eval_pred.label_ids

  return accuracy.compute(predictions=preds, references=labels)

In [None]:
# # method
# sweep_config = {
#     'method': 'random'
# }

# # hyperparameters
# parameters_dict = {
#     'batch_size': {
#         'values': [8, 16, 32, 64, 80]
#         },
#     'learning_rate': {
#         'values': [5e-5, 4e-5, 3e-5]
#         },
#     'weight_decay': {
#         'values': [0, 0.01, 0.02]
#         },
#     }


# sweep_config['parameters'] = parameters_dict

In [None]:
# kf = KFold(n_splits=8, shuffle=True, random_state=42)
# fold_results = []

In [None]:
# def train(config=None):
#   with wandb.init(config=config):
#     # set sweep configuration
#     config = wandb.config

#     model = model_init()

#     # set training arguments
#     training_args = RewardConfig(
#         output_dir='reward_model',
# 	      report_to='wandb',  # Turn on Weights & Biases logging
#         num_train_epochs=1,
#         learning_rate=config.learning_rate,
#         weight_decay=config.weight_decay,
#         per_device_train_batch_size=config.batch_size,
#         per_device_eval_batch_size=config.batch_size,
#         save_strategy='epoch',
#         eval_strategy='epoch',
#         logging_strategy='epoch',
#         center_rewards_coefficient=0.01,
#         load_best_model_at_end=True,
#         remove_unused_columns=False,
#         fp16=True
#     )

#     # define training loop
#     trainer = RewardTrainer(
#         model=model,
#         args=training_args,
#         data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer),
#         processing_class=tokenizer,
#         train_dataset=dataset,
#         eval_dataset=test_dataset,
#         compute_metrics=compute_metrics
#     )

#     trainer.train_dataset = None
#     trainer.eval_dataset = None

#     for fold, (train_index, val_index) in enumerate(kf.split(dataset)):
#         print(f"Fold {fold + 1}")
#         train_dataset = dataset.select(train_index)
#         val_dataset = dataset.select(val_index)

#         trainer.train_dataset = train_dataset
#         trainer.eval_dataset = val_dataset

#         trainer.train()
#         eval_result = trainer.evaluate()

#         fold_results.append(eval_result)

#     avg_results = {metric: np.mean([result[metric] for result in fold_results]) for metric in fold_results[0]}
#     print("Average results across folds:", avg_results)


In [None]:
# sweep_id = wandb.sweep(sweep_config, project='reward_model_training')

In [None]:
# wandb.agent(sweep_id, train, count=20)

In [None]:
api = wandb.Api()
sweep = api.sweep('d-l-van-thulden-university-of-groningen/reward_model_training/sweeps/k31gw7pm')

best_run = sweep.best_run(order='eval/accuracy')
print(best_run.name)
best_parameters = best_run.config
print(best_parameters)

with open("mycsvfile.csv", "w", newline="") as f:
    w = csv.DictWriter(f, best_parameters.keys())
    w.writeheader()
    w.writerow(best_parameters)

[34m[1mwandb[0m: Sorting runs by -summary_metrics.eval/accuracy


comfy-sweep-1


In [None]:
with open("mycsvfile.csv", "r") as f:
    reader = csv.DictReader(f)
    print(reader)
    reward_config = [row for row in reader][0]

print(reward_config)

<csv.DictReader object at 0x7ea78b4967d0>


In [None]:
model = model_init()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


loaded GPU
trainable params: 591,362 || all params: 110,075,140 || trainable%: 0.5372


In [None]:
training_args = RewardConfig(
output_dir="reward_model/bert_config",
learning_rate=float(reward_config['learning_rate']),
num_train_epochs=1,
weight_decay=float(reward_config['weight_decay']),
eval_strategy='epoch',
logging_strategy='epoch',
save_strategy='epoch',
per_device_train_batch_size=int(reward_config['batch_size']),
per_device_eval_batch_size=int(reward_config['batch_size']),
optim=reward_config['optim'],
seed=int(reward_config['seed']),
fp16=reward_config['fp16'],
center_rewards_coefficient=float(0.01),
load_best_model_at_end = True,
save_total_limit = 1,
report_to='wandb'
)

In [None]:
trainer = RewardTrainer(
model=model,
args=training_args,
train_dataset=dataset,
eval_dataset=test_dataset,
processing_class=tokenizer,
data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer),
compute_metrics=compute_metrics,
)

In [None]:
trainer.train_dataset = None
trainer.eval_dataset = None

In [None]:
kf = KFold(n_splits=8, shuffle=True, random_state=42)
epoch_results = []
fold_results = []

In [None]:
# for i in range(3):
for fold, (train_index, val_index) in enumerate(kf.split(dataset)):
  print(f"Fold {fold + 1}")

  # model = model_init()

  # trainer = RewardTrainer(
  #   model=model,
  #   args=training_args,
  #   train_dataset=dataset,
  #   eval_dataset=test_dataset,
  #   processing_class=tokenizer,
  #   data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer),
  #   compute_metrics=compute_metrics,
  #   )

  train_dataset = dataset.select(train_index)
  val_dataset = dataset.select(val_index)

  trainer.train_dataset = train_dataset
  trainer.eval_dataset = val_dataset

  trainer.train()
  eval_result = trainer.evaluate()

  fold_results.append(eval_result)

  avg_results = {metric: np.mean([result[metric] for result in fold_results]) for metric in fold_results[0]}
  print("Average results across folds:", avg_results)
  epoch_results.append(avg_results)

print("Average results across epochs:", epoch_results)

Fold 1


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7077,0.699289,0.4


  return fn(*args, **kwargs)


Average results across folds: {'eval_loss': 0.6992886066436768, 'eval_accuracy': 0.4, 'eval_runtime': 2.062, 'eval_samples_per_second': 24.249, 'eval_steps_per_second': 0.485, 'epoch': 1.0}
Fold 2


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7038,0.69656,0.44


  return fn(*args, **kwargs)


Average results across folds: {'eval_loss': 0.6979241669178009, 'eval_accuracy': 0.42000000000000004, 'eval_runtime': 2.01785, 'eval_samples_per_second': 24.790999999999997, 'eval_steps_per_second': 0.496, 'epoch': 1.0}
Fold 3


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7019,0.696891,0.42


  return fn(*args, **kwargs)


Average results across folds: {'eval_loss': 0.6975799202919006, 'eval_accuracy': 0.42, 'eval_runtime': 2.0362666666666667, 'eval_samples_per_second': 24.566999999999997, 'eval_steps_per_second': 0.49133333333333334, 'epoch': 1.0}
Fold 4


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6956,0.694,0.58


  return fn(*args, **kwargs)


Average results across folds: {'eval_loss': 0.6966848224401474, 'eval_accuracy': 0.45999999999999996, 'eval_runtime': 2.015025, 'eval_samples_per_second': 24.831249999999997, 'eval_steps_per_second': 0.4965, 'epoch': 1.0}
Fold 5


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.699,0.691674,0.52


  return fn(*args, **kwargs)


Average results across folds: {'eval_loss': 0.695682680606842, 'eval_accuracy': 0.472, 'eval_runtime': 2.01462, 'eval_samples_per_second': 24.8328, 'eval_steps_per_second': 0.49660000000000004, 'epoch': 1.0}
Fold 6


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6992,0.693344,0.58


  return fn(*args, **kwargs)


Average results across folds: {'eval_loss': 0.6952929099400839, 'eval_accuracy': 0.49, 'eval_runtime': 2.01375, 'eval_samples_per_second': 24.841166666666666, 'eval_steps_per_second': 0.4968333333333333, 'epoch': 1.0}
Fold 7


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6892,0.697023,0.4


  return fn(*args, **kwargs)


Average results across folds: {'eval_loss': 0.6955400790487017, 'eval_accuracy': 0.47714285714285715, 'eval_runtime': 2.011185714285714, 'eval_samples_per_second': 24.87142857142857, 'eval_steps_per_second': 0.4974285714285714, 'epoch': 1.0}
Fold 8


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.696,0.692433,0.48


  return fn(*args, **kwargs)


Average results across folds: {'eval_loss': 0.6951517313718796, 'eval_accuracy': 0.47750000000000004, 'eval_runtime': 2.0157499999999997, 'eval_samples_per_second': 24.814749999999997, 'eval_steps_per_second': 0.49624999999999997, 'epoch': 1.0}
Average results across epochs: [{'eval_loss': 0.6992886066436768, 'eval_accuracy': 0.4, 'eval_runtime': 2.062, 'eval_samples_per_second': 24.249, 'eval_steps_per_second': 0.485, 'epoch': 1.0}, {'eval_loss': 0.6979241669178009, 'eval_accuracy': 0.42000000000000004, 'eval_runtime': 2.01785, 'eval_samples_per_second': 24.790999999999997, 'eval_steps_per_second': 0.496, 'epoch': 1.0}, {'eval_loss': 0.6975799202919006, 'eval_accuracy': 0.42, 'eval_runtime': 2.0362666666666667, 'eval_samples_per_second': 24.566999999999997, 'eval_steps_per_second': 0.49133333333333334, 'epoch': 1.0}, {'eval_loss': 0.6966848224401474, 'eval_accuracy': 0.45999999999999996, 'eval_runtime': 2.015025, 'eval_samples_per_second': 24.831249999999997, 'eval_steps_per_second': 

In [None]:
metrics = trainer.evaluate()
print(metrics)
eval_accuracy = metrics["eval_accuracy"] * 100
print(f"Evaluation Accuracy: {eval_accuracy: .2f}%")

{'eval_loss': 0.6924332976341248, 'eval_accuracy': 0.48, 'eval_runtime': 1.9834, 'eval_samples_per_second': 25.209, 'eval_steps_per_second': 0.504, 'epoch': 1.0}
Evaluation Accuracy:  48.00%


In [None]:
trainer.eval_dataset = test_dataset
trainer.evaluate()

{'eval_loss': 0.6946930885314941,
 'eval_accuracy': 0.46,
 'eval_runtime': 1.9582,
 'eval_samples_per_second': 25.533,
 'eval_steps_per_second': 0.511,
 'epoch': 1.0}

In [None]:
model.save_pretrained("/content/reward_model/")
tokenizer.save_pretrained("/content/reward_model/tokenizer")

In [None]:
!zip -r /content/reward_model.zip /content/reward_model

In [None]:
test_dataset_raw = load_dataset("csv", data_files='reward_eval_dataset.csv', split='train')
test_dataset_raw

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 50
})

In [None]:
sentiment_pipe = pipeline(
    "sentiment-analysis", model=model, tokenizer=tokenizer
)

Device set to use cuda:0
The model 'PeftModelForSequenceClassification' is not supported for sentiment-analysis. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GemmaFo

In [None]:
sent_kwargs = {"top_k": None, "function_to_apply": "none", "return_all_scores": True, "batch_size": 1}

In [None]:
win = 0
loss = 0
game_data = dict()
game_data['prompt'] = []
game_data['chosen'] = []
game_data['rejected'] = []
game_data['chosen_reward'] = []
game_data['rejected_reward'] = []
game_data['win'] = []
game_data['loss'] = []

for i in range(len(test_dataset_raw['prompt'])):

    prompt = test_dataset_raw['prompt'][i]
    chosen = test_dataset_raw['chosen'][i]
    rejected = test_dataset_raw['rejected'][i]

    chosen_text = prompt + '\n' + chosen
    chosen_output = sentiment_pipe(chosen_text, **sent_kwargs)
    chosen_reward = [output["score"] for output in chosen_output][0]

    rejected_text = prompt + '\n' + rejected
    rejected_output = sentiment_pipe(rejected_text, **sent_kwargs)
    rejected_reward = [output["score"] for output in rejected_output][0]

    if chosen_reward > rejected_reward:
        # print("Chosen is better")
        win += 1
        won = True
        lost = False
    else:
        # print("Rejected is better")
        loss += 1
        won = False
        lost = True

    game_data['prompt'].append(prompt)
    game_data['chosen'].append(chosen)
    game_data['rejected'].append(rejected)
    game_data['chosen_reward'].append(chosen_reward)
    game_data['rejected_reward'].append(rejected_reward)
    game_data['win'].append(won)
    game_data['loss'].append(lost)

ratio = win / (win + loss)
print(ratio)
print(win)
print(loss)

print(game_data)

df_results = pd.DataFrame(game_data)
df_results.to_csv('reward_results.csv')
df_results

0.5
25
25
{'prompt': ['It do matter you can keep her hand grip.', 'I do not need it divin.', 'He was here two weeks and i miss him.', 'Because it be get point.', 'You know i have haneve admire president nixon.', 'And they give you they will mail you reimbursements.', 'The people begin plead jesus to leave their region.', 'And we stayed couples years.', 'I will have jerry get them.', 'But you can not go walmart.', 'I do not need it divin.', 'Only heat you have in this room is one register there.', 'Do not fill your bag dear.', 'If you have studio apartment you are stuck it.', 'Hey who bought green cake.', 'I want make sure now.', 'And you turn it and put chocolate side.', 'And i guess he really go.', 'I do not need it divin.', 'Yes they put stuff that was three times money.', 'If you have studio apartment you are stuck it.', 'You can still hear planes.', 'And he comes the cafeterias.', 'I ate today than i have week long though.', 'I have much distance between us.', 'Do not make fun me.'

Unnamed: 0,prompt,chosen,rejected,chosen_reward,rejected_reward,win,loss
0,It do matter you can keep her hand grip.,You're not strong enough to hold her.,You're not strong enough to do it.,0.168457,0.165894,True,False
1,I do not need it divin.,I do not want it.,I do not need it.,0.130859,0.118225,True,False
2,He was here two weeks and i miss him.,He is a great person.,He is a great person and a good friend.,0.000873,-0.024979,True,False
3,Because it be get point.,I like to play the guitar.,I like to play a game.,0.106567,0.196777,False,True
4,You know i have haneve admire president nixon.,i think he was a great leader.,i think he's a great leader.,0.019669,0.020905,False,True
5,And they give you they will mail you reimburse...,They are a good company and they will help you.,They are a good company.,0.030014,0.030289,False,True
6,The people begin plead jesus to leave their re...,jesus is unable to leave.,jesus is unable to leave and begins to scream ...,0.017563,-0.031006,True,False
7,And we stayed couples years.,We were happy to be together.,We were happy with our decision.,0.046631,0.051208,False,True
8,I will have jerry get them.,Jerry is a good person.,Jerry is a good guy.,0.003344,0.01078,False,True
9,But you can not go walmart.,"You can not go to the store, you can not go to...","You can not go to the store, because you are n...",0.07373,0.050934,True,False
