In [52]:
import pandas as pd
import random

# Generate 50 random examples.
# A dedicated, seeded generator makes the dataset identical on every run
# (Restart & Run All reproduces the same CSV) without touching the global
# `random` state.
SEED = 42
N_EXAMPLES = 50
rng = random.Random(SEED)

# NOTE: both columns are drawn independently of each other, so there is no
# real relationship between kilometers and premium in this synthetic data.
data = {
    "Annual kilometers": [f"{rng.randint(0, 20000)} kms" for _ in range(N_EXAMPLES)],
    "Liability Premium": [rng.randint(100, 500) for _ in range(N_EXAMPLES)]
}

# Create a DataFrame and persist it (without the index column) as CSV
df = pd.DataFrame(data)
df.to_csv('liability_premium_dataset.csv', index=False)

# Display the DataFrame
print(df)


   Annual kilometers  Liability Premium
0           3648 kms                250
1            819 kms                421
2           9012 kms                416
3           8024 kms                285
4           7314 kms                395
5           4572 kms                198
6           3358 kms                460
7          17870 kms                135
8           2848 kms                123
9          19349 kms                438
10         13825 kms                216
11          1041 kms                495
12           976 kms                248
13          3070 kms                140
14          7164 kms                219
15          7623 kms                151
16         16559 kms                294
17         19726 kms                242
18           869 kms                332
19         18390 kms                425
20          6515 kms                286
21         17856 kms                183
22         13746 kms                289
23          7223 kms                281


In [53]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer

# Read the generated CSV back in as a Hugging Face Dataset
csv_path = 'liability_premium_dataset.csv'
dataset = Dataset.from_csv(csv_path)

# Fetch the tokenizer for the 'bert-base-uncased' checkpoint
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize a batch of rows and attach float regression targets.

    Args:
        examples: A batch mapping column name -> list of values, as passed by
            ``dataset.map(..., batched=True)``. Must contain the
            'Annual kilometers' (text) and 'Liability Premium' (numeric) columns.

    Returns:
        The tokenizer output for the 'Annual kilometers' texts, with an added
        'labels' entry holding the premiums converted to ``float``.
    """
    # No padding here: the Trainer is given a DataCollatorWithPadding, which
    # pads each batch to its longest sequence. Padding every short text to the
    # tokenizer's maximum length up front only wastes memory and compute.
    inputs = tokenizer(examples['Annual kilometers'], truncation=True)

    # The model is created with num_labels=1 (single-output regression head,
    # trained with a mean-squared-error loss), which requires floating-point
    # targets; integer labels lead to a dtype mismatch during training.
    inputs['labels'] = [float(premium) for premium in examples['Liability Premium']]
    return inputs

# Run preprocess_function over the whole dataset, one batch of rows at a time
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [54]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

# Load the pre-trained model.
# num_labels=1 gives a single-output head, i.e. the premium is predicted as a
# regression target. The classifier weights are newly initialized at load time
# and only become meaningful after training.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create a data collator (pads each batch to its longest sequence)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Split the dataset into training and evaluation sets.
# - A fixed seed keeps the 80/20 split identical across runs.
# - The guard makes this cell safe to re-run: after the first run
#   `tokenized_dataset` is already a DatasetDict (a dict subclass with
#   'train'/'test' keys), which has no `train_test_split` method.
SPLIT_SEED = 42
if not isinstance(tokenized_dataset, dict):
    tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [55]:
# Train the model
# Runs the training loop on the `trainer` built above (its model, datasets and
# TrainingArguments are fixed when the Trainer is constructed).
# NOTE(review): the saved output for this cell stops at 0/15 steps, so this run
# apparently did not complete — re-run and confirm training finishes.
trainer.train()

  0%|          | 0/15 [00:00<?, ?it/s]

: 

In [51]:
# Evaluate the model
# Runs evaluation on the eval_dataset given to the Trainer and prints the
# returned metrics.
# NOTE(review): this cell's saved execution count (51) is lower than the cells
# above it (52-55), so the printed metrics appear to come from an earlier run —
# re-run this cell after training before relying on the numbers.
results = trainer.evaluate()
print(results)


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.86316978931427, 'eval_runtime': 0.2442, 'eval_samples_per_second': 4.095, 'eval_steps_per_second': 4.095, 'epoch': 3.0}
