In [49]:
train_path = '../data/train_dataset.txt'
test_path = '../data/test_dataset.txt'

In [50]:
corpus = open('../data/PDS2.txt').read()

In [51]:
training_index = int(len(corpus) * .8)

In [52]:
with open(train_path, 'w') as train_file:
    train_file.write(corpus[:training_index])

In [53]:
with open(test_path, 'w') as train_file:
    train_file.write(corpus[training_index:])

In [54]:
print(f'Training Corpus Size: {len(corpus[:training_index])}')
print(f'Testing Corpus Size: {len(corpus[training_index:])}')

Training Corpus Size: 472508
Testing Corpus Size: 118128


In [55]:
from transformers import GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling

tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

train_dataset = TextDataset(
      tokenizer=tokenizer,
      file_path=train_path,
      block_size=128)

test_dataset = TextDataset(
      tokenizer=tokenizer,
      file_path=test_path,
      block_size=128)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)

In [121]:
print(tokenizer.decode(train_dataset[100]))

 it probably performed terribly on 80% of the forms! This is because the
computer was probably great at simple forms. The claims that would have taken a human
minutes to compute took the computer seconds. But these minutes add up, and before you
know it, each human is being saved over an hour a day!
Forms that might be easy for a human to read are also likely easy for the computer. It's
when the forms are very terse, or when the writer starts deviating from the usual grammar,
that the computer starts to fail. This model is great because it lets the humans spend more


In [122]:
print(tokenizer.decode(train_dataset[101]))


time on those difficult claims and gives them more attention without getting distracted by
the sheer volume of papers.
Note that I used the word "model." Remember that a model is a
relationship between elements. In this case, the relationship is between
written words and the approval status of a claim.

[ 24 ]

How to Sound Like a Data Scientist

Chapter 1

Case study – marketing dollars
A dataset shows the relationships between TV, radio, and newspaper sales. The goal is to
analyze the relationships between the three different marketing mediums and how they
affect the sale of


In [72]:
print(tokenizer.decode(train_dataset[1]))


Every effort has been made in the preparation of this book to ensure the accuracy of the information presented.
However, the information contained in this book is sold without warranty, either express or implied. Neither the
authors, nor Packt Publishing or its dealers and distributors, will be held liable for any damages caused or alleged to
have been caused directly or indirectly by this book.
Packt Publishing has endeavored to provide trademark information about all of the companies and products
mentioned in this book by the appropriate use of capitals. However, Packt Publishing cannot guarantee the accuracy
of this information.
Commissioning Editor: Amey


In [74]:
print(f'Training Dataset Size: {len(train_dataset)}')
print(f'Testing Dataset Size: {len(test_dataset)}')

Training Dataset Size: 920
Testing Dataset Size: 253


In [107]:
from transformers import pipeline
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained('distilgpt2')

pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer='distilgpt2',
    config={'max_length': 200,  'do_sample': True, 'top_p': 0.9, 'temperatire': 0.7, 'top_k': 10}
)

loading configuration file https://huggingface.co/distilgpt2/resolve/main/config.json from cache at /Users/sinanozdemir/.cache/huggingface/transformers/f985248d2791fcff97732e4ee263617adec1edb5429a2b8421734c6d14e39bee.422318838d1ec4e061efb4ea29671cb2a044e244dc69229682bebd7cacc81631
Model config GPT2Config {
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_

In [124]:
print('----------')
for generated_sequence in pretrained_generator('A dataset shows the relationships', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

Using pad_token, but it is not set yet.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------
A dataset shows the relationships between the number of users between individual users and their current active activity.



A representative survey found that 70 per cent of US women of the age of 18 had an active activity pattern between August 20 and 1 2014
----------
A dataset shows the relationships between two datasets to date.



References
(3) A Permanente, Y. (2016). The relationship between the model and the data. NBER Working Paper , Volume 2, Issue 2,
----------
A dataset shows the relationships between the largest sub-groups of children in the country and the most common type of school-aged primary education in the country. Data for the U.S. Census, 2009, and 2010 were presented in this report.
----------


In [68]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2_pds", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=2, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps = 10, # Number of update steps between two evaluations.
    warmup_steps=100,# number of warmup steps for learning rate scheduler,
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [69]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 253
  Batch size = 64


{'eval_loss': 4.090301036834717,
 'eval_runtime': 78.1926,
 'eval_samples_per_second': 3.236,
 'eval_steps_per_second': 0.051}

In [75]:
trainer.train()

***** Running training *****
  Num examples = 920
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 58


Step,Training Loss
10,4.0163
20,3.9107
30,3.8353
40,3.6908
50,3.6577




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=58, training_loss=3.7913445439831963, metrics={'train_runtime': 1468.1333, 'train_samples_per_second': 1.253, 'train_steps_per_second': 0.04, 'total_flos': 60098252636160.0, 'train_loss': 3.7913445439831963, 'epoch': 2.0})

In [76]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 253
  Batch size = 64


{'eval_loss': 3.6889734268188477,
 'eval_runtime': 63.1809,
 'eval_samples_per_second': 4.004,
 'eval_steps_per_second': 0.063,
 'epoch': 2.0}

In [79]:
trainer.save_model()

Saving model checkpoint to ./gpt2_pds
Configuration saved in ./gpt2_pds/config.json
Model weights saved in ./gpt2_pds/pytorch_model.bin


In [109]:
from transformers import pipeline

loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_pds')

finetuned_generator = pipeline(
    'text-generation', model=loaded_model, tokenizer=tokenizer,
    config={'max_length': 200,  'do_sample': True, 'top_p': 0.9, 'temperatire': 0.7, 'top_k': 10}
)

loading configuration file ./gpt2_pds/config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.10.0",
  "use

In [126]:
print('----------')
for generated_sequence in finetuned_generator('A dataset shows the relationships', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

Using pad_token, but it is not set yet.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------
A dataset shows the relationships between time, or the relationships between time and time, used to make a general rule. This rule makes sense if we apply other methods of estimating time and have an assumption that the distance between data points is different than that used
----------
A dataset shows the relationships between the average age of two children and their parents by age 15, and the average distance between each child and their parents in their neighborhood
by age 15 between their parents and their parents in their neighborhood
by age 15,
----------
A dataset shows the relationships between various characteristics, including:


(1) means that

(2) means both
and
(3) means that
(4) means that
(5) means that
(6)
----------
