# Installation

In [1]:
!pip install transformers



In [39]:
!pip3 install --upgrade transformers




In [46]:
!pip3 show transformers

Name: transformers
Version: 4.34.0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /Users/macbookpro/anaconda3/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: sentence-transformers


# Setup

In [7]:
import pandas as pd
import numpy as np
import re

In [40]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import PreTrainedTokenizerFast, GPT2TokenizerFast

ImportError: cannot import name 'DataCollatorForQuestionAnswering' from 'transformers' (/Users/macbookpro/anaconda3/lib/python3.10/site-packages/transformers/__init__.py)

# Full-text Training

In [8]:
# Read the full-text data: join every row of the "Text" column, one row per line.
df = pd.read_csv('data/nike_data_fulltext.csv')
data = df["Text"].str.cat(sep='\n')
text_data = re.sub(r'\n+', '\n', data).strip()  # Collapse runs of newlines and trim both ends
# Display only a short preview; echoing the whole corpus bloats the notebook output.
text_data[:500]

'Nike, Inc.(stylized as NIKE) is an American athletic footwear and apparel corporation headquartered near Beaverton, Oregon, United States. It is the world\'s largest supplier of athletic shoes and apparel and a major manufacturer of sports equipment, with revenue in excess of US$46 billion in its fiscal year 2022\nThe company was founded on January 25, 1964, as "Blue Ribbon Sports", by Bill Bowerman and Phil Knight, and officially became Nike, Inc. on May 30, 1971. The company takes its name from Nike, the Greek goddess of victory. Nike markets its products under its own brand, as well as Nike Golf, Nike Pro, Nike+, Air Jordan, Nike Blazers, Air Force 1, Nike Dunk, Air Max, Foamposite, Nike Skateboarding, Nike CR7, and subsidiaries including Air Jordan and Converse (brand). Nike also owned Bauer Hockey from 1995 to 2008, and previously owned Cole Haan, Umbro, and Hurley International. In addition to manufacturing sportswear and equipment, the company operates retail stores under the N

In [10]:
# Prepare the full-text training data.
# Write explicitly as UTF-8 instead of the platform-default encoding so the file
# is the same on every machine; transformers' TextDataset opens its input as UTF-8.
with open("data/train_fulltext.txt", "w", encoding="utf-8") as f:
    f.write(text_data)

In [11]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` from a plain-text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` as ``block_size`` (default 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(file_path=file_path, tokenizer=tokenizer, block_size=block_size)

In [12]:
def load_data_collator(tokenizer, mlm = False):
    """Create the language-modeling data collator used for training.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.
        mlm: Forwarded as the collator's ``mlm`` flag (default ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(mlm=mlm, tokenizer=tokenizer)

In [13]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Train a pretrained GPT-2 checkpoint on a plain-text file and save the result.

    Args:
        train_file_path: Text file turned into the training dataset by ``load_dataset``.
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the model and the
            trainer output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as the checkpoint interval.

    Returns:
        None. The trained model is written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Keep the tokenizer files next to the model so both can be loaded from output_dir.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Snapshot of the starting weights; trainer.save_model() below writes the trained ones.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by train() but never forwarded, so the caller's
        # value was ignored and the library default interval applied instead.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [16]:
# Configuration for the full-text training run (passed to train(...) in the next cell)
train_file_path = "data/train_fulltext.txt"  # file written by the data-preparation cell
model_name = 'gpt2'  # checkpoint name given to from_pretrained for tokenizer and model
output_dir = 'model/model_fulltext'  # where tokenizer and model are saved
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 50000

In [18]:
# Train
# Run the full-text training with the values from the configuration cell.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

loading file vocab.json from cache at /Users/macbookpro/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/vocab.json
loading file merges.txt from cache at /Users/macbookpro/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /Users/macbookpro/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to model/model_fulltext
Configuration saved in model/model_fulltext/config.json
Model weights saved in model/model_fulltext/pytorch_model.bin


# Generate Sample

In [19]:
# Functions
def load_model(model_path):
    """Load a ``GPT2LMHeadModel`` from ``model_path`` and return it."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a ``GPT2Tokenizer`` from ``tokenizer_path`` and return it."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(model_path, sequence, max_length):
    """Sample a continuation of ``sequence`` and print it.

    Args:
        model_path: Directory from which both the model and the tokenizer are loaded.
        sequence: Prompt; it is formatted into a string before being encoded.
        max_length: Passed to ``model.generate`` as ``max_length``.

    Returns:
        None. The decoded first generated sequence is printed.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    prompt_ids = tokenizer.encode(f'{sequence}', return_tensors='pt')

    # Sampling settings: top-k / top-p sampling, padding with the model's EOS token id.
    sampling_options = dict(
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    generated = model.generate(prompt_ids, **sampling_options)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(decoded)

In [55]:
# Config: sample from the model saved by the full-text training run
model1_path = 'model/model_fulltext'  # same directory as output_dir of the full-text run
sequence1 = "[Q] When was the company founded?"  # prompt
max_len = 50  # passed to generate_text as max_length
generate_text(model1_path, sequence1, max_len) 

loading configuration file model/model_fulltext/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_size": 50257
}

loading 

[Q] When was the company founded? [A] August 19, 1997 [b] When was the company founded? [A] September 13, 2003 [b] When was the company founded? [A] October 1, 2001 [b


# Fine-Tune with Q and A format

In [24]:
# Prepare the Q&A data: join every row of the "Text" column, one row per line.
# NOTE: this rebinds df, data and text_data from the full-text section.
df = pd.read_csv('data/nike_data_qanda.csv')
data = df["Text"].str.cat(sep='\n')
text_data = re.sub(r'\n+', '\n', data).strip()  # Collapse runs of newlines and trim both ends

# Write explicitly as UTF-8 instead of the platform-default encoding so the file
# is the same on every machine; transformers' TextDataset opens its input as UTF-8.
with open("data/finetune_qanda.txt", "w", encoding="utf-8") as f:
    f.write(text_data)

In [32]:
def fine_tune(finetune_file_path,model_name,
              model_path,
              output_dir,
              overwrite_output_dir,
              per_device_train_batch_size,
              num_train_epochs,
              save_steps):
    """Continue training a previously saved model on another text file.

    Args:
        finetune_file_path: Text file turned into the training dataset by ``load_dataset``.
        model_name: Name or path given to ``GPT2Tokenizer.from_pretrained``.
        model_path: Directory from which ``load_model`` loads the starting weights.
        output_dir: Directory that receives the tokenizer, the model and the
            trainer output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as the checkpoint interval.

    Returns:
        None. The fine-tuned model is written to ``output_dir``.
    """
    # The tokenizer comes from model_name, the weights from model_path.
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(finetune_file_path, tokenizer)
    data_collator = load_data_finetuned_collator(tokenizer)

    # Keep the tokenizer files next to the model so both can be loaded from output_dir.
    tokenizer.save_pretrained(output_dir)

    model = load_model(model_path)

    # Snapshot of the starting weights; trainer.save_model() below writes the trained ones.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by fine_tune() but never forwarded, so the caller's
        # value was ignored and the library default interval applied instead.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [47]:
def load_data_finetuned_collator(tokenizer, mlm = False):
    """Create the language-modeling data collator used for fine-tuning.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.
        mlm: Forwarded as the collator's ``mlm`` flag (default ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(mlm=mlm, tokenizer=tokenizer)

In [48]:
# Configuration for the Q&A fine-tuning run (passed to fine_tune(...) in the next cell).
# NOTE: model_name is not set here; the fine_tune call reuses the value from the
# full-text configuration cell. output_dir and the settings below are rebound here.
finetune_file_path = "data/finetune_qanda.txt"  # file written by the Q&A data-preparation cell
model_path = 'model/model_fulltext'  # starting weights: output of the full-text run
output_dir = 'model/model_finetuned_qanda'  # where tokenizer and model are saved
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 50000

In [49]:
# Fine-Tune
# Run the Q&A fine-tuning with the values from the configuration cell.
fine_tune(
    finetune_file_path=finetune_file_path,
    model_name=model_name,  # defined in the full-text configuration cell
    model_path=model_path, 
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

loading file vocab.json from cache at /Users/macbookpro/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/vocab.json
loading file merges.txt from cache at /Users/macbookpro/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /Users/macbookpro/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to model/model_finetuned_qanda
Configuration saved in model/model_finetuned_qanda/config.json
Model weights saved in model/model_finetuned_qanda/pytorch_model.bin


In [53]:
# Sample from the fine-tuned Q&A model; the directory must already contain the
# files saved by the fine-tuning run.
model2_path = 'model/model_finetuned_qanda'  # same directory as output_dir of the fine-tuning run
sequence2 = "[Q] Who is Nike CEO?"  # prompt
max_len = 30  # rebinds max_len (50 in the earlier generation cell)
generate_text(model2_path, sequence2, max_len) 

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/macbookpro/anaconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/06/g26v_t9960g7byk34gcmfn7h0000gn/T/ipykernel_9659/393270789.py", line 4, in <module>
    generate_text(model2_path, sequence2, max_len)
  File "/var/folders/06/g26v_t9960g7byk34gcmfn7h0000gn/T/ipykernel_9659/1148284488.py", line 13, in generate_text
    model = load_model(model_path)
  File "/var/folders/06/g26v_t9960g7byk34gcmfn7h0000gn/T/ipykernel_9659/1148284488.py", line 3, in load_model
    model = GPT2LMHeadModel.from_pretrained(model_path)
  File "/Users/macbookpro/anaconda3/lib/python3.10/site-packages/transformers/modeling_utils.py", line 1966, in from_pretrained
  File "/Users/macbookpro/anaconda3/lib/python3.10/site-packages/transformers/configuration_utils.py", line 532, in from_pretrained
    'http://hostname': 'foo.bar:4012'}.` The proxies ar

In [57]:
# Re-read the full-text data for the word statistics below.
# NOTE(review): this path ('data/fullText/...') differs from the
# 'data/nike_data_fulltext.csv' read in the full-text training section — confirm
# which location is current.
df = pd.read_csv('data/fullText/nike_data_fulltext.csv')
data = df["Text"].str.cat(sep='\n')
zz = re.sub(r'\n+', '\n', data).strip()  # Collapse runs of newlines and trim both ends
# Display only a short preview; echoing the whole corpus bloats the notebook output.
zz[:500]

'Nike, Inc.(stylized as NIKE) is an American athletic footwear and apparel corporation headquartered near Beaverton, Oregon, United States. It is the world\'s largest supplier of athletic shoes and apparel and a major manufacturer of sports equipment, with revenue in excess of US$46 billion in its fiscal year 2022\nThe company was founded on January 25, 1964, as "Blue Ribbon Sports", by Bill Bowerman and Phil Knight, and officially became Nike, Inc. on May 30, 1971. The company takes its name from Nike, the Greek goddess of victory. Nike markets its products under its own brand, as well as Nike Golf, Nike Pro, Nike+, Air Jordan, Nike Blazers, Air Force 1, Nike Dunk, Air Max, Foamposite, Nike Skateboarding, Nike CR7, and subsidiaries including Air Jordan and Converse (brand). Nike also owned Bauer Hockey from 1995 to 2008, and previously owned Cole Haan, Umbro, and Hurley International. In addition to manufacturing sportswear and equipment, the company operates retail stores under the N

In [62]:
# Drop apostrophes and turn newlines into spaces so the text splits cleanly on whitespace.
cleaned_data = zz.replace("'", "").replace("\n", " ")

# Whitespace-separated tokens, and the distinct ones among them.
word_array = cleaned_data.split()
unique_words = {*word_array}
unique_word_count = len(unique_words)

# Total word count is printed; the unique word count is the cell's displayed value.
print(len(word_array))
unique_word_count

7097


2780