# Autogenerate a sample JD

Let's try to generate a sample JD and JR based on the information that we have here. To do so, we will fine-tune GPT-2 on the JDs and JRs that we have.
- Load up a dataset instance with all of the data
- Write up the trainer instance
- Train!
- Save model
- Test model

## Preprocessing the text files

In [None]:
from transformers import TextDataset, AutoTokenizer, AutoModel, pipeline

In [None]:
# Use the GPT-2 tokenizer so that text encoded with it matches the vocabulary
# of the GPT-2 model that this notebook fine-tunes.
new_tokeniser = AutoTokenizer.from_pretrained("gpt2")

In [None]:
# Plain-text files (relative to the working directory) holding the train and
# test job descriptions, one description per line.
TRAIN_FILE_PATH = "./train_jd.txt"
TEST_FILE_PATH = "./test_jd.txt"

In [None]:
# Drop the rows whose job description is the literal placeholder string 'None'
# and keep the remaining descriptions as an array.
train_text = data_pd_no_dup.loc[
    data_pd_no_dup["job_description"] != 'None', 'job_description'
].values

In [None]:
# Number of job descriptions left after removing the 'None' placeholders.
len(train_text)

826

In [None]:
# Peek at the first five job descriptions.
train_text[:5]

array(['Develop targeted bespoke analytics models to help the venture extract value from, and monetise, data on our platforms, where such value extraction could include the likes of better client decisioning, pricing and opportunity identification..Research, design, implement and validate cutting-edge analytics and data visualization techniques to achieve targeted outcomes, such as bringing predicted outcomes closer to experience while ensuring consistency with the model ecosystem, while identifying opportunities for solutions to be leveraged across applications to broaden their scope of use and improve risk analysis..Collaborate with partner technology teams in setting up an effective model lifecycle platform on the cloud, with possible roles to play in cloud architecture and platform engineering..',
       'Engage business users to discover how knowledge graph and/or NLP can enable operations transformation, such as to enhance operation effectiveness, raise productivity and support d

In [None]:
# Hold out 30% of the descriptions as a test split; RANDOM_STATE fixes the seed.
# NOTE(review): train_test_split and RANDOM_STATE are not defined in this part
# of the notebook — presumably sklearn's helper and a constant set earlier; confirm.
train_text, test_text = train_test_split(train_text, test_size=0.3, random_state=RANDOM_STATE)

In [None]:
# Size of the training split.
len(train_text)

578

In [None]:
# Size of the held-out test split.
len(test_text)

248

In [None]:
def write_text_files(file_path, files):
    """Write each entry of ``files`` to ``file_path``, one entry per line.

    The file is opened in write mode with UTF-8 encoding, so existing content
    is replaced. Every entry is followed by a single newline character.
    """
    with open(file_path, mode='w', encoding='utf8') as out_file:
        out_file.writelines(entry + "\n" for entry in files)

# Persist both splits so that the datasets built from TRAIN_FILE_PATH and
# TEST_FILE_PATH each have a file to read.
write_text_files(TRAIN_FILE_PATH, train_text)
write_text_files(TEST_FILE_PATH, test_text)

In [None]:
# TextDataset cannot be built without arguments: it needs the tokenizer, the
# text file to read and the block size, as in the other datasets in this notebook.
dataset = TextDataset(
    tokenizer=new_tokeniser,
    file_path=TRAIN_FILE_PATH,
    block_size=500,
)

## Finetuning

The fine-tuning of the model actually takes place in Google Colab, because GPUs are available there.

Reference code: https://www.kaggle.com/code/changyeop/how-to-fine-tune-gpt-2-for-beginners

In [None]:
# Build the training dataset from the train text file, tokenised with
# new_tokeniser and using a block size of 500.
train_dataset = TextDataset(
    tokenizer= new_tokeniser,
    file_path=TRAIN_FILE_PATH,
    block_size=500
)

In [None]:
# Build the test dataset the same way as the training one, from the test text file.
# NOTE(review): presumably the file at TEST_FILE_PATH must already exist on disk — confirm it is written beforehand.
test_dataset = TextDataset(
    tokenizer=new_tokeniser,
    file_path=TEST_FILE_PATH,
    block_size=500
)



In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Return a ``TextDataset`` built from the text file at ``file_path``.

    ``tokenizer`` and ``block_size`` (128 by default) are forwarded unchanged
    to the ``TextDataset`` constructor.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )


def load_data_collator(tokenizer, mlm = False):
    """Return a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    ``mlm`` is passed straight through to the collator and defaults to False.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 checkpoint on a plain-text file.

    Args:
        train_file_path: Path of the text file used to build the training dataset.
        model_name: Name or path passed to ``GPT2Tokenizer.from_pretrained`` and
            ``GPT2LMHeadModel.from_pretrained``.
        output_dir: Directory that receives the tokenizer, the initial model
            and the training outputs.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so output_dir holds both.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Snapshot of the pretrained weights before any training step runs.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by this function but silently dropped.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:
# Parameters

# Data locations, resolved against the current working directory.
# NOTE(review): this rebinds TRAIN_FILE_PATH / TEST_FILE_PATH, which were set
# to "./train_jd.txt" and "./test_jd.txt" earlier in the notebook — confirm the
# text files actually live under the 'data' sub-directory.
CURR_DIR = os.getcwd()
DATA_DIR = os.path.join(CURR_DIR,'data')
TRAIN_FILE_PATH = os.path.join(DATA_DIR,"train_jd.txt")
TEST_FILE_PATH = os.path.join(DATA_DIR,"test_jd.txt")

# Training settings.
# NOTE(review): train_file_path and output_dir are not used by the train(...)
# call in the next cell, which passes TRAIN_FILE_PATH and CURR_DIR instead —
# presumably leftovers from the reference code; confirm.
train_file_path = "/content/drive/MyDrive/Articles.txt"
model_name = 'gpt2'
output_dir = '/content/drive/MyDrive/result'
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 5.0
save_steps = 500

In [None]:
# Train!
# Fine-tunes on TRAIN_FILE_PATH; the tokenizer, model and training outputs are
# written to the current working directory (CURR_DIR), not to output_dir.
train(
    train_file_path=TRAIN_FILE_PATH,
    model_name=model_name,
    output_dir=CURR_DIR,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

## Inference

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
def load_model(model_path):
    """Return the ``GPT2LMHeadModel`` loaded from ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return the ``GPT2Tokenizer`` loaded from ``tokenizer_path``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model_path="/content/model"):
    """Generate a continuation of ``sequence`` and print it.

    Args:
        sequence: Prompt text; it is formatted into a string before encoding.
        max_length: Passed to ``model.generate`` as ``max_length``.
        model_path: Directory from which both the model and the tokenizer are
            loaded. Defaults to the location that used to be hard-coded here.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        # Pad with the model's end-of-sequence token id.
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    # Only the first generated sequence is decoded and printed.
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [None]:
# Read the prompt text, then the maximum length (parsed as an integer),
# from standard input and print one generated continuation.
sequence = input() 
max_len = int(input()) 
generate_text(sequence, max_len) 

Check it out on Hugging Face yourself!
https://huggingface.co/hashketh/gpt2-data-science-job-description?text=Job+Description