In [1]:
import pandas as pd
from minsearch import Index

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 

torch.random.manual_seed(0)





<torch._C.Generator at 0x7bd00cdd8090>

In [3]:

df = pd.read_csv('../data/gold/data.csv')
documents = df.to_dict(orient='records')

documents[0]

{'id': 0,
 'chapter': 'CHAPTER 1',
 'title': 'Machine Learning Roles and the Interview Process',
 'section': 'Overview of This Book',
 'text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer. Especially for new graduates 2 and readers coming from different industries, this chapter helps get everyone on the 

In [4]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("distilgpt2")
model.eval()




GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [5]:
prompt = """
You are an interviewer preparing for technical interviews for a data scientist position.
Your task is to generate exactly 5 interview questions based on the following text.
The output must be only the questions, don't write an introduction or any other extra text.

The record:

chapter: {chapter}
title: {title}
text: {text}


The questions have to follow the statements 
[Question 1: Question 2: Question 3:, Question 4: Question 5:]
"""

In [7]:
print(prompt.format(**documents[0]))


You are an interviewer preparing for technical interviews for a data scientist position.
Your task is to generate exactly 5 interview questions based on the following text.
The output must be only the questions, don't write an introduction or any other extra text.

The record:

chapter: CHAPTER 1
title: Machine Learning Roles and the Interview Process
text: In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often foc

In [8]:
def llm(prompt):
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True,  max_length=1023)
    with torch.no_grad():
        outputs = model.generate(**inputs, pad_token_id=tokenizer.eos_token_id, num_return_sequences=1,max_new_tokens=50) 
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [9]:
print(llm(prompt))

"\nYou are an interviewer preparing for technical interviews for a data scientist position.\nYour task is to generate exactly 5 interview questions based on the following text.\nThe output must be only the questions, don't write an introduction or any other extra text.\n\nThe record:\n\nchapter: {chapter}\ntitle: {title}\ntext: {text}\n\n\nThe questions have to follow the statements \n[Question 1: Question 2: Question 3:, Question 4: Question 5:]\n[Question 2: Question 3: Question 4: Question 5:]\n[Question 3: Question 5: Question 6: Question 7: Question 8: Question 9: Question 10: Question 11: Question 12: Question 13: Question 14: Question 15"