In [1]:
import os
import psycopg2
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
import torch
import numpy as np
from sklearn.model_selection import train_test_split

# --- Data Preprocessing ---

def get_data():
    conn = psycopg2.connect(
        "dbname=interviewhell",
        user="postgres",
        password="zaq12WSX",
        host="localhost",
        port="5432"
    )
    cursor = conn.cursor()
    cursor.execute("select joboffers.title, categories.name, questions.text from questions, joboffers, categories;")
    result = cursor.fetchall()
    conn.close()
    
    df = pd.DataFrame(result, columns=['job_title', 'category', 'question'])
    return df

df = get_data()

df = df.drop_duplicates(subset=['question'])

def format_prompt(row):
    return f"Job Title: {row['job_title']}, \nCategory: {row['category']}. \n### Instruction: Generate an interview question for the given job title and category. \n### Response: {row['question']}"


df['prompt'] = df.apply(format_prompt, axis=1)

# Split the dataset
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['category'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['category'])

# --- Model and Tokenization ---
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["prompt"],
        max_length=128,
        padding="max_length",
        truncation=True
    )
    model_inputs["labels"] = tokenizer(
        examples["question"],
        max_length=128,
        padding="max_length",
        truncation=True
    )["input_ids"]
    return model_inputs

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# --- Training ---
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Not using masked language modeling
)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_steps=10,  # Log loss every 10 steps
    save_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=4,  # Adjust batch size if needed
    per_device_eval_batch_size=4,
    weight_decay=0.01,  # Helps prevent overfitting
    load_best_model_at_end=True,
    metric_for_best_model="loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

# --- Inference Function ---
def generate_question(job_title, category):
    prompt = f"Job Title: {job_title}, Category: {category}. Generate a relevant interview question."
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_length=50, num_return_sequences=1, temperature=0.7, top_k=50)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
print(generate_question("Torture Tester", "Technical"))


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 36/36 [00:00<00:00, 2882.63 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 1079.23 examples/s]
  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,2.220159
2,3.497800,1.115236
3,1.538200,0.981234
4,0.942000,0.904635
5,0.786400,0.865809
6,0.640600,0.837377
7,0.545600,0.814699
8,0.470700,0.798863
9,0.460800,0.795104
10,0.426000,0.794932


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Job Title: Torture Tester, Category: Technical. Generate a relevant interview question. Generate an interview question for the given job title and category. Generate an interview question for the given job title and category. Generate an interview question


In [2]:
print(generate_question("Torture Tester", "Technical"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Job Title: Torture Tester, Category: Technical. Generate a relevant interview question. Generate an interview question for the given job title and category. Generate an interview question for the given job title and category. Generate an interview question
