<a href="https://colab.research.google.com/github/Cchancee/careerInsightBot/blob/main/entry.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install transformers datasets

# Import important libraries

In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import random

# Load the dataset

In [None]:
# Read the career Q&A CSV from the Colab working directory.
df = pd.read_csv("/content/Career QA Dataset.csv")

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly or it is silently discarded.
display(df.head())
df.shape

(1620, 3)

In [None]:
# Work on a fixed-size random subset; the seed keeps the draw repeatable.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42

df = df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
df.shape

(500, 3)

# Check whether the dataset has any missing values

In [None]:
# Per-column flag: True if that column contains at least one missing value.
has_missing = df.isna().any()
print(has_missing)

role        False
question    False
answer      False
dtype: bool


# Convert Q&A pairs into a dataset

In [None]:
# Render every (role, question, answer) row as one training string in the
# "Role / Question / Answer" layout, one field per line.
texts = [
    f"Role: {role}\nQuestion: {question}\nAnswer: {answer}\n"
    for role, question, answer in zip(df["role"], df["question"], df["answer"])
]

# Wrap the strings in a single-column Hugging Face dataset and show the first record.
dataset = Dataset.from_dict({"text": texts})
print(dataset[0])

{'text': 'Role: AI Researcher\nQuestion: What does a typical day look like for an AI Researcher?\nAnswer: A typical day involves designing experiments, writing and testing code, analyzing results, and reading academic papers. AI Researchers also spend time collaborating with other researchers or teams.\n'}


# Load tokenizer & model

In [None]:
# Tokenizer and weights are both loaded from the same base checkpoint.
BASE_CHECKPOINT = "gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(BASE_CHECKPOINT)
# GPT-2 ships without a padding token, so the end-of-sequence token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(BASE_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Tokenize dataset

In [None]:
def tokenize_function(examples, max_length=256):
    """Tokenize a batch of training texts, truncating but not padding them.

    Padding is deliberately left to the data collator, which pads each batch
    only to the length of its longest member. Padding every example to
    ``max_length`` up front makes the model process padding tokens on every
    step for no benefit.

    Args:
        examples: Batch mapping with a ``"text"`` entry holding a list of strings
            (the form ``Dataset.map(..., batched=True)`` passes in).
        max_length: Maximum number of tokens kept per example; longer texts
            are truncated.

    Returns:
        The tokenizer output for the batch (token ids and attention masks of
        varying length).
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)


# The raw strings are not model inputs, so the "text" column is dropped and
# only the tokenized fields remain in the training set.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

# Data collator

In [None]:
# Collator for causal (next-token) language modelling: masked-LM mode is switched off.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Training setup

In [None]:
import inspect

# Fine-tuning configuration. With a per-device batch of 4 and 2 accumulation
# steps, each optimizer update sees 8 examples per device.
training_args = TrainingArguments(
    output_dir="./career_bot_gpt2NewEntry",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    save_steps=100,          # checkpoint every 100 optimizer steps ...
    save_total_limit=2,      # ... keeping only the two most recent
    logging_steps=50,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    report_to=[],            # empty list: no external logging integrations
)

# Recent transformers releases deprecate Trainer's `tokenizer=` argument in
# favour of `processing_class=`. Pick whichever keyword the installed version
# accepts so the cell neither warns on new releases nor breaks on old ones.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)

  trainer = Trainer(


# Fine-tune

In [None]:
# Run the fine-tuning loop; the returned summary is shown as the cell output.
train_output = trainer.train()
train_output

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,2.5639
100,1.6454
150,1.3263




TrainOutput(global_step=189, training_loss=1.7127627418154763, metrics={'train_runtime': 6936.8596, 'train_samples_per_second': 0.216, 'train_steps_per_second': 0.027, 'total_flos': 195969024000000.0, 'train_loss': 1.7127627418154763, 'epoch': 3.0})

# Save the fine-tuned model

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same folder
# so the pair can be reloaded together from one path.
SAVE_DIR = "./career_bot_gpt2NewEntry"

model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

('./career_bot_gpt2NewEntry/tokenizer_config.json',
 './career_bot_gpt2NewEntry/special_tokens_map.json',
 './career_bot_gpt2NewEntry/vocab.json',
 './career_bot_gpt2NewEntry/merges.txt',
 './career_bot_gpt2NewEntry/added_tokens.json')

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Shell copy of the saved model folder into the top level of MyDrive; the
# prediction cell later loads the model from that Drive location.
!cp -r /content/career_bot_gpt2NewEntry /content/drive/MyDrive/


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Prediction

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload the fine-tuned tokenizer and model from the copy stored on Drive.
# These rebind the `tokenizer` and `model` names used during training.
model_path = "/content/drive/MyDrive/career_bot_gpt2NewEntry"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_path),
    AutoModelForCausalLM.from_pretrained(model_path),
)


In [None]:
# System-style preamble that is prepended to every prompt sent to the model.
# Each sentence keeps its trailing space so the pieces join into one line.
_persona_sentences = (
    "You are Kazi, a friendly, and brutally honest career coach for creatives. ",
    "You speak casually, and give practical advice. ",
)
persona = "".join(_persona_sentences)


In [None]:
def chat(question, max_length=400):
    """Generate Kazi's reply to a single user question.

    The prompt is the persona text followed by a ``User:`` line containing the
    question and an open ``Kazi:`` line for the model to continue.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        max_length: Maximum number of NEW tokens to generate (passed to
            ``generate`` as ``max_new_tokens``), not the total sequence length.

    Returns:
        The generated continuation as a string with surrounding whitespace
        stripped. Sampling is enabled, so repeated calls can return different text.
    """
    # NOTE(review): the fine-tuning texts in this notebook use a
    # "Role: / Question: / Answer:" layout, while this prompt uses
    # "User: / Kazi:" — confirm the mismatch is intentional.
    prompt = f"{persona}\nUser: {question}\nKazi:"

    # Keep the encoded prompt on the same device as the model so generation
    # also works when the model has been moved off the CPU.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        repetition_penalty=1.2,
        eos_token_id=tokenizer.eos_token_id,
        # The end-of-sequence id is passed as the padding id as well.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on "Kazi:" would drop part of the reply whenever the model
    # itself emits "Kazi:" again inside its continuation.
    generated_ids = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


In [None]:
# Quick smoke test: ask one question and print the generated reply.
user_question = "What is the role of data analyst"
reply = chat(user_question)
print(reply)


Data analysts work closely with brands to analyze user behavior in various industries including finance; marketing/advertising, e-commerce, advertising platforms, tech companies, media organizations…I do my best to provide insights that can help startups grow or improve their businesses by optimizing ad performance across different content channels."


In [None]:
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

# Reference answer from your dataset
reference = [
    "Essential skills include proficiency in SQL, Excel, Python or R, data visualization tools like Tableau, and strong analytical thinking."
]

# sentence_bleu expects a list of tokenized references: one word list per reference.
reference_tokens = [ref.split() for ref in reference]

# Generated answer from your model, tokenized the same way (whitespace split).
generated = chat("What skills are required to become a Data Analyst?")
generated_tokens = generated.split()

# Without smoothing, sentence-level BLEU collapses to a value that is
# effectively zero as soon as one n-gram order (typically 3- or 4-grams) has
# no match, which is the usual case for a single short answer. Smoothing keeps
# the score informative when only lower-order n-grams overlap.
smoother = SmoothingFunction().method1
score = sentence_bleu(
    reference_tokens,
    generated_tokens,
    smoothing_function=smoother,
)
print("BLEU score:", score)



BLEU score: 2.0449263018643206e-155
