In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and echo the full path of every file found in it.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/conversational-data/convo/conversational_dataset.txt
/kaggle/input/conversational-data/convo/pocketGPT-27M/config.json
/kaggle/input/conversational-data/convo/pocketGPT-27M/merges.txt
/kaggle/input/conversational-data/convo/pocketGPT-27M/training_args.bin
/kaggle/input/conversational-data/convo/pocketGPT-27M/tokenizer.json
/kaggle/input/conversational-data/convo/pocketGPT-27M/vocab.json
/kaggle/input/conversational-data/convo/pocketGPT-27M/tokenizer_config.json
/kaggle/input/conversational-data/convo/pocketGPT-27M/model.safetensors
/kaggle/input/conversational-data/convo/pocketGPT-27M/special_tokens_map.json
/kaggle/input/conversational-data/convo/pocketGPT-27M/generation_config.json


In [None]:
# ---------------------------------------------------------
# FIX ENVIRONMENT BEFORE ANYTHING ELSE
# ---------------------------------------------------------

# 1. Fix protobuf issue (transformers bug on Kaggle)
!pip install protobuf==4.25.3 --quiet --force-reinstall

# 2. Remove TensorFlow to avoid cuDNN/cuFFT/cuBLAS spam + protobuf conflict
!pip uninstall -y tensorflow tensorflow-cpu tensorflow-gpu keras

# 3. Fix pyarrow to correct version (Kaggle uses pyarrow-19.x)
!pip install pyarrow==19.0.0 --quiet --force-reinstall --no-deps

# 4. Install ONLY the necessary libraries WITHOUT upgrading pyarrow again
!pip install transformers==4.53.3 datasets==2.19.1 accelerate tokenizers==0.21.2 --quiet

In [2]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

# Directory holding the pretrained pocketGPT-27M weights, config and tokenizer files.
MODEL_DIR = "/kaggle/input/conversational-data/convo/pocketGPT-27M"

tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_DIR)

# Register the special tokens through add_special_tokens() instead of plain
# attribute assignment: it links the attributes AND adds any token that is
# missing from the vocabulary, so every *_token_id resolves to a real id.
tokenizer.add_special_tokens({
    "pad_token": "<|pad|>",
    "bos_token": "<|bos|>",
    "eos_token": "<|eos|>",
    "unk_token": "<|unk|>",
})

model = GPT2LMHeadModel.from_pretrained(MODEL_DIR)
# Keep the embedding matrix in sync with the tokenizer in case tokens were added.
model.resize_token_embeddings(len(tokenizer))

# len(tokenizer) includes added tokens, i.e. the exact size used for the resize
# above; tokenizer.vocab_size would report the base vocabulary only.
print("Tokenizer size:", len(tokenizer))
print("Model loaded.")


Tokenizer size: 24000
Model loaded.


In [3]:
from datasets import load_dataset

DATA_PATH = "/kaggle/input/conversational-data/convo/conversational_dataset.txt"

# Read the file with the "text" loader, then hold out 2% of the rows as a
# "test" split; the fixed seed keeps the split repeatable across runs.
dataset = load_dataset(
    "text",
    data_files={"train": DATA_PATH},
)["train"].train_test_split(test_size=0.02, seed=42)

print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 171321
    })
    test: Dataset({
        features: ['text'],
        num_rows: 3497
    })
})


In [4]:
CONTEXT_WINDOW = 384  # maximum number of tokens kept per example

def tokenize(batch):
    """Tokenize a batch of raw text rows for causal-LM training.

    Args:
        batch: Mapping with a "text" entry holding the strings to encode,
            as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, with each row truncated to at
        most ``CONTEXT_WINDOW`` tokens.

    Rows are intentionally left unpadded. Padding every row to
    ``CONTEXT_WINDOW`` up front inflates the stored dataset and spends
    compute on pad positions; the language-modeling data collator pads each
    batch to its longest row at training time instead.
    """
    return tokenizer(
        batch["text"],
        max_length=CONTEXT_WINDOW,
        truncation=True,
    )

# Encode both splits in batches; the raw "text" column is dropped so only the
# tokenizer's output columns remain.
tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])

print(tokenized)

Map:   0%|          | 0/171321 [00:00<?, ? examples/s]

Map:   0%|          | 0/3497 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 171321
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3497
    })
})


In [6]:
from transformers import DataCollatorForLanguageModeling

# mlm=False: build batches for causal language modeling rather than masked-LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [14]:
# TrainingArguments was used without ever being imported in this notebook,
# which raises NameError on a fresh kernel (Restart & Run All).
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Checkpoints and the final model are written here; existing contents may be overwritten.
    output_dir="./QNA_pocketGPT_27M",
    overwrite_output_dir=True,

    # Schedule and batch sizes (per device; no gradient accumulation).
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,

    # Optimizer settings.
    learning_rate=1e-4,
    warmup_steps=200,
    weight_decay=0.1,

    # Log the training loss every 200 steps.
    logging_steps=200,

    # Evaluate every 500 steps, but only checkpoint once per epoch.
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="epoch",

    # Mixed precision: fp16 on, bf16 off.
    fp16=True,
    bf16=False,

    # Disable external experiment trackers.
    report_to="none",
)


In [15]:
from transformers import Trainer

# Tie together the model, the hyper-parameters, the batch collator and the
# two tokenized splits (train for optimization, test for periodic evaluation).
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
)

In [16]:
# Run the training loop; the cell output shows the step/loss table followed by
# the returned TrainOutput summary.
trainer.train()

Step,Training Loss,Validation Loss
500,1.8114,3.674355
1000,1.7356,3.526501
1500,1.7156,3.4459
2000,1.6394,3.38237
2500,1.6204,3.326509
3000,1.626,3.279655
3500,1.5618,3.24662
4000,1.5766,3.211287
4500,1.6201,3.18435
5000,1.6011,3.158545




TrainOutput(global_step=16062, training_loss=1.4993742799242353, metrics={'train_runtime': 8430.2626, 'train_samples_per_second': 60.966, 'train_steps_per_second': 1.905, 'total_flos': 2.1013593135906816e+16, 'train_loss': 1.4993742799242353, 'epoch': 3.0})

In [19]:
# Switch the model to evaluation mode before generating text; as the last
# expression of the cell, the returned module is echoed as the architecture tree.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(24000, 384)
    (wpe): Embedding(384, 384)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-9): 10 x GPT2Block(
        (ln_1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=1152, nx=384)
          (c_proj): Conv1D(nf=384, nx=384)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=1536, nx=384)
          (c_proj): Conv1D(nf=384, nx=1536)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=384, out_features=24000, bias=False)
)

In [70]:
import torch

def ask(prompt, max_length=384, temperature=0.8, top_p=0.9):
    """Sample a reply to ``prompt`` from the model and print it.

    The prompt is wrapped in the instruction template (a ``<|bos|>`` marker,
    an ``Instruction:`` line and a ``Response:`` cue), encoded, and passed
    to ``model.generate`` with nucleus sampling. Because sampling is enabled,
    repeated calls with the same prompt can print different text.

    Args:
        prompt: The question or instruction text.
        max_length: Upper bound on total sequence length (prompt plus
            generated tokens) given to ``generate``.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        None. The decoded sequence (special tokens stripped) is printed.
    """
    formatted = f"<|bos|>Instruction: {prompt}\nResponse:"

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which is handed to generate() explicitly instead of being inferred.
    encoded = tokenizer(formatted, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            max_length=max_length,
            do_sample=True,
            top_p=top_p,
            temperature=temperature,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [71]:
# Spot-check the model with a sample question; ask() samples its reply, so the
# printed text can differ from run to run.
ask("Explain model overfitting")

Instruction: Explain model overfitting
Response: “The only way to reduce the number of mistakes in the code is to use a function to fix it.”


In [72]:
# Spot-check: a definition-style question about GPT.
ask("What is a GPT?")

Instruction: What is a GPT?
Response: The two most powerful models of computer science are GPT and other models. GPT is a transformer-based model for understanding and solving language problems.


In [77]:
# Spot-check: a machine-learning question. The prompt string is passed to the
# model verbatim, exactly as spelled here.
ask("what is Linear Regression Algorith in Machine Learning?")

Instruction: what is Linear Regression Algorith in Machine Learning?
Response: A supervised machine learning algorithm that can be used to predict the output of a given data set. Output is the best predictions of the data set. Output can be classified as either a classification problem or a regression problem.


In [74]:
# Spot-check: a question about neural networks.
ask("what is an Artificial Neural Network?")

Instruction: what is an Artificial Neural Network?
Response: The neural network is designed for the following task. You need to process data, use the inputs, and make predictions. This is done by creating a neural network architecture that is able to perform the task. Once the input is properly trained, you can use it for a variety of tasks.  The neural network architecture is able to learn the patterns and behaviors of the data. The results of the neural network architecture are based on a combination of input, output, and output variables.


In [60]:
import shutil
from IPython.display import FileLink

# Persist the trained model and the tokenizer into the same folder.
export_dir = "./QNA_pocketGPT_27M"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

# Zip that folder into QNA_pocketGPT_27M.zip in the working directory.
shutil.make_archive("QNA_pocketGPT_27M", "zip", export_dir)

# Last expression of the cell: renders a clickable download link to the archive.
FileLink("QNA_pocketGPT_27M.zip")
