In [None]:
!pip install gradio



In [None]:
with open("/content/Artificial Intelligence-Machine Learning Explained.txt", "r", encoding="utf-8") as f:
    book_text = f.read()

In [None]:
import re

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-zA-Z0-9.,!?;'\s]", "", text)
    return text.strip()

clean_text = preprocess_text(book_text)
print(clean_text[:500])


artificial intelligence machine learning explained author steve blank gordian knot center for national security innovation httpsgordianknot.stanford.edu artificial intelligencemachine learning explained ai is a onceina lifetime commercial and defense game changer hundreds of billions in public and private capital is being invested in ai and machine learning companies. the number of patents filed in 2021 is more than 30 times higher than in 2015 as companies and countries across the world have re


In [None]:
from collections import Counter

def create_vocab(text, vocab_size=5000):
    words = text.split()
    word_counts = Counter(words)
    vocab = {word: i for i, (word, _) in enumerate(word_counts.most_common(vocab_size))}
    return vocab

vocab = create_vocab(clean_text)
print(list(vocab.items())[:10])
# Print first 10 words and their IDs

[('and', 0), ('the', 1), ('of', 2), ('to', 3), ('a', 4), ('ai', 5), ('in', 6), ('learning', 7), ('for', 8), ('is', 9)]


In [None]:
pip install torch transformers datasets




In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer.pad_token = tokenizer.eos_token

with open("/content/Artificial Intelligence-Machine Learning Explained.txt", "r", encoding="utf-8") as f:
    book_text = f.read()

tokens = tokenizer(book_text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

print("Tokenized sample:", tokens.input_ids[0][:20])
# Show first 20 token IDs

Tokenized sample: tensor([  171,   119,   123,  8001,  9542,  9345,    14,   220,   198, 37573,
        18252,  5905,  1328,   220,   198,   198, 13838,    25,  6542, 31990])


In [None]:
from torch.utils.data import Dataset, DataLoader

class BookDataset(Dataset):
    def __init__(self, tokens):
        self.input_ids = tokens.input_ids
        self.attention_mask = tokens.attention_mask

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx]
        }

dataset = BookDataset(tokens)

train_dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

print(f"Dataset size: {len(dataset)}")


Dataset size: 1


In [None]:
import torch
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Model loaded on:", device)


Model loaded on: cuda


In [None]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

loss_fn = torch.nn.CrossEntropyLoss()




In [None]:
epochs = 20

for epoch in range(epochs):
    print(f"Epoch {epoch + 1} / {epochs}")
    for batch in train_dataloader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Loss: {loss.item()}")


Epoch 1 / 20
Loss: 3.7080495357513428
Epoch 2 / 20
Loss: 3.181497573852539
Epoch 3 / 20
Loss: 2.8919897079467773
Epoch 4 / 20
Loss: 2.605741262435913
Epoch 5 / 20
Loss: 2.354064702987671
Epoch 6 / 20
Loss: 2.138347864151001
Epoch 7 / 20
Loss: 1.9382028579711914
Epoch 8 / 20
Loss: 1.7408156394958496
Epoch 9 / 20
Loss: 1.5506298542022705
Epoch 10 / 20
Loss: 1.3733503818511963
Epoch 11 / 20
Loss: 1.2083147764205933
Epoch 12 / 20
Loss: 1.0516281127929688
Epoch 13 / 20
Loss: 0.903261125087738
Epoch 14 / 20
Loss: 0.7596120238304138
Epoch 15 / 20
Loss: 0.6254317760467529
Epoch 16 / 20
Loss: 0.5005992650985718
Epoch 17 / 20
Loss: 0.3902042806148529
Epoch 18 / 20
Loss: 0.28684258460998535
Epoch 19 / 20
Loss: 0.19608695805072784
Epoch 20 / 20
Loss: 0.12787331640720367


In [None]:
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")

print("Model saved successfully! ✅")


Model saved successfully! ✅


In [None]:
from transformers import pipeline

qa_model = pipeline("text-generation", model="fine_tuned_gpt2", tokenizer="fine_tuned_gpt2")

question = "The Department of Defense has thought that AI"
response = qa_model(question, max_length=50, do_sample=True)

print(response[0]['generated_text'])


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


The Department of Defense has thought that AI will be a critical element in the ongoing defense-academic collaboration and that such a technology could potentially change the balance in the war arena. Today, however, AI is one of the more complex and thought-


In [None]:
import gradio as gr
from transformers import pipeline

def generate_text(prompt):
    qa_model = pipeline("text-generation", model="fine_tuned_gpt2", tokenizer="fine_tuned_gpt2")
    response = qa_model(prompt, max_length=50, do_sample=True)
    return response[0]['generated_text']

iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(placeholder="Enter your text here..."),
    outputs="text",
    title="Fine-Tuned GPT-2 Text Generator",
    description="Enter a prompt and generate AI-driven text completions using your fine-tuned GPT-2 model."
)

iface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3e3d6ea6cfc1a24943.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


