In [None]:
!pip3 install torch torchvision

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator

# Sample text data
text = """GUVI, founded with the mission to democratize technical education, is an innovative edtech platform designed to offer high-quality, accessible online technology courses. The platform stands out by providing education in multiple native languages, making it easier for non-English speakers to learn coding and technology skills.
GUVI's mission is to bridge the gap between education and employment by offering affordable, quality education that is accessible to all. Their vision is to empower individuals from diverse backgrounds with the technical skills needed to succeed in the modern workforce. By removing language barriers and providing practical, hands-on learning experiences, GUVI aims to create a global community of skilled professionals.Zen Class is GUVI's flagship program that offers immersive, job-oriented training in various technology domains. The program is designed to provide hands-on experience and mentorship, ensuring that learners are job-ready upon completion.GUVI stands as a testament to the power of accessible and inclusive education. By providing high-quality technical education in native languages, GUVI is breaking down barriers and creating opportunities for learners worldwide. Whether through their comprehensive courses, interactive platforms like Codekata and Webkata, or innovative programs like Zen Class, GUVI is committed to empowering the next generation of tech professionals.
GUVI stands as a testament to the power of accessible and inclusive education. By providing high-quality technical education in native languages, GUVI is breaking down barriers and creating opportunities for learners worldwide. Whether through their comprehensive courses, interactive platforms like Codekata and Webkata, or innovative programs like Zen Class, GUVI is committed to empowering the next generation of tech professionals.
GUVI's innovative approach to education, combined with its focus on inclusivity and quality, has made it a significant player in the EdTech industry. By offering courses in multiple languages, providing hands-on learning experiences, and ensuring industry relevance, GUVI continues to empower learners and professionals to excel in the technology sector.
Codekata is an online platform within GUVI where learners can practice coding problems and improve their problem-solving skills. It offers a variety of challenges that range from beginner to advanced levels, helping students to:
Develop logical thinking and coding proficiency.
Prepare for coding interviews and competitive programming contests.
Gain practical experience through hands-on problem-solving."""

# Tokenize the text
# The text is lower-cased explicitly before being handed to the 'basic_english' tokenizer.
tokenizer = get_tokenizer('basic_english')
tokens = tokenizer(text.lower())

# Build the vocabulary
# NOTE(review): `counter` is not referenced again in the visible cells.
counter = Counter(tokens)
vocab = build_vocab_from_iterator([tokens], specials=['<unk>', '<pad>'])
# Look-ups of tokens that are not in the vocabulary fall back to the '<unk>' index.
vocab.set_default_index(vocab['<unk>'])

# Numericalize the text
data = [vocab[token] for token in tokens]

# Convert data to tensors and create batches
# Window size handed to create_batches below (the last id of each window is the target).
seq_length = 30
def create_batches(data, seq_length):
    """Slice a token-id sequence into next-token prediction examples.

    Every window of ``seq_length`` consecutive ids yields one example: the
    first ``seq_length - 1`` ids are the model input and the final id is the
    target.

    Args:
        data: Sequence of integer token ids.
        seq_length: Window size (context length plus the one target token).

    Returns:
        A tuple ``(inputs, targets)`` of ``torch.long`` tensors with shapes
        ``(num_windows, seq_length - 1)`` and ``(num_windows,)``. When
        ``data`` is shorter than ``seq_length`` both tensors are empty but
        keep those ranks.
    """
    # +1 so the final full window (ending on the last token) is included;
    # without it the last token of the corpus is never used as a target.
    num_windows = len(data) - seq_length + 1
    if num_windows <= 0:
        empty_inputs = torch.empty((0, max(seq_length - 1, 0)), dtype=torch.long)
        empty_targets = torch.empty((0,), dtype=torch.long)
        return empty_inputs, empty_targets

    sequences = [data[i:i + seq_length] for i in range(num_windows)]
    inputs = torch.tensor([seq[:-1] for seq in sequences], dtype=torch.long)
    targets = torch.tensor([seq[-1] for seq in sequences], dtype=torch.long)
    return inputs, targets

inputs, targets = create_batches(data, seq_length)

# Ordered 80/20 split: the first 80% of windows train, the remainder validate.
split_idx = int(0.8 * len(inputs))
train_data, val_data = inputs[:split_idx], inputs[split_idx:]
train_targets, val_targets = targets[:split_idx], targets[split_idx:]

# DataLoader
train_dataset = torch.utils.data.TensorDataset(train_data, train_targets)
val_dataset = torch.utils.data.TensorDataset(val_data, val_targets)
# Only the training loader shuffles; validation keeps the original order.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=20)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=20)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")




In [None]:
import torch.nn as nn
import torch.optim as optim

class RNNModel(nn.Module):
    """Embedding -> single nn.RNN -> linear head over the last time step."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        """Create the embedding table, the batch-first RNN and the output projection."""
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return one row of ``output_dim`` scores per input sequence in ``x``."""
        seq_out, _ = self.rnn(self.embedding(x))
        # Only the representation at the final position feeds the classifier.
        last_step = seq_out[:, -1, :]
        return self.fc(last_step)

# Hyperparameters for the RNN; the output layer scores every vocabulary entry.
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 100
output_dim = vocab_size

# Build the model on the selected device, then its loss and optimizer.
model = RNNModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters())


In [None]:
def train_epoch(loader, model, criterion, optimizer):
    """Run one optimisation pass over ``loader`` and return the mean per-batch loss.

    Batches are moved to the notebook-level ``device`` before the forward pass.
    """
    model.train()
    batch_losses = []
    for batch_inputs, batch_targets in loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        loss = criterion(model(batch_inputs), batch_targets)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
    # Average over the number of batches, not the number of samples.
    return sum(batch_losses) / len(loader)

def evaluate(loader, model, criterion):
    """Return the mean per-batch loss of ``model`` over ``loader`` without updating weights.

    Batches are moved to the notebook-level ``device`` before the forward pass.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch_inputs, batch_targets in loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)
            batch_losses.append(criterion(model(batch_inputs), batch_targets).item())
    # Average over the number of batches, not the number of samples.
    return sum(batch_losses) / len(loader)

# Epochs are numbered 1..14; each one trains first and then scores the validation split.
for epoch in range(1, 15):
    train_loss, val_loss = (
        train_epoch(train_loader, model, criterion, optimizer),
        evaluate(val_loader, model, criterion),
    )
    print(f'Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')


Epoch 1, Train Loss: 1.3737, Val Loss: 5.4773
Epoch 2, Train Loss: 1.2230, Val Loss: 5.5310
Epoch 3, Train Loss: 1.0421, Val Loss: 5.5547
Epoch 4, Train Loss: 0.9228, Val Loss: 5.5912
Epoch 5, Train Loss: 0.7977, Val Loss: 5.6246
Epoch 6, Train Loss: 0.7423, Val Loss: 5.6617
Epoch 7, Train Loss: 0.6910, Val Loss: 5.6655
Epoch 8, Train Loss: 0.5697, Val Loss: 5.7181
Epoch 9, Train Loss: 0.5146, Val Loss: 5.7437
Epoch 10, Train Loss: 0.4490, Val Loss: 5.7695
Epoch 11, Train Loss: 0.4023, Val Loss: 5.8267
Epoch 12, Train Loss: 0.3772, Val Loss: 5.8349
Epoch 13, Train Loss: 0.3277, Val Loss: 5.8572
Epoch 14, Train Loss: 0.3022, Val Loss: 5.8813


In [None]:
def generate_text(model, seed_text, vocab, tokenizer, next_words=50, temperature=1.0):
    """Extend ``seed_text`` by sampling ``next_words`` tokens from ``model``.

    The seed is lower-cased, tokenized and mapped through ``vocab``. Each step
    feeds the whole sequence so far to the model, divides the scores by
    ``temperature``, samples one token id from the softmax distribution and
    appends it to the running input.

    Returns:
        The original seed followed by the sampled tokens, space separated.
    """
    model.eval()
    seed_ids = [vocab[tok] for tok in tokenizer(seed_text.lower())]
    input_ids = torch.tensor(seed_ids, dtype=torch.long).unsqueeze(0).to(device)

    pieces = [seed_text]
    with torch.no_grad():
        for _ in range(next_words):
            # Drop the batch axis, then apply temperature scaling to the scores.
            logits = model(input_ids).squeeze(0) / temperature
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            next_token_id = torch.multinomial(probabilities, num_samples=1).item()
            pieces.append(vocab.lookup_token(next_token_id))
            # Grow the context by the token that was just sampled.
            next_input = torch.tensor([[next_token_id]], dtype=torch.long).to(device)
            input_ids = torch.cat((input_ids, next_input), dim=1)

    return ' '.join(pieces)

seed_text = "GUVI is"
generated_text = generate_text(
    model, seed_text, vocab, tokenizer, next_words=50, temperature=0.7
)
print(generated_text)


GUVI is committed to empowering the next generation of tech professionals . guvi stands as a testament to the power of accessible and inclusive education . by providing high-quality technical education in native languages , guvi is breaking down barriers and creating opportunities for learners worldwide . whether through their comprehensive courses


In [None]:
class LSTMModel(nn.Module):
    """Embedding -> single nn.LSTM -> linear head over the last time step."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        """Create the embedding table, the batch-first LSTM and the output projection."""
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return one row of ``output_dim`` scores per input sequence in ``x``."""
        # The final (hidden, cell) state pair is not needed; only per-step outputs are used.
        seq_out, _state = self.lstm(self.embedding(x))
        last_step = seq_out[:, -1, :]
        return self.fc(last_step)

# Swap in the LSTM (same hyperparameters as before) with a fresh loss and optimizer.
model = LSTMModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters())

# Train the LSTM for epochs 1..14, scoring the validation split after each one.
for epoch in range(1, 15):
    train_loss, val_loss = (
        train_epoch(train_loader, model, criterion, optimizer),
        evaluate(val_loader, model, criterion),
    )
    print(f'Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')


Epoch 1, Train Loss: 5.0773, Val Loss: 5.0792
Epoch 2, Train Loss: 4.8770, Val Loss: 5.0834
Epoch 3, Train Loss: 4.5805, Val Loss: 5.3581
Epoch 4, Train Loss: 4.0731, Val Loss: 5.3802
Epoch 5, Train Loss: 3.6619, Val Loss: 5.6114
Epoch 6, Train Loss: 3.2762, Val Loss: 5.8233
Epoch 7, Train Loss: 2.9726, Val Loss: 5.7074
Epoch 8, Train Loss: 2.6050, Val Loss: 6.0344
Epoch 9, Train Loss: 2.3269, Val Loss: 5.9016
Epoch 10, Train Loss: 2.0446, Val Loss: 6.0791
Epoch 11, Train Loss: 1.7882, Val Loss: 6.1225
Epoch 12, Train Loss: 1.6151, Val Loss: 6.0641
Epoch 13, Train Loss: 1.4262, Val Loss: 6.2762
Epoch 14, Train Loss: 1.2739, Val Loss: 6.1950


In [None]:
def generate_text(model, seed_text, vocab, tokenizer, next_words=50, temperature=1.0):
    """Sample a ``next_words``-token continuation of ``seed_text`` from ``model``.

    The model is called on the full running sequence at every step; its scores
    are divided by ``temperature`` before one token is drawn from the softmax
    distribution.

    Returns:
        ``seed_text`` followed by the sampled tokens, joined by single spaces.
    """
    model.eval()
    context = torch.tensor(
        [vocab[word] for word in tokenizer(seed_text.lower())],
        dtype=torch.long,
    ).unsqueeze(0).to(device)

    generated_text = seed_text
    with torch.no_grad():
        for _step in range(next_words):
            scores = model(context).squeeze(0)  # batch axis removed
            scaled_scores = scores / temperature
            distribution = torch.nn.functional.softmax(scaled_scores, dim=-1)
            sampled_id = torch.multinomial(distribution, num_samples=1).item()
            generated_text = generated_text + ' ' + vocab.lookup_token(sampled_id)
            # Append the sampled id so the next step conditions on it.
            sampled = torch.tensor([[sampled_id]], dtype=torch.long).to(device)
            context = torch.cat((context, sampled), dim=1)

    return generated_text

seed_text = "Codekata"
generated_text = generate_text(
    model, seed_text, vocab, tokenizer, next_words=50, temperature=1.0
)
print(generated_text)

Codekata removing barriers and ensuring experiences job-ready webkata combined advanced community and s of class . providing hands-on learning on experiences testament , and platforms practical . , innovative learning a , to individuals empowering in experience multiple languages and providing hands-on languages , ensuring it quality hands-on s . whether


In [None]:
import torch.nn as nn
import torch.optim as optim
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    Args:
        d_model: Embedding width.
        dropout: Dropout probability applied after the encoding is added.
        max_len: Longest sequence length covered by the precomputed table.
        batch_first: Layout of the tensors passed to ``forward``. ``False``
            (default, the previous behaviour) expects ``(seq, batch, d_model)``;
            ``True`` expects ``(batch, seq, d_model)``.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000, batch_first=False):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine column.
        pe[:, 1::2] = torch.cos(position * div_term)[:, : d_model // 2]
        # Insert a broadcastable batch axis: (1, max_len, d) or (max_len, 1, d).
        pe = pe.unsqueeze(0) if batch_first else pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encoding of each position, with dropout applied."""
        if self.batch_first:
            x = x + self.pe[:, :x.size(1)]
        else:
            x = x + self.pe[:x.size(0)]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Transformer encoder that scores the next token from a batch of id sequences.

    Input is ``(batch, seq)`` token ids, as produced by the data loaders, so
    every sub-module runs batch-first. Previously the encoder used the default
    sequence-first layout, so attention mixed different samples of a batch
    instead of the tokens of one sequence.

    Args:
        vocab_size: Number of embedding rows.
        embedding_dim: Model width; must be divisible by ``nhead``.
        nhead: Number of attention heads.
        num_encoder_layers: Number of stacked encoder layers.
        hidden_dim: Width of each layer's feed-forward sub-layer.
        output_dim: Number of output scores per sequence.
    """

    def __init__(self, vocab_size, embedding_dim, nhead, num_encoder_layers, hidden_dim, output_dim):
        super().__init__()
        # Kept on the instance so forward() does not depend on a notebook global.
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = PositionalEncoding(embedding_dim, batch_first=True)
        encoder_layers = nn.TransformerEncoderLayer(embedding_dim, nhead, hidden_dim, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        """Map ``(batch, seq)`` token ids to ``(batch, output_dim)`` scores."""
        embedded = self.embedding(x) * math.sqrt(self.embedding_dim)
        embedded = self.pos_encoder(embedded)
        output = self.transformer_encoder(embedded)
        # The last position has attended to the whole sequence; use it for prediction.
        output = self.fc(output[:, -1, :])
        return output

# Hyperparameters for the transformer; the output layer scores every vocabulary entry.
vocab_size = len(vocab)
embedding_dim = 200
nhead = 2
num_encoder_layers = 2
hidden_dim = 200
output_dim = vocab_size

# Build the model on the selected device, then its loss and optimizer.
model = TransformerModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters())




In [None]:
def train_epoch(loader, model, criterion, optimizer):
    """Train ``model`` for one pass over ``loader``; return the mean per-batch loss."""
    model.train()
    loss_sum = 0
    for features, labels in loader:
        features, labels = features.to(device), labels.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(features), labels)
        batch_loss.backward()
        optimizer.step()
        loss_sum = loss_sum + batch_loss.item()
    num_batches = len(loader)
    return loss_sum / num_batches

def evaluate(loader, model, criterion):
    """Score ``model`` on ``loader`` with gradients disabled; return the mean per-batch loss."""
    model.eval()
    loss_sum = 0
    with torch.no_grad():
        for features, labels in loader:
            features, labels = features.to(device), labels.to(device)
            loss_sum = loss_sum + criterion(model(features), labels).item()
    num_batches = len(loader)
    return loss_sum / num_batches

# Epochs are numbered 1..14; each one trains first and then scores the validation split.
for epoch in range(1, 15):
    train_loss, val_loss = (
        train_epoch(train_loader, model, criterion, optimizer),
        evaluate(val_loader, model, criterion),
    )
    print(f'Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')


Epoch 1, Train Loss: 1.1442, Val Loss: 6.9056
Epoch 2, Train Loss: 1.1858, Val Loss: 6.9694
Epoch 3, Train Loss: 1.0719, Val Loss: 7.0800
Epoch 4, Train Loss: 1.1453, Val Loss: 7.1768
Epoch 5, Train Loss: 1.1096, Val Loss: 7.3371
Epoch 6, Train Loss: 1.0951, Val Loss: 7.4274
Epoch 7, Train Loss: 1.0772, Val Loss: 7.4718
Epoch 8, Train Loss: 0.9924, Val Loss: 7.4860
Epoch 9, Train Loss: 1.1151, Val Loss: 7.5202
Epoch 10, Train Loss: 0.9903, Val Loss: 7.4964
Epoch 11, Train Loss: 1.0484, Val Loss: 7.5559
Epoch 12, Train Loss: 1.0425, Val Loss: 7.5296
Epoch 13, Train Loss: 0.9883, Val Loss: 7.5312
Epoch 14, Train Loss: 0.9743, Val Loss: 7.6489


In [None]:
def generate_text(model, seed_text, vocab, tokenizer, next_words=50, temperature=1.0):
    """Continue ``seed_text`` with ``next_words`` tokens sampled from ``model``.

    Each iteration runs the model on the sequence built so far, rescales the
    scores by ``1 / temperature``, draws one token id from the resulting
    softmax distribution and appends it to the model input.

    Returns:
        ``seed_text`` followed by the sampled tokens, joined by single spaces.
    """
    model.eval()
    seed_tokens = tokenizer(seed_text.lower())
    input_ids = torch.tensor([vocab[t] for t in seed_tokens], dtype=torch.long)
    input_ids = input_ids.unsqueeze(0).to(device)

    sampled_words = []
    with torch.no_grad():
        for _ in range(next_words):
            logits = model(input_ids).squeeze(0)  # drop the batch dimension
            probabilities = torch.nn.functional.softmax(logits / temperature, dim=-1)
            next_token_id = torch.multinomial(probabilities, num_samples=1).item()
            sampled_words.append(vocab.lookup_token(next_token_id))
            # Feed the sampled id back in as the newest position of the context.
            next_input = torch.tensor([[next_token_id]], dtype=torch.long).to(device)
            input_ids = torch.cat((input_ids, next_input), dim=1)

    return ' '.join([seed_text] + sampled_words)

seed_text = "GUVI is"
generated_text = generate_text(
    model, seed_text, vocab, tokenizer, next_words=50, temperature=0.6
)
print(generated_text)

GUVI is committed to empowering the next generation of tech professionals . whether through their comprehensive courses in the gap between education is breaking down barriers and inclusive education and inclusive education in multiple native languages , hands-on learning experiences , or innovative programs like zen class , ensuring that offers immersive


In [None]:
!pip install -U transformers huggingface_hub


Collecting transformers
  Downloading transformers-4.42.2-py3-none-any.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.2
    Uninstalling transformers-4.41.2:
      Successfully uninstalled transformers-4.41.2
Successfully installed transformers-4.42.2


In [None]:
!pip uninstall transformers -y
!pip install transformers


Found existing installation: transformers 4.42.2
Uninstalling transformers-4.42.2:
  Successfully uninstalled transformers-4.42.2
Collecting transformers
  Using cached transformers-4.42.2-py3-none-any.whl (9.3 MB)
Installing collected packages: transformers
Successfully installed transformers-4.42.2


In [None]:
!pip install huggingface_hub




In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never commit an access token to the notebook: read it from the HF_TOKEN
# environment variable, or prompt for it so it is not stored in the file.
# SECURITY: the token that used to be hard-coded here is exposed in the
# notebook history and should be revoked in the Hugging Face settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(hf_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal-LM weights for the "google/gemma-7b" checkpoint.
# NOTE(review): this rebinds the notebook-level `tokenizer` and `model` used by earlier cells.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-7b")

input_text = "Write me a poem about Machine Learning."
# Despite its name, `input_ids` holds the full tokenizer output, which is
# unpacked as keyword arguments to generate() below.
input_ids = tokenizer(input_text, return_tensors="pt")

# No generation options are passed, so the model's default generation settings apply.
outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Specify the model name or identifier
model_name = "gpt2"  # You can use other models like "gpt2-medium", "gpt2-large", "gpt2-xl", "gpt-neo-125M", etc.

# Load the pre-trained model and tokenizer
# NOTE(review): this rebinds the notebook-level `model` and `tokenizer` once more.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create a text generation pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate text
prompt = "GUVI"
# Sampling is enabled (do_sample=True), so the output differs from run to run.
generated_text = text_generator(prompt, max_length=100, do_sample=True, temperature=0.7)

# The pipeline result is indexed as a list of dicts; print the first candidate's text.
print(generated_text[0]['generated_text'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


GUVIAN O'DONNELL,

"I will be there," he cried. "I have just got a very good idea. One day I shall have this little boat in my hands."

"I will," said Mrs. Macaulay, "but I shall take my time. I will wait for you."

"How long it will take, I can't tell you. I've got to wait for it to grow on my son, and I can


In [None]:
pip install transformers datasets torch fastapi uvicorn

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn
  Downloading uvicorn-0.30.1-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/1

In [None]:
import os
import re
from transformers import GPT2Tokenizer

def print_raw_data(input_file):
    """Print every line of a text file with surrounding whitespace stripped.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark
    (present in the corpus file, as the first printed line showed) is dropped
    instead of being printed as part of the first line. Files without a BOM
    decode exactly as plain UTF-8.

    Args:
        input_file: Path of the text file to print.
    """
    with open(input_file, 'r', encoding='utf-8-sig') as f:
        for line in f:
            print(line.strip())

# NOTE(review): absolute path; presumably the file was uploaded to the Colab session — confirm before re-running.
input_file = "/content/Guvi - Cleaned (2).txt"  # Your company-specific data file
print_raw_data(input_file)



﻿About Guvi


Guvi Geek Networks Private Limited, commonly known as GUVI, is an Indian technology company that focuses on providing skill development and training in various domains related to information technology and computer science. Here are some key details about GUVI:


Founding and Background
GUVI was founded with the mission of bridging the skill gap in the IT industry by offering practical, industry-relevant training.
The company was established to cater to the growing demand for skilled professionals in emerging technologies like artificial intelligence, machine learning, data science, and programming.


Educational Platform
GUVI operates an online learning platform that offers courses, tutorials, and projects designed to enhance technical skills.
It focuses on providing hands-on learning experiences through coding challenges, real-world projects, and interactive sessions.


Courses and Content
GUVI offers a wide range of courses covering programming languages such as Python

In [None]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/309.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0


In [None]:
import accelerate
print(accelerate.__version__)


0.31.0


In [None]:
!pip install accelerate>=0.21.0


In [None]:
pip install transformers[torch]



In [None]:
!pip show accelerate

Name: accelerate
Version: 0.31.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Load pre-trained model and tokenizer
# NOTE(review): rebinds the notebook-level `tokenizer` and `model` to the base "gpt2" checkpoint.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Input file path (replace with your actual input file)
# Same file that the earlier cell printed with print_raw_data.
input_file = "/content/Guvi - Cleaned (2).txt"

# Function to load dataset
def load_dataset(file_path, tokenizer):
    """Wrap the text file at ``file_path`` in a ``TextDataset`` with a block size of 128.

    Args:
        file_path: Path of the raw text file.
        tokenizer: Tokenizer handed to ``TextDataset``.

    Returns:
        The constructed ``TextDataset``.
    """
    block_size = 128  # Adjust block size as needed
    dataset = TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)
    return dataset

# Create dataset using raw text file
# NOTE(review): this rebinds `train_dataset`, which an earlier cell used for the TensorDataset.
train_dataset = load_dataset(input_file, tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # checkpoints are written here
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,  # a checkpoint every 10,000 steps
    save_total_limit=2,
)

# Initialize data collator
# mlm=False turns off masked-language-model batching in the collator.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize Trainer
# No eval_dataset is passed, so only the training set is configured here.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
# Both go to the same directory, which later cells load from.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Step,Training Loss


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

In [None]:
#!pip install transformers

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the fine-tuned model and tokenizer
# Use the same relative directory the training cell saved to (and the serving
# cell loads from) rather than an absolute, Colab-specific /content path.
model_name_or_path = "./fine_tuned_model"
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the text generation function
def generate_text(model, tokenizer, seed_text, max_length=100, temperature=1.0, num_return_sequences=1):
    """Sample continuations of ``seed_text`` from a Hugging Face causal language model.

    Args:
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``.
        seed_text: Prompt to continue.
        max_length: Maximum total length (prompt plus continuation) in tokens.
        temperature: Sampling temperature.
        num_return_sequences: Number of independent samples to return.

    Returns:
        A list of ``num_return_sequences`` decoded strings with special
        tokens removed.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, and the inputs follow the model's own device instead of a global.
    encoded = tokenizer(seed_text, return_tensors='pt').to(model.device)

    # GPT-2 tokenizers ship without a pad token; fall back to EOS explicitly so
    # generate() does not have to guess (it warned about this before).
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate text
    with torch.no_grad():
        output = model.generate(
            **encoded,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=pad_token_id,
        )

    # Decode the generated text
    return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output]

# Test the model
seed_text = "Guvi"
# Three samples, each limited to max_length=20.
generated_texts = generate_text(model, tokenizer, seed_text, max_length=20, temperature=0.7, num_return_sequences=3)

# Number the samples from 1 when printing.
for i, text in enumerate(generated_texts):
    print(f"Generated Text {i + 1}:\n{text}\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text 1:
Guvi-based IT companies that offer education in the IT industry. Their platform includes over 1,

Generated Text 2:
Guvi), provides a comprehensive guide to the various types of learning, covering topics such as coding,

Generated Text 3:
Guvi) and the web-based platform, which allows users to create their own profiles and share



In [None]:
!pip install transformers fastapi uvicorn nest-asyncio pyngrok

Collecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.6


In [None]:
from fastapi import FastAPI, Request
from pydantic import BaseModel
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import nest_asyncio
from pyngrok import ngrok
import uvicorn

import os
from getpass import getpass

app = FastAPI()

# Load the fine-tuned model and tokenizer
model_name_or_path = "./fine_tuned_model"
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


class Query(BaseModel):
    """Request body of ``POST /generate/``."""

    # Prompt the model should continue.
    text: str


def _generate(prompt):
    """Sample one continuation (at most 100 tokens in total) of ``prompt``."""
    # Calling the tokenizer (rather than .encode) also yields the attention mask.
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(
        **encoded,
        max_length=100,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        # GPT-2 has no pad token; state the EOS fallback explicitly.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)


# The handlers are plain `def` on purpose: FastAPI runs them in a worker
# thread, so the blocking model.generate call does not stall the event loop.
# They also have distinct names, so neither shadows the other or the notebook's
# earlier generate_text helper.
@app.post("/generate/")
def generate_text_post(query: Query):
    """Generate text for the prompt carried in the JSON body."""
    return {"generated_text": _generate(query.text)}


# The "/generatee/" path is kept as-is because existing clients call it.
@app.get("/generatee/")
def generate_text_get(text: str):
    """Generate text for the prompt given in the ``text`` query parameter."""
    return {"generated_text": _generate(text)}


# Authenticate ngrok
# Never commit the auth token to the notebook: read it from NGROK_AUTHTOKEN or
# prompt for it. SECURITY: the token that used to be hard-coded here is exposed
# in the notebook history and should be revoked in the ngrok dashboard.
from pyngrok import conf, ngrok
conf.get_default().auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")

# Start ngrok
ngrok_tunnel = ngrok.connect(8000)
print("Public URL:", ngrok_tunnel.public_url)

# Allow nested asyncio
nest_asyncio.apply()

# Run the FastAPI app
uvicorn.run(app, host="0.0.0.0", port=8000)




INFO:     Started server process [6411]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


Public URL: https://0cfb-34-73-94-253.ngrok-free.app
INFO:     117.201.33.182:0 - "GET / HTTP/1.1" 404 Not Found
INFO:     117.201.33.182:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


INFO:     117.201.33.182:0 - "GET /generatee/?text=Guvi%20is HTTP/1.1" 200 OK


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


INFO:     117.201.33.182:0 - "GET /generatee/?text=Guvi%20is HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [6411]


KeyboardInterrupt: 

In [None]:
import requests

# Define the public URL and the endpoint
# NOTE: the runs recorded in this notebook show a different ngrok URL for each
# tunnel, so paste the URL printed by the server cell. The server must still be
# running when this request is sent.
public_url = "https://0cfb-34-73-94-253.ngrok-free.app/"
endpoint = "/generate/"

# Combine the URL and endpoint with exactly one slash between them
# (plain concatenation produced ".../app//generate/").
url = public_url.rstrip("/") + "/" + endpoint.lstrip("/")

# Define the data to be sent in the POST request
data = {
    "text": "Guvi is known for"
}

try:
    # Send the POST request; the timeout keeps the cell from hanging forever
    # if the tunnel or the server is unreachable.
    response = requests.post(url, json=data, timeout=120)

    # Check if the request was successful
    if response.status_code == 200:
        # Attempt to parse JSON response
        try:
            json_response = response.json()
            print("Response:", json_response)
        except ValueError as json_err:
            # The JSON decode errors raised by response.json() are ValueError
            # subclasses, so no extra import is needed to catch them.
            print(f"Failed to decode JSON response: {json_err}")
            print("Response text:", response.text)
    else:
        print(f"Request failed with status code {response.status_code}: {response.text}")

except requests.RequestException as req_err:
    print(f"Request error: {req_err}")


Request failed with status code 404: <!DOCTYPE html>
<html class="h-full" lang="en-US" dir="ltr">
  <head>
    <link rel="preload" href="https://cdn.ngrok.com/static/fonts/euclid-square/EuclidSquare-Regular-WebS.woff" as="font" type="font/woff" crossorigin="anonymous" />
    <link rel="preload" href="https://cdn.ngrok.com/static/fonts/euclid-square/EuclidSquare-RegularItalic-WebS.woff" as="font" type="font/woff" crossorigin="anonymous" />
    <link rel="preload" href="https://cdn.ngrok.com/static/fonts/euclid-square/EuclidSquare-Medium-WebS.woff" as="font" type="font/woff" crossorigin="anonymous" />
    <link rel="preload" href="https://cdn.ngrok.com/static/fonts/euclid-square/EuclidSquare-Semibold-WebS.woff" as="font" type="font/woff" crossorigin="anonymous" />
    <link rel="preload" href="https://cdn.ngrok.com/static/fonts/euclid-square/EuclidSquare-MediumItalic-WebS.woff" as="font" type="font/woff" crossorigin="anonymous" />
    <link rel="preload" href="https://cdn.ngrok.com/stati

In [None]:
!pip install streamlit transformers torch

Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.

In [None]:
%%writefile app.py
import streamlit as st
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Checkpoint to serve. The base "gpt2" weights are used here; set this to
# "./fine_tuned_model" to serve the fine-tuned checkpoint instead.
model_name_or_path = "gpt2"

# Use the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@st.cache_resource
def load_model_and_tokenizer(name_or_path):
    """Load the model (moved to `device`) and its tokenizer, cached by Streamlit.

    Streamlit re-executes this script on every widget interaction; caching the
    loaded objects avoids reloading the checkpoint on each rerun.
    """
    loaded_model = GPT2LMHeadModel.from_pretrained(name_or_path).to(device)
    loaded_tokenizer = GPT2Tokenizer.from_pretrained(name_or_path)
    return loaded_model, loaded_tokenizer


model, tokenizer = load_model_and_tokenizer(model_name_or_path)

# Define the text generation function
def generate_text(model, tokenizer, seed_text, max_length=100, temperature=1.0, num_return_sequences=1):
    """Sample continuations of `seed_text` and return them as decoded strings.

    Args:
        model: Language model exposing `generate()` and a `device` attribute.
        tokenizer: Tokenizer that is called to encode the prompt and whose
            `decode()` turns generated ids back into text.
        seed_text: Prompt to continue.
        max_length: Forwarded to `generate()` as `max_length`.
        temperature: Sampling temperature forwarded to `generate()`.
        num_return_sequences: Number of samples requested from `generate()`.

    Returns:
        A list with one decoded string per sequence returned by `generate()`.
    """
    # Take the device from the model itself so this function does not depend
    # on a module-level `device` variable.
    target_device = model.device

    # Calling the tokenizer (instead of `encode`) also yields the attention mask.
    encoded = tokenizer(seed_text, return_tensors='pt')
    input_ids = encoded["input_ids"].to(target_device)
    attention_mask = encoded["attention_mask"].to(target_device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            # Set the pad id explicitly rather than relying on generate()'s fallback.
            pad_token_id=tokenizer.eos_token_id,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=50,
            top_p=0.95,
        )
    # Decode every returned row instead of indexing by the requested count.
    return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output]

# Page header and short description.
st.title("Text Generation with GPT-2")
st.write("This app generates text using a fine-tuned GPT-2 model. Enter a prompt and the model will generate a continuation.")

# Input widgets: prompt, length limit and sampling temperature.
seed_text = st.text_input("Enter your prompt:", value="Google is known for")
max_length = st.slider("Max Length:", 50, 500, 100)
temperature = st.slider("Temperature:", 0.1, 2.0, 1.0)

# Generate only when the button is pressed, showing a spinner meanwhile.
generate_clicked = st.button("Generate")
if generate_clicked:
    with st.spinner("Generating text..."):
        samples = generate_text(model, tokenizer, seed_text, max_length=max_length, temperature=temperature)
        for position, sample in enumerate(samples, start=1):
            st.subheader(f"Generated Text {position}")
            st.write(sample)

Writing app.py


In [None]:
!pip install pyngrok



In [None]:
from pyngrok import conf, ngrok
import subprocess
import time

# Authenticate ngrok
import os
from getpass import getpass

from pyngrok.exception import PyngrokError

# Credentials must not live in the notebook: read the token from the
# NGROK_AUTHTOKEN environment variable, or prompt for it when it is not set.
# NOTE(review): the token that was previously hard-coded on this line has been
# published with the notebook and should be revoked.
conf.get_default().auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")

# Port shared by Streamlit and the tunnel so the two cannot drift apart.
STREAMLIT_PORT = "8501"

# Run the Streamlit app in the background
process = subprocess.Popen(['streamlit', 'run', 'app.py', '--server.port', STREAMLIT_PORT])

public_url = None
try:
    # Give the Streamlit app a few seconds to start
    time.sleep(5)
    if process.poll() is not None:
        # Fail loudly instead of tunnelling to a server that never came up.
        raise RuntimeError(f"Streamlit exited early with code {process.returncode}")

    # Expose the Streamlit app to the web using ngrok
    public_url = ngrok.connect(addr=STREAMLIT_PORT)
    print(f"Public URL: {public_url}")

    # Keep the Colab cell running for as long as Streamlit is alive
    while process.poll() is None:
        time.sleep(1)
except KeyboardInterrupt:
    # Interrupting the cell is the normal way to stop; cleanup happens below.
    pass
finally:
    # Clean up on every exit path, not only on KeyboardInterrupt.
    print("Stopping Streamlit app...")
    process.terminate()
    try:
        process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        process.kill()

    if public_url is not None:
        try:
            # disconnect() expects the tunnel's URL string, not the NgrokTunnel object.
            ngrok.disconnect(public_url.public_url)
        except PyngrokError as ngrok_err:
            # The recorded run raised PyngrokNgrokURLError during cleanup;
            # do not let that abort the rest of the teardown.
            print(f"ngrok disconnect failed: {ngrok_err}")
    ngrok.kill()

Public URL: NgrokTunnel: "https://701c-34-73-94-253.ngrok-free.app" -> "http://localhost:8501"
Stopping Streamlit app...


PyngrokNgrokURLError: ngrok client exception, URLError: [Errno 111] Connection refused