#RNN

##Preparing the Dataset

In [None]:
!pip install datasets

import torch
from datasets import load_dataset
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator

# Load the IMDb dataset
dataset = load_dataset('imdb')

# Tokenize the text
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    """Lazily produce one token list per record of ``data_iter``.

    Every record is indexed with the ``'text'`` key and the resulting string
    is passed through the module-level ``tokenizer``.
    """
    yield from (tokenizer(record['text']) for record in data_iter)

# Build the vocabulary
vocab = build_vocab_from_iterator(yield_tokens(dataset['train']), specials=['<unk>', '<pad>'])
vocab.set_default_index(vocab['<unk>'])

def data_process(data_iter):
    """Numericalize every record and join the results into one 1-D tensor.

    Each record's ``'text'`` is tokenized with the module-level ``tokenizer``
    and mapped to ids through the module-level ``vocab``. Records that yield
    no tokens are left out of the concatenation.
    """
    encoded = []
    for record in data_iter:
        ids = [vocab[word] for word in tokenizer(record['text'])]
        id_tensor = torch.tensor(ids, dtype=torch.long)
        if id_tensor.numel() > 0:
            encoded.append(id_tensor)
    return torch.cat(tuple(encoded))

train_data = data_process(dataset['train'])
test_data = data_process(dataset['test'])

# Convert data to tensors and create batches
seq_length = 30
def create_batches(data, seq_length):
    """Cut a 1-D token stream into consecutive fixed-length training windows.

    Args:
        data: 1-D tensor of token ids.
        seq_length: Number of tokens per window. Trailing tokens that do not
            fill a whole window are dropped.

    Returns:
        ``(inputs, targets)``, both shaped ``(num_windows, seq_length - 1)``.
        ``targets[i, j]`` is the token that directly follows ``inputs[i, j]``
        in ``data``.
    """
    num_batches = data.size(0) // seq_length
    data = data[:num_batches * seq_length]
    # One row per window of *consecutive* tokens. The previous
    # view(seq_length, -1).t() layout produced rows of tokens that were
    # num_batches positions apart, so "next token" targets were meaningless.
    data = data.view(num_batches, seq_length)
    inputs = data[:, :-1]
    # Keep targets row-aligned with inputs: TensorDataset requires every
    # tensor to share the same first dimension, which a flattened target
    # vector does not.
    targets = data[:, 1:]
    return inputs, targets

train_inputs, train_targets = create_batches(train_data, seq_length)
test_inputs, test_targets = create_batches(test_data, seq_length)

# DataLoader
batch_size = 20
train_dataset = torch.utils.data.TensorDataset(train_inputs, train_targets)
test_dataset = torch.utils.data.TensorDataset(test_inputs, test_targets)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


[31mERROR: Operation cancelled by user[0m[31m
[0m

KeyboardInterrupt: 

In [None]:
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator

# Sample text data
text = """ Guvi Geek Networks Private Limited, commonly known as GUVI, is an Indian technology company focused on providing skill development and training in various domains related to information technology and computer science. GUVI was founded with the mission of bridging the skill gap in the IT industry by offering practical, industry-relevant training. It caters to the growing demand for skilled professionals in emerging technologies like artificial intelligence, machine learning, data science, and programming.

GUVI operates an online learning platform offering courses, tutorials, and projects designed to enhance technical skills. It focuses on providing hands-on learning experiences through coding challenges, real-world projects, and interactive sessions. The platform offers a wide range of courses covering programming languages such as Python, Java, C++, web development, mobile app development, data structures, algorithms, and more. Specialized courses in advanced topics like artificial intelligence, machine learning, blockchain, and cybersecurity are also available.

GUVI provides certificates of completion for its courses, valuable for showcasing skills to potential employers. The platform collaborates with industry experts and companies to develop and deliver content that meets industry standards and demands. It fosters a community of learners through forums, discussion boards, and collaborative projects, offering mentorship and support to help learners succeed in their career goals.

Emphasizing innovation in education technology, GUVI continuously updates its course offerings to align with industry trends and technological advancements. The platform aims to make quality technical education accessible to a global audience, promoting skill development and career advancement.

The founder of GUVI is Aravindhan Vivekanandan, who established the company with the vision of democratizing technical education and making programming skills accessible to everyone, particularly focusing on students and professionals aspiring to build careers in the technology sector. GUVI is located in Chennai, India, where they manage their online learning platform and conduct various educational programs aimed at skill development in IT and related fields.

GUVI offers a variety of courses across different domains of technology and programming, including programming languages, web development, mobile app development, data structures, algorithms, artificial intelligence, machine learning, blockchain, and cybersecurity. For the most current and detailed information about their course offerings, it is best to visit GUVI's official website or directly explore their course catalog.

GUVI focuses on providing technical education and skill development through online courses and training programs. While they may offer career support and guidance, including assistance with resume building and interview preparation, specific details about guaranteed job placements or partnerships with companies for placements are not prominently featured in their offerings. For accurate information on their placement services and support, it is recommended to contact GUVI directly or explore their website.

GUVI was founded in 2014 with the aim of providing accessible and practical technical education through online courses and training programs, focusing on programming and IT skills development. Specific information about "friend companies" or official partner companies of GUVI is not publicly detailed. For accurate information about GUVI's partnerships or affiliations with other companies, it is recommended to visit GUVI's official website or reach out to them directly through their official communication channels."""

# Tokenize the text
tokenizer = get_tokenizer('basic_english')
tokens = tokenizer(text.lower())

# Build the vocabulary
counter = Counter(tokens)
vocab = build_vocab_from_iterator([tokens], specials=['<unk>', '<pad>'])
vocab.set_default_index(vocab['<unk>'])

# Numericalize the text
data = [vocab[token] for token in tokens]

# Convert data to tensors and create batches
seq_length = 30
def create_batches(data, seq_length):
    """Build stride-1 sliding windows for next-token prediction.

    Args:
        data: Sequence of token ids.
        seq_length: Window length. The first ``seq_length - 1`` tokens of a
            window form the input and the last token is the target.

    Returns:
        ``(inputs, targets)`` as ``torch.long`` tensors shaped
        ``(num_windows, seq_length - 1)`` and ``(num_windows,)``. When
        ``data`` is shorter than one window both tensors are empty but keep
        those shapes.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be at least 1")
    # +1 so the final full window (starting at len(data) - seq_length) is
    # included; the previous range() stopped one window early.
    window_count = len(data) - seq_length + 1
    if window_count <= 0:
        return (
            torch.empty((0, seq_length - 1), dtype=torch.long),
            torch.empty((0,), dtype=torch.long),
        )
    sequences = [data[start:start + seq_length] for start in range(window_count)]
    inputs = torch.tensor([seq[:-1] for seq in sequences], dtype=torch.long)
    targets = torch.tensor([seq[-1] for seq in sequences], dtype=torch.long)
    return inputs, targets

inputs, targets = create_batches(data, seq_length)
# Positional 80/20 split: the first 80% of the windows are used for training,
# the remaining 20% for validation.
# NOTE(review): create_batches slides its window one token at a time, so
# windows on either side of the split boundary share tokens — confirm this
# overlap is acceptable for the validation numbers.
train_data, val_data = inputs[:int(0.8*len(inputs))], inputs[int(0.8*len(inputs)):]
train_targets, val_targets = targets[:int(0.8*len(targets))], targets[int(0.8*len(targets)):]

# DataLoader
train_dataset = torch.utils.data.TensorDataset(train_data, train_targets)
val_dataset = torch.utils.data.TensorDataset(val_data, val_targets)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=20, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=20)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


##Implementing RNN Model


In [None]:
import torch.nn as nn
import torch.optim as optim

class RNNModel(nn.Module):
    """Embedding -> single-layer ``nn.RNN`` -> linear head.

    The head is applied to the recurrent output of the last time step only,
    so one ``output_dim``-sized vector is produced per input sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: tensors are laid out as (batch, seq, feature).
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``(batch, seq)`` token ids to ``(batch, output_dim)`` scores."""
        step_outputs, _ = self.rnn(self.embedding(x))
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)

# Initialize the model, criterion, and optimizer
vocab_size = len(vocab)  # size of the vocabulary built in the data cell
embedding_dim = 100
hidden_dim = 100
# The network scores every vocabulary entry as the candidate next token,
# so the output width equals the vocabulary size.
output_dim = vocab_size

model = RNNModel(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
# No learning rate is passed, so Adam runs with its default hyperparameters.
optimizer = optim.Adam(model.parameters())


##Training the RNN Model

In [None]:
def train_epoch(loader, model, criterion, optimizer):
    """Run one optimisation pass over ``loader``.

    Batches are moved to the module-level ``device`` before the forward pass.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for batch_inputs, batch_targets in loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(loader)

def evaluate(loader, model, criterion):
    """Compute the mean per-batch loss over ``loader`` without gradients.

    The model is switched to eval mode and batches are moved to the
    module-level ``device`` before the forward pass.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch_inputs, batch_targets in loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)
            predictions = model(batch_inputs)
            running_loss += criterion(predictions, batch_targets).item()
    return running_loss / len(loader)

# range(1, 15) yields epochs 1..14, i.e. 14 passes over the training data.
for epoch in range(1, 15):
    # One optimisation pass, then a gradient-free pass over the validation split.
    train_loss = train_epoch(train_loader, model, criterion, optimizer)
    val_loss = evaluate(val_loader, model, criterion)
    print(f'Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')


Epoch 1, Train Loss: 5.4332, Val Loss: 5.3521
Epoch 2, Train Loss: 4.9268, Val Loss: 5.2462
Epoch 3, Train Loss: 4.4511, Val Loss: 5.1952
Epoch 4, Train Loss: 4.0437, Val Loss: 5.1450
Epoch 5, Train Loss: 3.6796, Val Loss: 5.0714
Epoch 6, Train Loss: 3.3349, Val Loss: 5.0088
Epoch 7, Train Loss: 3.0062, Val Loss: 4.9471
Epoch 8, Train Loss: 2.6945, Val Loss: 4.9258
Epoch 9, Train Loss: 2.4080, Val Loss: 4.8628
Epoch 10, Train Loss: 2.1448, Val Loss: 4.8442
Epoch 11, Train Loss: 1.8952, Val Loss: 4.8259
Epoch 12, Train Loss: 1.6775, Val Loss: 4.8325
Epoch 13, Train Loss: 1.4809, Val Loss: 4.8240
Epoch 14, Train Loss: 1.3045, Val Loss: 4.8249


##TESTING or generation

In [None]:
def generate_text(model, seed_text, vocab, tokenizer, next_words=50, temperature=1.0):
    """Sample a continuation of ``seed_text`` from a next-token model.

    Args:
        model: Module called with a ``(1, seq_len)`` tensor of token ids and
            expected to return ``(1, vocab_size)`` scores for the next token.
        seed_text: Prompt; lower-cased before tokenization.
        vocab: Supports ``vocab[token] -> id`` and ``vocab.lookup_token(id)``.
        tokenizer: Callable turning a string into a list of tokens.
        next_words: Number of tokens to sample.
        temperature: Positive divisor applied to the scores before softmax.

    Returns:
        The seed (trailing whitespace removed) followed by the sampled tokens,
        separated by single spaces. With ``next_words=0`` the seed is returned
        unchanged.

    Raises:
        ValueError: If ``temperature`` is not positive or the seed yields no
            tokens.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    tokens = tokenizer(seed_text.lower())
    if not tokens:
        raise ValueError("seed_text must contain at least one token")

    model.eval()
    # Run on the device the model actually lives on instead of relying on a
    # module-level `device`; parameter-free models fall back to the CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = torch.tensor(
        [[vocab[token] for token in tokens]], dtype=torch.long, device=model_device
    )

    sampled_words = []
    with torch.no_grad():
        for _ in range(next_words):
            # Drop the batch dimension, then sharpen/flatten the distribution.
            scores = model(input_ids).squeeze(0) / temperature
            probabilities = torch.nn.functional.softmax(scores, dim=-1)
            next_token_id = torch.multinomial(probabilities, num_samples=1).item()
            sampled_words.append(vocab.lookup_token(next_token_id))
            # Feed the whole history (seed + samples so far) back in next step.
            next_input = torch.tensor([[next_token_id]], dtype=torch.long, device=model_device)
            input_ids = torch.cat((input_ids, next_input), dim=1)

    if not sampled_words:
        return seed_text
    # rstrip() avoids the double space a seed such as "Guvi is " used to produce.
    return ' '.join([seed_text.rstrip(), *sampled_words])

seed_text = "Guvi is known for "
generated_text = generate_text(model, seed_text, vocab, tokenizer, next_words=50, temperature=0.6)
print(generated_text)


Guvi is known for  through official industry and programming . guvi is located in chennai , tutorials , and career advancement . it focuses on providing hands-on is updates its course offerings to align with industry trends and demands . science through collaborates , data structures , development , data structures , algorithms ,


##LSTM On same data

In [None]:
class LSTMModel(nn.Module):
    """Embedding -> single-layer ``nn.LSTM`` -> linear head.

    Only the LSTM output of the last time step is passed to the head, giving
    one ``output_dim``-sized vector per input sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: tensors are laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``(batch, seq)`` token ids to ``(batch, output_dim)`` scores."""
        # The final hidden/cell states are not needed, only the step outputs.
        step_outputs, _state = self.lstm(self.embedding(x))
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)

# Initialize the LSTM model
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Train the LSTM model
for epoch in range(1, 15):
    train_loss = train_epoch(train_loader, model, criterion, optimizer)
    val_loss = evaluate(val_loader, model, criterion)
    print(f'Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')


Epoch 1, Train Loss: 5.4205, Val Loss: 5.4126
Epoch 2, Train Loss: 5.1563, Val Loss: 5.3404
Epoch 3, Train Loss: 4.6923, Val Loss: 5.3062
Epoch 4, Train Loss: 4.3794, Val Loss: 5.4116
Epoch 5, Train Loss: 4.0412, Val Loss: 5.3458
Epoch 6, Train Loss: 3.7327, Val Loss: 5.3037
Epoch 7, Train Loss: 3.4310, Val Loss: 5.2628
Epoch 8, Train Loss: 3.1316, Val Loss: 5.2803
Epoch 9, Train Loss: 2.8495, Val Loss: 5.2775
Epoch 10, Train Loss: 2.5845, Val Loss: 5.2842
Epoch 11, Train Loss: 2.3417, Val Loss: 5.2571
Epoch 12, Train Loss: 2.1093, Val Loss: 5.2628
Epoch 13, Train Loss: 1.9026, Val Loss: 5.2296
Epoch 14, Train Loss: 1.7056, Val Loss: 5.3023


In [None]:
def generate_text(model, seed_text, vocab, tokenizer, next_words=50, temperature=1.0):
    """Sample a continuation of ``seed_text`` from a next-token model.

    Args:
        model: Module called with a ``(1, seq_len)`` tensor of token ids and
            expected to return ``(1, vocab_size)`` scores for the next token.
        seed_text: Prompt; lower-cased before tokenization.
        vocab: Supports ``vocab[token] -> id`` and ``vocab.lookup_token(id)``.
        tokenizer: Callable turning a string into a list of tokens.
        next_words: Number of tokens to sample.
        temperature: Positive divisor applied to the scores before softmax.

    Returns:
        The seed (trailing whitespace removed) followed by the sampled tokens,
        separated by single spaces. With ``next_words=0`` the seed is returned
        unchanged.

    Raises:
        ValueError: If ``temperature`` is not positive or the seed yields no
            tokens.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    tokens = tokenizer(seed_text.lower())
    if not tokens:
        raise ValueError("seed_text must contain at least one token")

    model.eval()
    # Run on the device the model actually lives on instead of relying on a
    # module-level `device`; parameter-free models fall back to the CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = torch.tensor(
        [[vocab[token] for token in tokens]], dtype=torch.long, device=model_device
    )

    sampled_words = []
    with torch.no_grad():
        for _ in range(next_words):
            # Drop the batch dimension, then sharpen/flatten the distribution.
            scores = model(input_ids).squeeze(0) / temperature
            probabilities = torch.nn.functional.softmax(scores, dim=-1)
            next_token_id = torch.multinomial(probabilities, num_samples=1).item()
            sampled_words.append(vocab.lookup_token(next_token_id))
            # Feed the whole history (seed + samples so far) back in next step.
            next_input = torch.tensor([[next_token_id]], dtype=torch.long, device=model_device)
            input_ids = torch.cat((input_ids, next_input), dim=1)

    if not sampled_words:
        return seed_text
    # rstrip() avoids the double space a seed such as "Guvi is " used to produce.
    return ' '.join([seed_text.rstrip(), *sampled_words])

seed_text = "Guvi is "
generated_text = generate_text(model, seed_text, vocab, tokenizer, next_words=50, temperature=1.0)
print(generated_text)


Guvi is  platform completion conduct to develop continuously focused help web in skills to india , collaborative information development global different . emphasizing in professionals to companies trends and technical platform skill offers a science machine blockchain , and for , aimed , specific platform with in it align development on artificial


##TRANSFORMER

In [None]:
import torch.nn as nn
import torch.optim as optim
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a sequence-first tensor.

    ``forward`` expects ``(seq_len, batch, d_model)`` input, adds the
    encoding for positions ``0 .. seq_len - 1`` and applies dropout.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine
        # columns; trim the frequencies so the assignment shapes match.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch axis.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])`` for sequence-first ``x``."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Embedding -> positional encoding -> Transformer encoder -> linear head.

    Takes ``(batch, seq)`` token ids and returns ``(batch, output_dim)``
    scores computed from the encoder output at the last sequence position.
    """

    def __init__(self, vocab_size, embedding_dim, nhead, num_encoder_layers, hidden_dim, output_dim):
        super().__init__()
        # Kept on the instance so forward() does not depend on a same-named
        # module-level variable.
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = PositionalEncoding(embedding_dim)
        # hidden_dim is passed as the layer's feed-forward width.
        encoder_layers = nn.TransformerEncoderLayer(embedding_dim, nhead, hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        """Map ``(batch, seq)`` token ids to ``(batch, output_dim)`` scores."""
        embedded = self.embedding(x) * math.sqrt(self.embedding_dim)
        # The positional encoding and the encoder (batch_first defaults to
        # False) are both sequence-first. Feeding them (batch, seq, emb) made
        # attention run across the samples of a batch and indexed positions
        # by batch row, so switch to (seq, batch, emb) first.
        embedded = embedded.transpose(0, 1)
        embedded = self.pos_encoder(embedded)
        output = self.transformer_encoder(embedded)
        # output[-1] is the last sequence position for every batch element.
        return self.fc(output[-1])

# Initialize the model, criterion, and optimizer
vocab_size = len(vocab)
embedding_dim = 200
nhead = 2
num_encoder_layers = 2
hidden_dim = 200
output_dim = vocab_size

model = TransformerModel(vocab_size, embedding_dim, nhead, num_encoder_layers, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())




In [None]:
def train_epoch(loader, model, criterion, optimizer):
    """Run one optimisation pass over ``loader``.

    Batches are moved to the module-level ``device`` before the forward pass.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for batch_inputs, batch_targets in loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(loader)

def evaluate(loader, model, criterion):
    """Compute the mean per-batch loss over ``loader`` without gradients.

    The model is switched to eval mode and batches are moved to the
    module-level ``device`` before the forward pass.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch_inputs, batch_targets in loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)
            predictions = model(batch_inputs)
            running_loss += criterion(predictions, batch_targets).item()
    return running_loss / len(loader)

for epoch in range(1, 15):
    train_loss = train_epoch(train_loader, model, criterion, optimizer)
    val_loss = evaluate(val_loader, model, criterion)
    print(f'Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')


Epoch 1, Train Loss: 1.5047, Val Loss: 5.9452
Epoch 2, Train Loss: 1.4744, Val Loss: 5.9208
Epoch 3, Train Loss: 1.4751, Val Loss: 6.1004
Epoch 4, Train Loss: 1.4074, Val Loss: 6.1710
Epoch 5, Train Loss: 1.4121, Val Loss: 6.2699
Epoch 6, Train Loss: 1.4149, Val Loss: 6.3509
Epoch 7, Train Loss: 1.4101, Val Loss: 6.3124
Epoch 8, Train Loss: 1.4160, Val Loss: 6.4624
Epoch 9, Train Loss: 1.3794, Val Loss: 6.5233
Epoch 10, Train Loss: 1.3286, Val Loss: 6.6021
Epoch 11, Train Loss: 1.3623, Val Loss: 6.5965
Epoch 12, Train Loss: 1.3789, Val Loss: 6.6506
Epoch 13, Train Loss: 1.3339, Val Loss: 6.6630
Epoch 14, Train Loss: 1.3396, Val Loss: 6.6739


In [None]:
def generate_text(model, seed_text, vocab, tokenizer, next_words=50, temperature=1.0):
    """Sample a continuation of ``seed_text`` from a next-token model.

    Args:
        model: Module called with a ``(1, seq_len)`` tensor of token ids and
            expected to return ``(1, vocab_size)`` scores for the next token.
        seed_text: Prompt; lower-cased before tokenization.
        vocab: Supports ``vocab[token] -> id`` and ``vocab.lookup_token(id)``.
        tokenizer: Callable turning a string into a list of tokens.
        next_words: Number of tokens to sample.
        temperature: Positive divisor applied to the scores before softmax.

    Returns:
        The seed (trailing whitespace removed) followed by the sampled tokens,
        separated by single spaces. With ``next_words=0`` the seed is returned
        unchanged.

    Raises:
        ValueError: If ``temperature`` is not positive or the seed yields no
            tokens.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    tokens = tokenizer(seed_text.lower())
    if not tokens:
        raise ValueError("seed_text must contain at least one token")

    model.eval()
    # Run on the device the model actually lives on instead of relying on a
    # module-level `device`; parameter-free models fall back to the CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = torch.tensor(
        [[vocab[token] for token in tokens]], dtype=torch.long, device=model_device
    )

    sampled_words = []
    with torch.no_grad():
        for _ in range(next_words):
            # Drop the batch dimension, then sharpen/flatten the distribution.
            scores = model(input_ids).squeeze(0) / temperature
            probabilities = torch.nn.functional.softmax(scores, dim=-1)
            next_token_id = torch.multinomial(probabilities, num_samples=1).item()
            sampled_words.append(vocab.lookup_token(next_token_id))
            # Feed the whole history (seed + samples so far) back in next step.
            next_input = torch.tensor([[next_token_id]], dtype=torch.long, device=model_device)
            input_ids = torch.cat((input_ids, next_input), dim=1)

    if not sampled_words:
        return seed_text
    # rstrip() avoids the double space a seed such as "Guvi is " used to produce.
    return ' '.join([seed_text.rstrip(), *sampled_words])

seed_text = "Guvi is "
generated_text = generate_text(model, seed_text, vocab, tokenizer, next_words=50, temperature=1.0)
print(generated_text)


Guvi is  aravindhan vivekanandan , web development , blockchain , algorithms , artificial intelligence , promoting skill development , and programming languages such as python , algorithms , including programming . guvi was founded with the platform offers a community of bridging the platform offers a wide range of courses , valuable


#GEMMA

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-7b")

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt")

outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))


#GPT2

In [None]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Specify the model name or identifier
model_name = "gpt2"  # You can use other models like "gpt2-medium", "gpt2-large", "gpt2-xl", "gpt-neo-125M", etc.

# Load the pre-trained model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create a text generation pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate text
prompt = "Guvi is known for"
generated_text = text_generator(prompt, max_length=100, do_sample=True, temperature=0.7)

print(generated_text[0]['generated_text'])

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Guvi is known for his signature, which reads "I am a good man, but a bad man."

On this evening, the three women were taken to an undisclosed location.

The girls were taken to the University of Illinois at Urbana-Champaign Medical Center. The suspects were arrested.


In [None]:
# Install the transformers library if not already installed
!pip install transformers

# Import the necessary libraries
import os

from huggingface_hub import login

# Read the access token from the environment so it is never stored in the
# notebook. The token that used to be hard-coded here has been exposed and
# should be revoked. When HF_TOKEN is unset, None is passed and login()
# falls back to asking for the token interactively.
login(token=os.environ.get("HF_TOKEN"))


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


#LLM finetuning

In [None]:
pip install transformers datasets torch fastapi uvicorn

Collecting fastapi
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn
  Downloading uvicorn-0.30.1-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12

## data preprocessing

In [None]:
import os
import re
from transformers import GPT2Tokenizer

def print_raw_data(input_file):
    """Echo every line of a UTF-8 text file with surrounding whitespace removed."""
    with open(input_file, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        for text_line in stripped_lines:
            print(text_line)

input_file = "/content/Guvi_Data.txt"  # Your company-specific data file
print_raw_data(input_file)

GUVI Alumni Offer Free Skill Training to 5,000 Students from Low Income Backgrounds Alumni of IIT Madras and IIM Ahmedabad had come together to offer free skill training to 5000 students from low-income backgrounds. The startup was formed by the alumni GUVI launched a new initiative called Fly High. Under the initiative, students will have access to vernacular tech courses, mentorship services from industry experts and training in skills to make them job-ready, completely free of cost. Students with a lower socio-economic background or family income of less than Rs 3 lakhs per annum can apply for the scholarship. Students should have graduated in 2020, 2021, 2022 with a CGPA of 7 The start-up is also launching Sridevi,      Innovation Award, which carries a cash award of Rs 1 Lakh, to recognize budding women entrepreneurs with remarkable innovations in the Ed-tech industry. The award winner will be selected by an eminent panel comprising of stalwarts in the field of the EdTech industry

In [None]:
pip install accelerate -U



##Fine-tuning the pretrained model

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Create dataset
def load_dataset(file_path, tokenizer, block_size=128):
    """Wrap the text file at ``file_path`` in a ``TextDataset``.

    Args:
        file_path: Path of the raw text file.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` as ``block_size``.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
# `output_file` is never defined anywhere in this notebook (it raised
# NameError); train on the raw-text path set in the preprocessing cell.
train_dataset = load_dataset(input_file, tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

NameError: name 'output_file' is not defined

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Input file path (replace with your actual input file)
input_file = "/content/Guvi_Data.txt"

# Function to load dataset
def load_dataset(file_path, tokenizer):
    """Build a ``TextDataset`` over ``file_path`` with a block size of 128."""
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": 128,  # Adjust block size as needed
    }
    return TextDataset(**dataset_kwargs)

# Create dataset using raw text file
train_dataset = load_dataset(input_file, tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")



Step,Training Loss


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

In [None]:
!pip show accelerate

Name: accelerate
Version: 0.31.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


##TESTING THE MODEL

In [None]:
#!pip install transformers

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the fine-tuned model and tokenizer
model_name_or_path = "./fine_tuned_model"
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the text generation function
def generate_text(model, tokenizer, seed_text, max_length=100, temperature=1.0, num_return_sequences=1):
    """Sample one or more continuations of ``seed_text`` from a causal LM.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Matching tokenizer.
        seed_text: Prompt to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        temperature: Sampling temperature.
        num_return_sequences: Number of samples to draw.

    Returns:
        A list with one decoded string per generated sequence, special tokens
        removed.
    """
    # Calling the tokenizer (instead of .encode) also returns the attention
    # mask, whose absence triggered the "attention mask ... not set" warning.
    encoded = tokenizer(seed_text, return_tensors='pt')
    # Follow the model's own device rather than a module-level `device`.
    input_ids = encoded['input_ids'].to(model.device)
    attention_mask = encoded['attention_mask'].to(model.device)

    # Make the padding id explicit: fall back to EOS when the tokenizer has no
    # pad token, which generate() otherwise does implicitly with a warning.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            pad_token_id=pad_token_id,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=50,
            top_p=0.95,
        )

    # Decode the generated text, one entry per returned sequence.
    return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output]

# Test the model
seed_text = "Guvi is known for"
generated_texts = generate_text(model, tokenizer, seed_text, max_length=10, temperature=0.7, num_return_sequences=3)

for i, text in enumerate(generated_texts):
    print(f"Generated Text {i + 1}:\n{text}\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text 1:
Guvi is known for his work in the field

Generated Text 2:
Guvi is known for his work in the field

Generated Text 3:
Guvi is known for his work with community members



In [None]:
!pip install transformers fastapi uvicorn nest-asyncio pyngrok

Collecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.6


In [None]:
from fastapi import FastAPI, Request
from pydantic import BaseModel
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import nest_asyncio
from pyngrok import ngrok
import uvicorn

app = FastAPI()

# Load the fine-tuned model and tokenizer
model_name_or_path = "./fine_tuned_model"
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

class Query(BaseModel):
    text: str

def _generate_completion(prompt):
    """Return one sampled continuation of ``prompt`` from the fine-tuned model."""
    encoded = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        output = model.generate(
            encoded["input_ids"].to(device),
            # Passing the mask and an explicit pad id removes the
            # "attention mask and the pad token id were not set" warning.
            attention_mask=encoded["attention_mask"].to(device),
            pad_token_id=tokenizer.eos_token_id,
            max_length=100,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95,
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)


# POST variant: the prompt arrives as JSON matching the Query model.
# Named differently from the GET handler so neither definition shadows the
# other; the route paths are unchanged.
@app.post("/generate/")
async def generate_text_post(query: Query):
    return {"generated_text": _generate_completion(query.text)}


# GET variant: the prompt arrives as the `text` query parameter.
@app.get("/generatee/")
async def generate_text(text: str):
    return {"generated_text": _generate_completion(text)}

# Authenticate ngrok
import os

from pyngrok import conf, ngrok

# Take the auth token from the environment instead of hard-coding it; the
# token that used to be embedded here has been exposed and should be rotated.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN")
if not ngrok_auth_token:
    raise RuntimeError("Set the NGROK_AUTHTOKEN environment variable before starting the tunnel.")
conf.get_default().auth_token = ngrok_auth_token

# Start ngrok
ngrok_tunnel = ngrok.connect(8000)
print("Public URL:", ngrok_tunnel.public_url)

# Allow nested asyncio
nest_asyncio.apply()

# Run the FastAPI app
uvicorn.run(app, host="0.0.0.0", port=8000)

Public URL: https://fc13-34-16-167-87.ngrok-free.app


INFO:     Started server process [5922]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     203.28.245.177:0 - "GET / HTTP/1.1" 404 Not Found
INFO:     203.28.245.177:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


INFO:     203.28.245.177:0 - "GET /generatee/?text=Guvi%20is HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [5922]


KeyboardInterrupt: 

https://422e-34-136-54-234.ngrok-free.app/generatee/?text=google%20is%20for

In [None]:
import requests

# Define the public URL and the endpoint
public_url = "https://fc13-34-16-167-87.ngrok-free.app/"
endpoint = "/generate/"

# Join with exactly one slash. Plain concatenation of a base ending in "/"
# and a path starting with "/" yields "//generate/", which is not the
# registered route.
url = public_url.rstrip("/") + "/" + endpoint.lstrip("/")

# Define the data to be sent in the POST request
data = {
    "text": "Guvi is known for"
}

try:
    # Send the POST request; the timeout keeps the cell from hanging forever
    # if the tunnel or server is down.
    response = requests.post(url, json=data, timeout=60)
except requests.RequestException as req_err:
    print(f"Request error: {req_err}")
else:
    # Check if the request was successful
    if response.status_code == 200:
        # Attempt to parse JSON response. JSON decode errors are ValueError
        # subclasses, so this needs no extra import.
        try:
            json_response = response.json()
        except ValueError as json_err:
            print(f"Failed to decode JSON response: {json_err}")
            print("Response text:", response.text)
        else:
            print("Response:", json_response)
    else:
        print(f"Request failed with status code {response.status_code}: {response.text}")

Request failed with status code 404: <!DOCTYPE html>
<html class="h-full" lang="en-US" dir="ltr">
  <head>
    <link rel="preload" href="https://cdn.ngrok.com/static/fonts/euclid-square/EuclidSquare-Regular-WebS.woff" as="font" type="font/woff" crossorigin="anonymous" />
    <link rel="preload" href="https://cdn.ngrok.com/static/fonts/euclid-square/EuclidSquare-RegularItalic-WebS.woff" as="font" type="font/woff" crossorigin="anonymous" />
    <link rel="preload" href="https://cdn.ngrok.com/static/fonts/euclid-square/EuclidSquare-Medium-WebS.woff" as="font" type="font/woff" crossorigin="anonymous" />
    <link rel="preload" href="https://cdn.ngrok.com/static/fonts/euclid-square/EuclidSquare-Semibold-WebS.woff" as="font" type="font/woff" crossorigin="anonymous" />
    <link rel="preload" href="https://cdn.ngrok.com/static/fonts/euclid-square/EuclidSquare-MediumItalic-WebS.woff" as="font" type="font/woff" crossorigin="anonymous" />
    <link rel="preload" href="https://cdn.ngrok.com/stati

#STREAMLIT APP ON THE LLM

In [None]:
!pip install streamlit transformers torch

Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-

In [None]:
%%writefile app.py
import streamlit as st
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Checkpoint to serve. "gpt2" is the base model from the Hugging Face Hub;
# point this at "./fine_tuned_model" to serve the fine-tuned weights instead.
model_name_or_path = "gpt2"

# Use the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@st.cache_resource
def _load_model_and_tokenizer(name_or_path):
    """Load the model (placed on ``device``) and its tokenizer.

    Streamlit re-runs this script on every widget interaction.
    ``st.cache_resource`` keeps the loaded objects alive across those reruns
    so the weights are read only once per server process.
    """
    loaded_model = GPT2LMHeadModel.from_pretrained(name_or_path)
    loaded_model.to(device)
    loaded_tokenizer = GPT2Tokenizer.from_pretrained(name_or_path)
    return loaded_model, loaded_tokenizer


model, tokenizer = _load_model_and_tokenizer(model_name_or_path)

# Define the text generation function
def generate_text(model, tokenizer, seed_text, max_length=100, temperature=1.0, num_return_sequences=1):
    """Sample continuations of ``seed_text`` and return them as decoded strings.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``; used to encode the prompt and
            decode the outputs.
        seed_text: Prompt to continue.
        max_length: Total sequence length passed to ``generate`` (prompt plus
            generated tokens).
        temperature: Sampling temperature passed to ``generate``.
        num_return_sequences: Number of samples to draw for the prompt.

    Returns:
        A list of decoded strings, one per generated sequence, with special
        tokens stripped.
    """
    input_ids = tokenizer.encode(seed_text, return_tensors='pt').to(device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            # A single, unpadded prompt: every position is a real token.
            attention_mask=torch.ones_like(input_ids),
            # Pass EOS as the pad id explicitly instead of relying on
            # generate()'s implicit fallback (GPT-2 defines no pad token).
            pad_token_id=tokenizer.eos_token_id,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=50,
            top_p=0.95,
        )
    # One row of `output` per returned sequence; decode each row directly.
    return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output]

# Streamlit app: page header
st.title("Text Generation with GPT-2")
st.write("This app generates text using a fine-tuned GPT-2 model. Enter a prompt and the model will generate a continuation.")

# Input widgets: prompt text plus the two generation settings
seed_text = st.text_input("Enter your prompt:", "Guvi is known for")
max_length = st.slider("Max Length:", min_value=50, max_value=500, value=100)
temperature = st.slider("Temperature:", min_value=0.1, max_value=2.0, value=1.0)

# Generation runs only on the rerun triggered by pressing the button.
generate_clicked = st.button("Generate")
if generate_clicked:
    with st.spinner("Generating text..."):
        # num_return_sequences is left at its default, so one text comes back.
        results = generate_text(model, tokenizer, seed_text, max_length, temperature)
        for number, text in enumerate(results, start=1):
            st.subheader(f"Generated Text {number}")
            st.write(text)

Writing app.py


In [None]:
!pip install pyngrok



In [None]:
import os
import subprocess
import time

from pyngrok import conf, ngrok

# Authenticate ngrok. The token is a credential, so it is read from the
# environment instead of being embedded in the notebook source.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN")
if not ngrok_auth_token:
    raise RuntimeError(
        "Set the NGROK_AUTH_TOKEN environment variable before starting the tunnel."
    )
conf.get_default().auth_token = ngrok_auth_token

# Run the Streamlit app in the background
process = subprocess.Popen(['streamlit', 'run', 'app.py'])

# Stays None until the tunnel exists, so cleanup knows whether to disconnect.
public_url = None
try:
    # Give the Streamlit app a few seconds to start
    time.sleep(5)

    # Expose the Streamlit app to the web using ngrok. connect() returns a
    # tunnel object; the URL string is its `public_url` attribute.
    tunnel = ngrok.connect(addr="8501")
    public_url = tunnel.public_url
    print(f"Public URL: {public_url}")

    # Keep the Colab cell running for as long as Streamlit itself is alive.
    while process.poll() is None:
        time.sleep(1)
except KeyboardInterrupt:
    print("Stopping Streamlit app...")
finally:
    # Runs on interrupt, on Streamlit exiting, and on any startup failure, so
    # neither the child process nor the tunnel is left behind.
    process.terminate()
    if public_url is not None:
        # disconnect() takes the URL string, not the tunnel object.
        ngrok.disconnect(public_url)
    ngrok.kill()

Public URL: NgrokTunnel: "https://3186-34-16-167-87.ngrok-free.app" -> "http://localhost:8501"
