In [46]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import numpy as np
from transformers import AdamW, get_linear_schedule_with_warmup
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [47]:
df = pd.read_csv('questions.csv')
mytext = df['question1'].tolist() 
mytext[:10]


['What is the step by step guide to invest in share market in india?',
 'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
 'How can I increase the speed of my internet connection while using a VPN?',
 'Why am I mentally very lonely? How can I solve it?',
 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?',
 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?',
 'Should I buy tiago?',
 'How can I be a good geologist?',
 'When do you use シ instead of し?',
 'Motorola (company): Can I hack my Charter Motorolla DCX3400?']

In [48]:
max_seq_len = 512
vocab_size = 30000
embed_dim = 512
num_layers = 6
num_heads = 8
hidden_dim = 1024
batch_size = 16
epochs = 10
learning_rate = 1e-4
warmup_steps = 1000

In [49]:
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=max_seq_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer.encode(text, max_length=self.max_len, padding='max_length', truncation=True)
        return torch.tensor(encoding)

In [50]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, hidden_dim, max_seq_len):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = self.get_positional_encoding(max_seq_len, embed_dim)
        
        encoder_layers = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        
        self.fc = nn.Linear(embed_dim, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)
    
    def forward(self, x):
        x = self.embedding(x) + self.positional_encoding[:x.size(1), :]
        x = self.transformer_encoder(x)
        x = self.fc(x)
        return self.softmax(x)
    
    def get_positional_encoding(self, max_seq_len, embed_dim):
        pe = torch.zeros(max_seq_len, embed_dim)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-np.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)

In [51]:
texts = mytext
tokenizer = word_tokenize
train_texts, val_texts = train_test_split(texts, test_size=0.2)

train_dataset = TextDataset(train_texts, tokenizer)
val_dataset = TextDataset(val_texts, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

model = TransformerModel(vocab_size=vocab_size, embed_dim=embed_dim, num_layers=num_layers,
                         num_heads=num_heads, hidden_dim=hidden_dim, max_seq_len=max_seq_len)
optimizer = AdamW(model.parameters(), lr=learning_rate)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=epochs*len(train_loader))
criterion = nn.CrossEntropyLoss()



In [52]:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch.long().to('cpu')
            outputs = model(input_ids)
            
            outputs = outputs.view(-1, vocab_size)
            input_ids = input_ids.view(-1)
            loss = criterion(outputs, input_ids)
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}')

In [53]:
def evaluate_model(model, val_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids =batch.long().to('cpu')
            outputs = model(input_ids)
            
            outputs =outputs.view(-1, vocab_size)
            input_ids = input_ids.view(-1)
            loss = criterion(outputs, input_ids)
            total_loss +=loss.item()
    return total_loss/len(val_loader)

In [54]:
model.to('cpu')
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)
train_model(model,train_loader, val_loader, criterion, optimizer,scheduler, epochs)
val_loss = evaluate_model(model, val_loader,criterion)
print(f'Validation Loss:{val_loss}')

AttributeError: 'function' object has no attribute 'encode'