In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import warnings

In [42]:
warnings.filterwarnings("ignore", category=UserWarning)

In [43]:
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7a646cdc4a30>

In [44]:
data = pd.read_csv("/content/Book1.csv")


In [45]:
def preprocess_data(data):
    # Combine all output columns into a single target string
    data['target'] = data.apply(lambda row: f"Person: {row['Person']} | Wake Time: {row['Wake Time']} | Morning Activity: {row['Morning Activity']} | Return Time: {row['Return Time']} | Current Status: {row['Current Status']} | Task: {row['Task']} | Task Subject: {row['Task Subject']}", axis=1)
    return data

data = preprocess_data(data)

In [46]:
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)


In [47]:
class ParagraphDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        paragraph = self.data.iloc[idx]['Paragraph']
        target = self.data.iloc[idx]['target']

        # Tokenize the input and target
        input_encoding = self.tokenizer(
            f"generate structured data: {paragraph}",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        target_encoding = self.tokenizer(
            target,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }


In [48]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [49]:
train_dataset = ParagraphDataset(train_data, tokenizer)
val_dataset = ParagraphDataset(val_data, tokenizer)

In [50]:

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [51]:
optimizer = AdamW(model.parameters(), lr=5e-5)




In [52]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [55]:
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()
    total_loss = 0

    # Training phase
    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Training Loss: {avg_train_loss}")

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss}")



Epoch 1/3


100%|██████████| 17/17 [01:39<00:00,  5.88s/it]


Training Loss: 1.2965797536513384


100%|██████████| 2/2 [00:03<00:00,  1.85s/it]


Validation Loss: 0.482272133231163
Epoch 2/3


100%|██████████| 17/17 [01:38<00:00,  5.81s/it]


Training Loss: 1.026155345580157


100%|██████████| 2/2 [00:03<00:00,  1.82s/it]


Validation Loss: 0.35341329872608185
Epoch 3/3


100%|██████████| 17/17 [01:36<00:00,  5.70s/it]


Training Loss: 0.7559111924732432


100%|██████████| 2/2 [00:03<00:00,  1.67s/it]

Validation Loss: 0.28027577698230743





In [56]:
# Save the trained model
model.save_pretrained("t5-structured-data-model")
tokenizer.save_pretrained("t5-structured-data-tokenizer")

('t5-structured-data-tokenizer/tokenizer_config.json',
 't5-structured-data-tokenizer/special_tokens_map.json',
 't5-structured-data-tokenizer/spiece.model',
 't5-structured-data-tokenizer/added_tokens.json')

In [57]:


# Inference function
def generate_structured_data(paragraph, model, tokenizer, device):
    model.eval()
    input_ids = tokenizer.encode(f"generate structured data: {paragraph}", return_tensors="pt").to(device)
    outputs = model.generate(input_ids, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [58]:
sample_paragraph = "Daniel wakes up at 5:30 AM every day. Daniel reads a book and returns by noon. Currently, Daniel is in the cafe. Daniel has to buy snacks for the project."
structured_output = generate_structured_data(sample_paragraph, model, tokenizer, device)
print("Generated Structured Output:")
print(structured_output)

Generated Structured Output:

