In [1]:
! pip install torch torchtext transformers sentencepiece pandas tqdm datasets

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [2]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time

In [3]:
# load data
data_sample = load_dataset("csv", data_files="merged_file.csv")

In [4]:
data_sample

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'cell_id_x', 'notebook_id', 'code_imports', 'defined_functions', 'source_x', 'cell_id_y', 'source_y'],
        num_rows: 828122
    })
})

In [5]:
updated_data = [{'Code': item['source_x'], 'Description': item['source_y']} for item in data_sample['train']]

In [23]:
df = pd.DataFrame(updated_data)

In [24]:
df.head(50)

Unnamed: 0,Code,Description
0,"import os\nPROJECT = ""cloud-training-demos"" # ...",# MNIST Image Classification with TensorFlow o...
1,%%bash\ngcloud config set project $PROJECT\ngc...,# MNIST Image Classification with TensorFlow o...
2,%%bash\nrm -rf mnistmodel.tar.gz mnist_trained...,## Run as a Python module\n\nIn the previous n...
3,%%bash\nOUTDIR=gs://${BUCKET}/mnist/trained_${...,"**Now, let's do it on Cloud ML Engine so we ca..."
4,from google.datalab.ml import TensorBoard\nTen...,## Monitoring training with TensorBoard\n\nUse...
5,"for pid in TensorBoard.list()[""pid""]:\n Ten...",## Monitoring training with TensorBoard\n\nUse...
6,"%%bash\nMODEL_NAME=""mnist""\nMODEL_VERSION=${MO...",## Deploying and predicting with model\n\nDepl...
7,"import json, codecs\nimport matplotlib.pyplot ...","To predict with the model, let's take one of t..."
8,%%bash\ngcloud ml-engine predict \\n --mode...,Send it to the prediction service
9,trainingInput:\n scaleTier: CUSTOM\n mas...,## DO NOT RUN anything beyond this point\n\nTh...


In [25]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split

In [26]:
if torch.cuda.is_available():
    device = torch.device('cuda')

else:
    try:
        device = torch.device('mps')
    except Exception:
        device = torch.device('cpu')

In [27]:
device

device(type='cuda')

In [28]:
model_name = "sagard21/python-code-explainer"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [29]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32100, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [30]:
BATCH_SIZE = 8

In [31]:
df.describe()

Unnamed: 0,Code,Description
count,789629,826331
unique,552574,237424
top,df.head(),# Vertical Scan with Space Charge
freq,1028,1015


In [32]:
# Dataset preparation
class LanguageDataset(Dataset):
    def __init__(self, _df, _tokenizer):
        self.labels = _df.columns
        self.data = _df.to_dict(orient='records')
        self.tokenizer = _tokenizer
        x = self.fittest_max_length(_df)
        self.max_length = x
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        x = self.data[idx][self.labels[0]]
        y = self.data[idx][self.labels[1]]
        text = f"{x} | {y}"
        tokens = self.tokenizer.encode_plus(text, 
                                            return_tensors='pt', 
                                            max_length=1024, 
                                            truncation=True, 
                                            padding='max_length')
        return tokens
    
    def fittest_max_length(self, _df):
        max_length = max(len(max(_df[self.labels[0]], key=len)), len(max(_df[self.labels[1]], key=len)))
        x = 2
        while x < max_length: x = x*2
        return x

In [33]:
data_sample = LanguageDataset(df, tokenizer)

TypeError: object of type 'NoneType' has no len()

In [34]:
data_sample

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'cell_id_x', 'notebook_id', 'code_imports', 'defined_functions', 'source_x', 'cell_id_y', 'source_y'],
        num_rows: 828122
    })
})

In [35]:
train_size = int(0.8 * len(data_sample))
val_size = len(data_sample) - train_size

train_data, val_data = random_split(data_sample, [train_size, val_size])

