# Creating GSM8K Dataset

In [None]:
# Mount Google Drive at /content/gdrive so the 'My Drive/marc' folders used by the cells below are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!pip install datasets==1.0.2
!pip install tqdm==4.57.0
!pip install Cython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!mkdir '/content/gdrive/My Drive/marc'
!mkdir '/content/gdrive/My Drive/marc/dataset'
!mkdir '/content/gdrive/My Drive/marc/model'
!mkdir '/content/gdrive/My Drive/marc/tokenizer'
!ls

mkdir: cannot create directory ‘/content/gdrive/My Drive/marc’: File exists
mkdir: cannot create directory ‘/content/gdrive/My Drive/marc/dataset’: File exists
mkdir: cannot create directory ‘/content/gdrive/My Drive/marc/model’: File exists
mkdir: cannot create directory ‘/content/gdrive/My Drive/marc/tokenizer’: File exists
gdrive	lightning_logs	sample_data


In [None]:
import pandas as pd
import json
from sklearn.utils import shuffle
import re

# Do not truncate long text columns when a DataFrame is displayed.
pd.set_option("display.max_colwidth", None)

# Raw GSM8K JSONL splits read by QuestionDatasetPreparation.prepare().
path_to_training_json = "/content/gdrive/My Drive/marc/train.jsonl"
path_to_validation_json = "/content/gdrive/My Drive/marc/test.jsonl"

# CSV files written by QuestionDatasetPreparation.prepare().
path_to_training_csv = "/content/gdrive/My Drive/marc/dataset/training_dataset.csv"
path_to_validation_csv = "/content/gdrive/My Drive/marc/dataset/validation_dataset.csv"



class QuestionDatasetPreparation:
    """Converts the raw GSM8K JSONL splits into shuffled CSV files.

    Every record read from JSONL gets a trailing newline appended to its
    question, an ``<|endoftext|>`` suffix appended to its answer, and a
    ``solution`` field holding the number that follows the ``####`` marker.
    The CSV columns are ``question``, ``answer`` (the extracted number) and
    ``explanation`` (the full worked answer text).
    """

    # "#### " followed by an optionally negative number made of digits, dots
    # and commas. Compiled once instead of on every call.
    _ANSWER_PATTERN = re.compile(r"#### (\-?[0-9\.\,]+)")

    @staticmethod
    def __prepare_dataset(df, dataset):
        """Fill ``df`` in place with one row per record, at labels 0..n-1.

        Args:
            df: DataFrame with the columns question / answer / explanation.
            dataset: iterable of dicts with "question", "solution", "answer".
        """
        for position, data in enumerate(dataset):
            # Column order: question, answer (numeric solution), explanation (full text).
            df.loc[position] = [data["question"], data["solution"], data["answer"]]

    @staticmethod
    def __extract_answer(completion):
        """Return the final numeric answer with comma separators removed.

        Args:
            completion: answer text containing a ``#### <number>`` marker.

        Returns:
            The matched number as a string, e.g. ``"1234"`` for ``#### 1,234``.

        Raises:
            ValueError: if ``completion`` contains no ``#### <number>`` marker.
        """
        match = QuestionDatasetPreparation._ANSWER_PATTERN.search(completion)

        if match is None:
            # Raising a bare string is itself a TypeError in Python 3, which
            # hid the real problem; raise a proper exception instead.
            raise ValueError("Invalid Answer")

        return match.group(1).strip().replace(",", "")

    @staticmethod
    def __open_math_dataset(path):
        """Read a JSONL file and return its records, augmented in place.

        Args:
            path: path to a JSONL file whose records have "question" and
                "answer" keys.

        Returns:
            List of dicts with updated "question"/"answer" and a new "solution".

        Raises:
            ValueError: if a record's answer has no ``#### <number>`` marker.
        """
        with open(path, encoding="utf-8") as f:
            # Lines keep their trailing newline, so a blank line is "\n"
            # (truthy); strip before testing so blank lines are skipped.
            dataset = [json.loads(line) for line in f if line.strip()]

        for data in dataset:
            data["question"] = data["question"] + "\n"
            data["answer"] = data["answer"] + "<|endoftext|>"
            data["solution"] = QuestionDatasetPreparation.__extract_answer(data["answer"])

        return dataset

    @staticmethod
    def prepare():
        """Build the training/validation CSV files from the JSONL splits.

        Reads the module-level ``path_to_*_json`` files, shuffles each split
        and writes it (without the index) to the matching ``path_to_*_csv``.
        """
        training_dataset = QuestionDatasetPreparation.__open_math_dataset(path_to_training_json)
        validation_dataset = QuestionDatasetPreparation.__open_math_dataset(path_to_validation_json)

        # Create Training & Validation DataFrames
        df_training = pd.DataFrame(columns=['question', 'answer', 'explanation'])
        df_validation = pd.DataFrame(columns=['question', 'answer', 'explanation'])

        # Prepare Training & Validation Datasets
        QuestionDatasetPreparation.__prepare_dataset(df_training, training_dataset)
        QuestionDatasetPreparation.__prepare_dataset(df_validation, validation_dataset)

        # Shuffle Dataset
        df_training = shuffle(df_training)
        df_validation = shuffle(df_validation)

        # Save To CSV Files
        df_training.to_csv(path_to_training_csv, index=False)
        df_validation.to_csv(path_to_validation_csv, index=False)

In [None]:
# Read the JSONL splits and write the shuffled training/validation CSV files.
QuestionDatasetPreparation.prepare()

In [1]:
!pip install --quiet transformers==4.27.3
!pip install --quiet datasets==2.10.1
!pip install --quiet torch==2.0.0
!pip install --quiet scikit-learn==1.2.2
!pip install --quiet sentencepiece
!pip install --quiet tokenizers

In [2]:
# Mount Google Drive to access the dataset (train.jsonl / test.jsonl are read from 'My Drive/marc').
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import json
import re
from datasets import Dataset

def process_answer(answer):
    """Clean one GSM8K answer string.

    Removes every ``<<...>>`` span and rewrites the ``####`` marker as
    "The answer is".
    """
    without_annotations = re.sub(r"<<.*?>>", "", answer)
    return without_annotations.replace("####", "The answer is")

def read_dataset(path):
    """Load a JSONL file and clean each record's answer.

    Args:
        path: path to a JSONL file whose records have an "answer" key.

    Returns:
        List of dicts, one per non-blank line, with "answer" replaced by
        ``process_answer(answer)``.
    """
    with open(path, encoding="utf-8") as f:
        # Lines keep their trailing newline, so a blank line is "\n" (truthy);
        # strip before testing so blank lines are skipped instead of crashing
        # json.loads.
        dataset = [json.loads(line) for line in f if line.strip()]

    for data in dataset:
        data["answer"] = process_answer(data["answer"])

    return dataset

def load_math_dataset():
    """Return the (training, testing) GSM8K splits as ``Dataset`` objects.

    Both splits are read from JSONL files under 'My Drive/marc' through
    ``read_dataset`` and wrapped with ``Dataset.from_list``.
    """
    split_paths = (
        "/content/gdrive/My Drive/marc/train.jsonl",
        "/content/gdrive/My Drive/marc/test.jsonl",
    )

    # Training split first, then test split.
    training_dataset, testing_dataset = (
        Dataset.from_list(read_dataset(split_path)) for split_path in split_paths
    )

    return training_dataset, testing_dataset


In [13]:
import datasets
import numpy as np
import transformers
from datasets import Dataset
from functools import partial
import torch

def preprocess_data(dataset, tokenizer, max_input_length=1024, max_target_length=256):
    """Tokenize a batch of questions and answers for seq2seq fine-tuning.

    Args:
        dataset: batch mapping with "question" and "answer" entries.
        tokenizer: tokenizer callable that accepts ``text_target`` and exposes
            ``pad_token_id``.
        max_input_length: length questions are padded/truncated to.
        max_target_length: length answers are padded/truncated to.

    Returns:
        The tokenized questions with an added "labels" entry holding the
        answer token ids, where padding positions are set to -100.
    """
    inputs = dataset["question"]
    targets = dataset["answer"]

    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)
    labels = tokenizer(text_target=targets, max_length=max_target_length, padding="max_length", truncation=True)

    # Labels are padded to a fixed length. Replace pad ids with -100 (the
    # index the loss ignores) so padding does not contribute to the loss.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_token_id else -100) for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs

def begin_training(
    training_dataset,
    testing_dataset,
    resume_from_checkpoint="/content/gdrive/My Drive/marc/flan-model/checkpoint-1500",
):
    """Fine-tune google/flan-t5-base on the given splits and save the result.

    Args:
        training_dataset: dataset with "question" and "answer" columns.
        testing_dataset: dataset with the same columns, used for evaluation.
        resume_from_checkpoint: checkpoint directory to resume from. When it
            is falsy or the directory does not exist, fine-tuning starts from
            the pretrained weights instead of failing.

    Side effects:
        Writes checkpoints, the final model and the tokenizer under
        'My Drive/marc/flan-model'.
    """
    import os  # local import: only needed for the checkpoint existence check

    output_dir = "/content/gdrive/My Drive/marc/flan-model"

    tokenizer = transformers.AutoTokenizer.from_pretrained("google/flan-t5-base")
    model = transformers.AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

    processing_function = partial(preprocess_data, tokenizer=tokenizer)

    training_dataset = training_dataset.map(
        processing_function,
        batched=True,
        remove_columns=training_dataset.column_names,
        load_from_cache_file=False,
        desc="Tokenizing Training Dataset"
    )

    testing_dataset = testing_dataset.map(
        processing_function,
        batched=True,
        num_proc=8,
        remove_columns=testing_dataset.column_names,
        load_from_cache_file=False,
        desc="Tokenizing Test Dataset"
    )

    training_arguments = transformers.TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="steps",
        eval_steps=20,
        do_train=True,
        num_train_epochs=1,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        eval_accumulation_steps=1
    )

    trainer = transformers.Trainer(
        model=model,
        args=training_arguments,
        train_dataset=training_dataset,
        eval_dataset=testing_dataset
    )

    # A missing checkpoint directory used to abort the run (e.g. on a fresh
    # Drive); fall back to a normal run from the pretrained weights instead.
    if resume_from_checkpoint and not os.path.isdir(resume_from_checkpoint):
        print(f"Checkpoint not found at {resume_from_checkpoint!r}; starting from the pretrained weights.")
        resume_from_checkpoint = None

    trainer.train(resume_from_checkpoint=resume_from_checkpoint or None)

    trainer.save_model(f"{output_dir}/model")
    tokenizer.save_pretrained(f"{output_dir}/tokenizer")

def train_model():
    """Load the GSM8K splits and fine-tune on them via ``begin_training``.

    NOTE: a later cell in this notebook defines another ``train_model``,
    which replaces this one once that cell has been run.
    """
    begin_training(*load_math_dataset())

In [14]:
# Run the fine-tuning pipeline defined above (loads the splits, trains, saves model and tokenizer).
train_model()

Tokenizing Training Dataset:   0%|          | 0/7473 [00:00<?, ? examples/s]

Tokenizing Test Dataset (num_proc=8):   0%|          | 0/1319 [00:00<?, ? examples/s]



  0%|          | 0/2264 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss


In [19]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from google.colab import drive

# Release cached, unused GPU memory before loading the model.
torch.cuda.empty_cache()
drive.mount('/content/gdrive')

# Locations written by the training cell (trainer.save_model / tokenizer.save_pretrained).
path_to_tokenizer = "/content/gdrive/My Drive/marc/flan-model/tokenizer"
path_to_model = "/content/gdrive/My Drive/marc/flan-model/model"

tokenizer = AutoTokenizer.from_pretrained(path_to_tokenizer)
model = AutoModelForSeq2SeqLM.from_pretrained(path_to_model)

# Prefer the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device: ", device)
model = model.to(device)

class TestModel:
    @staticmethod
    def test_t5(question):
      encoded_text = tokenizer.encode(question, return_tensors="pt").cuda()
      model_output = model.generate(encoded_text, do_sample=True, top_p=0.9, max_length=512).cpu()
      answer = tokenizer.decode(model_output[0], skip_special_tokens=True)
      return answer

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Device:  cuda


In [26]:
# Sample word problem used to spot-check the fine-tuned model.
question = (
    "Eliza's rate per hour for the first 40 hours she works each week is $10. "
    "She also receives an overtime pay of 1.2 times her regular hourly rate. "
    "If Eliza worked for 45 hours this week, how much are her earnings for this week?"
)
print(TestModel.test_t5(question))

If Eliza worked for 45 hours this week, then she earned 40*10=$800 She received an overtime pay of 1.2*90=$120 The total amount of earnings she earned is 10*120=$600 This week, Eliza earned a total of $1200 + $600 = $1800 #### 1800


# Tune Model

In [None]:
!pip install --quiet transformers==4.27.3
!pip install --quiet datasets==2.10.1
!pip install --quiet torch==2.0.0
!pip install --quiet scikit-learn==1.2.2
!pip install --quiet sentencepiece
!pip install --quiet tokenizers

In [30]:
from datasets import Dataset
import datasets
import numpy as np
import transformers
from datasets import Dataset
from functools import partial
import torch
torch.cuda.empty_cache()
from datetime import datetime

class ModelTuner:
    """Fine-tunes a seq2seq checkpoint with the Hugging Face ``Trainer``."""

    @staticmethod
    def __prepare_training_testing_dataset(process_dataset, training_dataset, testing_dataset):
        """Apply ``process_dataset`` to both splits in batched mode.

        The original columns of each split are removed from the result; the
        test split is mapped with ``num_proc=8``.

        Returns:
            ``(mapped_training_dataset, mapped_testing_dataset)``.
        """
        tokenized_training = training_dataset.map(
            process_dataset,
            desc="Tokenizing Training Dataset",
            load_from_cache_file=False,
            remove_columns=training_dataset.column_names,
            batched=True,
        )
        tokenized_testing = testing_dataset.map(
            process_dataset,
            desc="Tokenizing Test Dataset",
            load_from_cache_file=False,
            remove_columns=testing_dataset.column_names,
            num_proc=8,
            batched=True,
        )
        return tokenized_training, tokenized_testing

    @staticmethod
    def __get_training_arguments(output_directory):
        """Build the ``TrainingArguments`` used for every tuning run."""
        settings = {
            "output_dir": output_directory,
            "evaluation_strategy": "steps",
            "eval_steps": 20,
            "do_train": True,
            "num_train_epochs": 1,
            "per_device_train_batch_size": 2,
            "per_device_eval_batch_size": 2,
            "gradient_accumulation_steps": 4,
            "eval_accumulation_steps": 1,
        }
        return transformers.TrainingArguments(**settings)

    @staticmethod
    def tune_model(training_dataset, testing_dataset, process_data, model_name, output_directory):
        """Fine-tune ``model_name`` and save the model and tokenizer.

        Args:
            training_dataset: split used for training.
            testing_dataset: split used for evaluation.
            process_data: callable taking ``(batch, tokenizer=...)`` that
                produces the model inputs for a batch.
            model_name: checkpoint passed to ``from_pretrained``.
            output_directory: used as the trainer output dir; the model and
                tokenizer are saved in timestamped folders beneath it.
        """
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_name)

        # Bind the tokenizer so the mapper only needs the batch argument.
        tokenize_batch = partial(process_data, tokenizer=tokenizer)
        train_split, eval_split = ModelTuner.__prepare_training_testing_dataset(
            tokenize_batch, training_dataset, testing_dataset
        )

        trainer = transformers.Trainer(
            model=model,
            args=ModelTuner.__get_training_arguments(output_directory),
            train_dataset=train_split,
            eval_dataset=eval_split,
        )
        trainer.train()

        # Timestamped sub-folders keep the artifacts of separate runs apart.
        timestamp = datetime.now().strftime("%d-%m-%Y %H:%M:%S").replace(" ", "-")
        trainer.save_model(f"{output_directory}/model/{timestamp}")
        tokenizer.save_pretrained(f"{output_directory}/tokenizer/{timestamp}")

        print("Model Trained Successfully")

# Train To Solve Maths Problems


In [31]:
# Mount Google Drive to access the dataset (train.jsonl / test.jsonl are read from 'My Drive/marc').
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [32]:
import json
import re
from datasets import Dataset

# Load GSM8K Dataset
class MathDataset:
    """Loads GSM8K JSONL splits and tokenizes them for seq2seq fine-tuning."""

    @staticmethod
    def __process_answer(answer):
        """Remove ``<<...>>`` spans and rewrite ``####`` as "The answer is"."""
        # Process Answer From GSM8K Dataset
        CALCULATOR_PATTERN = r"<<.*?>>"
        SOLUTION_PATTERN = "####"
        answer = re.sub(CALCULATOR_PATTERN, "", answer)
        answer = answer.replace(SOLUTION_PATTERN, "The answer is")
        return answer

    @staticmethod
    def __read_dataset(path):
        """Read a JSONL file and return its records with cleaned answers.

        Args:
            path: path to a JSONL file whose records have an "answer" key.

        Returns:
            List of dicts, one per non-blank line.
        """
        with open(path, encoding="utf-8") as f:
            # Lines keep their trailing newline, so a blank line is "\n"
            # (truthy); strip before testing so blank lines are skipped
            # instead of crashing json.loads.
            dataset = [json.loads(line) for line in f if line.strip()]

        for data in dataset:
            data["answer"] = MathDataset.__process_answer(data["answer"])

        return dataset

    @staticmethod
    def process_data(dataset, tokenizer, max_input_length=1024, max_target_length=256):
        """Tokenize a batch of questions and answers.

        Args:
            dataset: batch mapping with "question" and "answer" entries.
            tokenizer: tokenizer callable that accepts ``text_target`` and
                exposes ``pad_token_id``.
            max_input_length: length questions are padded/truncated to.
            max_target_length: length answers are padded/truncated to.

        Returns:
            The tokenized questions with an added "labels" entry holding the
            answer token ids, where padding positions are set to -100.
        """
        # Generate Inputs and Targets
        inputs, targets = dataset["question"], dataset["answer"]

        # Tokenize Input; targets go through ``text_target`` so they are
        # encoded as labels, matching the earlier preprocess_data cell.
        model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)
        model_labels = tokenizer(text_target=targets, max_length=max_target_length, padding="max_length", truncation=True)

        # Labels are padded to a fixed length. Replace pad ids with -100 (the
        # index the loss ignores) so padding does not contribute to the loss.
        pad_token_id = tokenizer.pad_token_id
        model_inputs["labels"] = [
            [(token_id if token_id != pad_token_id else -100) for token_id in label_ids]
            for label_ids in model_labels["input_ids"]
        ]
        return model_inputs

    @staticmethod
    def load_dataset(path_to_training_json, path_to_test_json):
        """Return ``(training_dataset, testing_dataset)`` built from JSONL files."""
        training_dataset = Dataset.from_list(MathDataset.__read_dataset(path_to_training_json))
        testing_dataset = Dataset.from_list(MathDataset.__read_dataset(path_to_test_json))

        return training_dataset, testing_dataset

def train_model():
    """Fine-tune google/flan-t5-base on the GSM8K splits stored on Drive.

    Loads both splits with ``MathDataset.load_dataset`` and hands them to
    ``ModelTuner.tune_model`` together with ``MathDataset.process_data``.
    """
    model_name = "google/flan-t5-base"
    output_directory = "/content/gdrive/My Drive/marc/"
    path_to_training_json = "/content/gdrive/My Drive/marc/train.jsonl"
    path_to_test_json = "/content/gdrive/My Drive/marc/test.jsonl"

    # Load both splits from the JSONL files.
    training_dataset, testing_dataset = MathDataset.load_dataset(
        path_to_training_json=path_to_training_json,
        path_to_test_json=path_to_test_json,
    )

    # Fine-tune and save the model and tokenizer.
    ModelTuner.tune_model(
        training_dataset=training_dataset,
        testing_dataset=testing_dataset,
        process_data=MathDataset.process_data,
        model_name=model_name,
        output_directory=output_directory,
    )

# Run the tuning pipeline defined in this cell (MathDataset + ModelTuner).
train_model()

Tokenizing Training Dataset:   0%|          | 0/7473 [00:00<?, ? examples/s]

Tokenizing Test Dataset (num_proc=8):   0%|          | 0/1319 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


Model Trained Successfully
