<a href="https://colab.research.google.com/github/1ucky40nc3/TREX/blob/research/generation_research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [2]:
import torch

from transformers import GPTNeoForCausalLM
from transformers import GPT2Tokenizer

from typing import List
from typing import Union
from typing import Any

In [3]:
class GPTNeoGenerator:
    """Convenience wrapper around a GPT-Neo causal LM and its tokenizer.

    The instance is callable: it tokenizes a prompt, runs ``model.generate``
    and decodes the generated token ids back to text. The helpers ``greedy``,
    ``beam``, ``sample``, ``top_k`` and ``top_p`` only pre-fill some keyword
    arguments before delegating to ``__call__``.
    """

    # Keyword arguments that configure the decoding of token ids to text.
    # They are routed to ``tokenizer.batch_decode`` only; everything else goes
    # to ``model.generate`` only.
    _DECODE_KWARGS = (
        "skip_special_tokens",
        "clean_up_tokenization_spaces",
        "spaces_between_special_tokens",
    )

    def __init__(self, name, device=None):
        """Load tokenizer and model for ``name`` and move the model to a device.

        Args:
            name: Identifier passed to both ``from_pretrained`` calls.
            device: Target device for the model. When falsy, the result of
                ``get_device()`` is used.
        """
        self.name = name

        self.tokenizer = GPT2Tokenizer.from_pretrained(name)
        # The tokenizer's EOS token id is handed to the model as padding id.
        self.model = GPTNeoForCausalLM.from_pretrained(
            name, pad_token_id=self.tokenizer.eos_token_id)

        self.device = device or self.get_device()
        self.model = self.model.to(self.device)

    def __call__(self, input: str, **kwargs) -> List[str]:
        """Generate text continuations for ``input``.

        Args:
            input: Prompt text.
            **kwargs: Options for ``model.generate``. Keys listed in
                ``_DECODE_KWARGS`` are passed to ``tokenizer.batch_decode``
                instead.

        Returns:
            The decoded sequences returned by ``tokenizer.batch_decode``.
        """
        # Split the options so that neither library call receives arguments
        # that are meant for the other one.
        decode_kwargs = {}
        generate_kwargs = {}
        for key, value in kwargs.items():
            if key in self._DECODE_KWARGS:
                decode_kwargs[key] = value
            else:
                generate_kwargs[key] = value

        encoded = self.tokenizer(
            input, return_tensors="pt"
            ).to(self.device)

        # Explicit keyword arguments take precedence over the encoded inputs.
        tokens = self.model.generate(
            **{**encoded, **generate_kwargs})

        return self.tokenizer.batch_decode(tokens, **decode_kwargs)

    def greedy(self,
               input: str,
               num_return_sequences: int=1,
               **kwargs) -> List[str]:
        """Call the generator ``num_return_sequences`` times with ``kwargs`` only.

        ``num_return_sequences`` is not forwarded to ``model.generate``;
        instead the call is repeated and the first sequence of every run is
        kept. No sampling or beam arguments are added here, so decoding
        follows the defaults of ``generate`` unless ``kwargs`` says otherwise.

        Returns:
            A list with ``num_return_sequences`` decoded sequences.
        """
        return [self(input, **kwargs)[0] for _ in range(num_return_sequences)]

    def beam(self,
             input: str,
             num_beams: int,
             early_stopping: bool=True,
             **kwargs) -> List[str]:
        """Generate with ``num_beams`` and ``early_stopping`` set explicitly."""
        return self(
            input,
            num_beams=num_beams,
            early_stopping=early_stopping,
            **kwargs)

    def sample(self,
               input: str,
               top_k: int=0,
               do_sample: bool=True,
               **kwargs) -> List[str]:
        """Generate with ``do_sample`` enabled and ``top_k`` set (default 0)."""
        return self(
            input,
            top_k=top_k,
            do_sample=do_sample,
            **kwargs)

    def top_k(self,
              input: str,
              top_k: int,
              **kwargs) -> List[str]:
        """Sample via ``sample`` with a caller-chosen ``top_k``."""
        return self.sample(input, top_k=top_k, **kwargs)

    def top_p(self,
              input: str,
              top_p: float,
              **kwargs) -> List[str]:
        """Sample via ``sample`` with a caller-chosen ``top_p``."""
        return self.sample(input, top_p=top_p, **kwargs)

    def get_device(self) -> str:
        """Return ``"cuda:0"`` when CUDA is available, else ``"cpu"``."""
        return "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# Build the generator for the "EleutherAI/gpt-neo-125M" checkpoint; the
# constructor loads tokenizer and model and moves the model to the device.
gpt_neo_125m = GPTNeoGenerator("EleutherAI/gpt-neo-125M")

In [5]:
# Quick manual check: call the generator directly on the prompt "Hello" with
# 5 beams and 5 returned sequences.
# NOTE(review): temperature is passed without do_sample here - confirm it has
# any effect in this configuration.
gpt_neo_125m("Hello", 
             max_length=100,
             num_beams=5,
             num_return_sequences=5, 
             temperature=0.1)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


['Hello everyone, I’m so excited to announce that I’m going to be writing this blog post. I’ve been working on this blog for a while now, and I’m excited to share it with you.\n\nI’ve been working on this blog for a while now, and I’m excited to share it with you. I’ve been working on this blog for a while now, and I’m excited to share',
 'Hello everyone, I’m so excited to announce that I’m going to be writing this blog post. I’ve been working on this for a while now, and I’ve been working on it for a while now. I’ve been working on it for a while now, and I’ve been working on it for a while now. I’ve been working on it for a while now, and I’ve been working on it',
 'Hello everyone, I’m so excited to announce that I’m going to be writing this blog post. I’ve been working on this for a while now, and I’ve been working on it for a while now. I’ve been working on this for a while now, and I’ve been working on it for a while now. I’ve been working on this for a while now, and I’ve been wo

In [4]:
#@title Utils
"""Taken from: https://stackoverflow.com/questions/7267226/range-for-floats/67053708#67053708"""

def frange(start, stop, step, n=None):
    """Yield a WYSIWYG series of floats that mimics ``range``.

    The end point is excluded and every value is rounded to ``n`` decimal
    places so that no extraneous digits appear. When ``n`` is ``None`` it is
    derived from the string representation of ``start``, ``stop`` and
    ``step`` (the largest number of digits after the decimal point).

    Raises ``ValueError`` (on first iteration) when ``step`` is 0.

    EXAMPLES
    ========

    non-WYSIWYG simple list-comprehension

    >>> [.11 + i*.1 for i in range(3)]
    [0.11, 0.21000000000000002, 0.31]

    WYSIWYG result for an increasing sequence

    >>> list(frange(0.11, .33, .1))
    [0.11, 0.21, 0.31]

    and for a decreasing sequence

    >>> list(frange(.345, .1, -.1))
    [0.345, 0.245, 0.145]

    To hit the end point of a sequence that is divisible by the step
    size, make the end point a little bigger by adding half the step size:

    >>> dx = .2
    >>> list(frange(0, 1 + dx/2, dx))
    [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

    """
    if step == 0:
        raise ValueError('step must not be 0')

    if n is None:
        # Count the digits shown after the decimal point of each argument.
        shown_decimals = []
        for value in (start, stop, step):
            text = str(value)
            shown_decimals.append(len(text.split('.')[1]) if '.' in text else 0)
        n = max(shown_decimals)

    # Step pointing away from stop (or start == stop): empty range.
    if step*(stop - start) <= 0:
        return

    if step < 0:
        # Mirror a decreasing range into an increasing one and negate back.
        yield from (-value for value in frange(-start, -stop, -step, n))
        return

    count = round((stop - start)/step)
    # Rounding may undershoot; extend while the next value is still < stop.
    while round(step*count + start, n) < stop:
        count += 1
    yield from (round(start + index*step, n) for index in range(count))

def collect_generation_data(input: str,
                            model: GPTNeoGenerator,
                            beams=None,
                            temps=None,
                            topks=None,
                            topps=None,
                            **kwargs) -> dict:
    """Run ``model`` on ``input`` with several decoding strategies.

    Args:
        input: Prompt handed to every decoding call.
        model: Generator providing ``greedy``, ``beam``, ``sample`` and
            ``top_k``/``top_p``.
        beams: ``num_beams`` values to try. Defaults to
            ``list(range(10, 100, 50))``.
        temps: ``temperature`` values to try. Defaults to
            ``list(frange(0.1, 1.5, 0.8))``.
        topks: ``top_k`` values to try. Defaults to
            ``list(range(10, 100, 50))``.
        topps: ``top_p`` values to try. Defaults to
            ``list(frange(0.1, 1, 0.3))``.
        **kwargs: Extra options forwarded unchanged to every decoding call.

    Returns:
        A dict with two entries: ``"config"`` (the arguments of this call
        under the keys ``input``, ``model``, ``beams``, ``temps``, ``topks``,
        ``topps`` and ``kwargs``) and ``"results"`` (nested lists/dicts keyed
        by ``greedy``, ``beam``, ``sample``, ``top_k``, ``top_p`` and
        ``top_pk``).
    """
    # Defaults are created per call and caller sequences are copied, so the
    # lists stored in the returned config are never shared between calls.
    beams = list(range(10, 100, 50)) if beams is None else list(beams)
    temps = list(frange(0.1, 1.5, 0.8)) if temps is None else list(temps)
    topks = list(range(10, 100, 50)) if topks is None else list(topks)
    topps = list(frange(0.1, 1, 0.3)) if topps is None else list(topps)

    config = {
        "input": input,
        "model": model,
        "beams": beams,
        "temps": temps,
        "topks": topks,
        "topps": topps,
        "kwargs": kwargs,
    }

    def sweep_temperatures(decode, **fixed) -> list:
        """Call ``decode`` once per temperature with ``fixed`` options."""
        return [
            {
                "temperature": t,
                "results": decode(input, temperature=t, **fixed, **kwargs),
            } for t in temps
        ]

    results = {
        # Collect samples decoded via Greedy Search
        "greedy": model.greedy(input, **kwargs),
        # Collect samples decoded via Beam Search; each call's output stays
        # wrapped in a one-element list, as consumers of "results" expect.
        "beam": [
            {
                "num_beams": n,
                "results": [model.beam(input, num_beams=n, **kwargs)],
            } for n in beams
        ],
        # Collect samples decoded via Sampling
        "sample": sweep_temperatures(model.sample),
        # Collect samples decoded via Top-k Sampling
        "top_k": [
            {
                "top_k": k,
                "results": sweep_temperatures(model.top_k, top_k=k),
            } for k in topks
        ],
        # Collect samples decoded via Top-p Sampling
        "top_p": [
            {
                "top_p": p,
                "results": sweep_temperatures(model.top_p, top_p=p),
            } for p in topps
        ],
        # Collect samples decoded via Top-p and Top-k Sampling
        "top_pk": [
            {
                "top_p": p,
                "results": [
                    {
                        "top_k": k,
                        "results": sweep_temperatures(
                            model.top_k, top_p=p, top_k=k),
                    } for k in topks
                ],
            } for p in topps
        ],
    }

    return {"config": config, "results": results}


def recurse_through_data(data: Union[dict, list, str], key: str) -> Union[List[str], str]:
    """Unwrap every ``key`` entry found in nested dicts and lists.

    A dict containing ``key`` is replaced by the value stored under it
    (repeatedly), a list is processed element by element, and anything else -
    including a dict without ``key`` - is returned unchanged.
    """
    # Peel off dict layers for as long as they carry the requested key.
    while isinstance(data, dict) and key in data:
        data = data[key]

    if not isinstance(data, list):
        return data

    return [recurse_through_data(item, key) for item in data]


import numpy as np

def flatten(l: list) -> list:
    """Return the leaves of an arbitrarily nested list as one flat list.

    Lists and tuples are descended into; every other object (strings
    included) is treated as a leaf and returned unchanged, in left-to-right
    order. Unlike a round trip through ``np.array(...).flat`` this also works
    for ragged nesting and does not convert or coerce element types.
    """
    flat = []
    pending = [l]
    while pending:
        item = pending.pop()
        if isinstance(item, (list, tuple)):
            # Push children reversed so they are popped in original order.
            pending.extend(reversed(item))
        else:
            flat.append(item)
    return flat


def process_data(data: dict, key: str) -> list:
    """Extract everything stored under ``key`` and return it as a flat list.

    ``recurse_through_data`` does the filtering; a list result is flattened,
    any other result is wrapped into a one-element list.
    """
    filtered = recurse_through_data(data, key)
    return flatten(filtered) if isinstance(filtered, list) else [filtered]


def repeat_in_intervall(l: list, num_repeat: int, invervall: int) -> list:
    """Repeat each element ``num_repeat`` times, and the whole run ``invervall`` times.

    Example: ``repeat_in_intervall([1, 2], 2, 2)`` gives
    ``[1, 1, 2, 2, 1, 1, 2, 2]``.
    """
    repeated = []
    for _ in range(invervall):
        for element in l:
            repeated.extend(element for _ in range(num_repeat))
    return repeated


def export_data(data: dict):
    """Turn collected generation data into equally long table columns.

    Args:
        data: Dict with a ``"config"`` entry (providing ``beams``, ``temps``,
            ``topks``, ``topps`` and ``kwargs``) and a ``"results"`` entry
            (providing ``greedy``, ``beam``, ``sample``, ``top_k``, ``top_p``
            and ``top_pk``).

    Returns:
        A dict mapping column titles to lists. Parameter columns repeat each
        setting so that it lines up with the generated texts next to it. All
        columns are padded with ``''`` to the length of the longest one.
    """
    config = data["config"]
    results = data["results"]

    beams = config["beams"]
    temps = config["temps"]
    topks = config["topks"]
    topps = config["topps"]

    # Fall back to one sequence per call when the option was not given.
    nseqs = config["kwargs"].get("num_return_sequences", 1)
    ntemp = len(temps)
    ntopk = len(topks)
    ntopp = len(topps)

    export = {}

    export["Greedy Search"] = results["greedy"]

    export["Number Beams"] = repeat_in_intervall(beams, nseqs, 1)
    export["Beam Search"] = process_data(results["beam"], "results")

    export["Temperature Sample"] = repeat_in_intervall(temps, nseqs, 1)
    export["Sample"] = process_data(results["sample"], "results")

    export["Number Top-k"] = repeat_in_intervall(topks, nseqs*ntemp, 1)
    export["Temperature Top-k"] = repeat_in_intervall(temps, nseqs, ntopk)
    export["Top-k"] = process_data(results["top_k"], "results")

    export["Number Top-p"] = repeat_in_intervall(topps, nseqs*ntemp, 1)
    export["Temperature Top-p"] = repeat_in_intervall(temps, nseqs, ntopp)
    export["Top-p"] = process_data(results["top_p"], "results")

    export["Top-p & Top-k Number Top-p"] = repeat_in_intervall(topps, nseqs*ntemp*ntopk, 1)
    export["Top-p & Top-k Number Top-k"] = repeat_in_intervall(topks, nseqs*ntemp, ntopp)
    export["Temperature Top-p & Top-k"] = repeat_in_intervall(temps, nseqs, ntopk*ntopp)
    export["Top-p & Top-k"] = process_data(results["top_pk"], "results")

    # Pad to the longest column actually present rather than assuming the
    # Top-p & Top-k columns are the longest; otherwise a longer column
    # (e.g. many beams) would leave the columns with unequal lengths.
    longest = max(len(column) for column in export.values())
    return {
        title: list(column) + [''] * (longest - len(column))
        for title, column in export.items()
    }

In [6]:
# Options shared by every decoding call of the data collection run.
config = dict(
    max_length=25,
    num_return_sequences=2,
)

In [8]:
# Run all decoding strategies on the prompt "Hi" with the shared options;
# the sweep values (beams, temperatures, top-k, top-p) are left at defaults.
data = collect_generation_data("Hi", gpt_neo_125m, **config)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [44]:
# Convert the nested results into a dict of equally long columns.
export = export_data(data)

In [None]:
import pandas as pd
import datetime


# Write the exported columns to an Excel file named after model and time.
df = pd.DataFrame(export)

# "/" in the model name is replaced so it can be used inside a file name.
model = data["config"]["model"].name
model = model.replace("/", "_")

# Bind the timestamp to `now` first: the f-string below formats `now`, which
# was previously undefined and raised a NameError.
now = datetime.datetime.now()
time = f"{now:%Y%m%d%H%M}"

df.to_excel(f"GenerationResearch_{model}_{time}.xlsx")