In [1]:
!pip install transformers
!pip install bitsandbytes
!pip install datasets
!pip install accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (4

In [2]:
from google.colab import drive
drive.mount('/content/MyDrive')

Mounted at /content/MyDrive


In [3]:
import transformers
import torch
import torch.nn.functional as F
from torch import nn
from torch.cuda.amp import custom_fwd, custom_bwd
from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise
from tqdm.auto import tqdm

In [4]:
class FrozenBNBLinear(nn.Module):
    def __init__(self, weight, absmax, code, bias=None):
        assert isinstance(bias, nn.Parameter) or bias is None
        super().__init__()
        self.out_features, self.in_features = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None
        self.bias = bias

    def forward(self, input):
        output = torch.clone(DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias))
        if self.adapter:
            output += self.adapter(input)
        return output

    @classmethod
    def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear":
        weights_int8, state = quantize_blockise_lowmemory(linear.weight)
        return cls(weights_int8, *state, linear.bias)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.in_features}, {self.out_features})"



In [5]:
class DequantizeAndLinear(torch.autograd.Function):
    @staticmethod
    @custom_fwd
    def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,
                absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        ctx.save_for_backward(input, weights_quantized, absmax, code)
        ctx._has_bias = bias is not None
        return F.linear(input, weights_deq, bias)

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output: torch.Tensor):
        assert not ctx.needs_input_grad[1] and not ctx.needs_input_grad[2] and not ctx.needs_input_grad[3]
        input, weights_quantized, absmax, code = ctx.saved_tensors
        # grad_output: [*batch, out_features]
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        grad_input = grad_output @ weights_deq
        grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None
        return grad_input, None, None, None, grad_bias


In [6]:
class FrozenBNBEmbedding(nn.Module):
    def __init__(self, weight, absmax, code):
        super().__init__()
        self.num_embeddings, self.embedding_dim = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None

    def forward(self, input, **kwargs):
        with torch.no_grad():
            # note: both quantuized weights and input indices are *not* differentiable
            weight_deq = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)
            output = F.embedding(input, weight_deq, **kwargs)
        if self.adapter:
            output += self.adapter(input)
        return output

    @classmethod
    def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding":
        weights_int8, state = quantize_blockise_lowmemory(embedding.weight)
        return cls(weights_int8, *state)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})"



In [7]:
def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):
    assert chunk_size % 4096 == 0
    code = None
    chunks = []
    absmaxes = []
    flat_tensor = matrix.view(-1)
    for i in range((matrix.numel() - 1) // chunk_size + 1):
        input_chunk = flat_tensor[i * chunk_size: (i + 1) * chunk_size].clone()
        quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)
        chunks.append(quantized_chunk)
        absmaxes.append(absmax_chunk)

    matrix_i8 = torch.cat(chunks).reshape_as(matrix)
    absmax = torch.cat(absmaxes)
    return matrix_i8, (absmax, code)

In [8]:
def convert_to_int8(model):
    """Convert linear and embedding modules to 8-bit with optional adapters"""
    for module in list(model.modules()):
        for name, child in module.named_children():
            if isinstance(child, nn.Linear):
                print(name, child)
                setattr(
                    module,
                    name,
                    FrozenBNBLinear(
                        weight=torch.zeros(child.out_features, child.in_features, dtype=torch.uint8),
                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                        code=torch.zeros(256),
                        bias=child.bias,
                    ),
                )
            elif isinstance(child, nn.Embedding):
                setattr(
                    module,
                    name,
                    FrozenBNBEmbedding(
                        weight=torch.zeros(child.num_embeddings, child.embedding_dim, dtype=torch.uint8),
                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                        code=torch.zeros(256),
                    )
                )

In [9]:
class GPTJBlock(transformers.models.gptj.modeling_gptj.GPTJBlock):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self.attn)
        convert_to_int8(self.mlp)


class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)


class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)


transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock  # monkey-patch GPT-J



In [10]:
#config = transformers.GPTJConfig.from_pretrained(f"/content/drive/MyDrive/ML Bootcamp/Capstone/lyric_generation/gpt-j-6b/checkpoint-{checkpoint_num}")
config = transformers.GPTJConfig.from_pretrained("dzionek/distilgpt2-rap")
tokenizer = transformers.AutoTokenizer.from_pretrained("dzionek/distilgpt2-rap")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

You are using a model of type gpt2 to instantiate a model of type gptj. This is not supported for all configurations of models and can yield errors.


vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [21]:
def add_adapters(model, adapter_dim=16):
    assert adapter_dim > 0

    for module in model.modules():
        if isinstance(module, FrozenBNBLinear):
            module.adapter = nn.Sequential(
                nn.Linear(module.in_features, adapter_dim, bias=False),
                nn.Linear(adapter_dim, module.out_features, bias=False),
            )
            nn.init.zeros_(module.adapter[1].weight)
        elif isinstance(module, FrozenBNBEmbedding):
            module.adapter = nn.Sequential(
                nn.Embedding(module.num_embeddings, adapter_dim),
                nn.Linear(adapter_dim, module.embedding_dim, bias=False),
            )
            nn.init.zeros_(module.adapter[1].weight)


    return model

In [13]:
import random
from datasets import Dataset, DatasetDict

def randomize_lines(data_path):
    with open(data_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        print(len(lines))
        random.shuffle(lines)
        return lines

train_data_path = '/content/MyDrive/MyDrive/NLP Project/train_couplets.txt'
test_data_path = '/content/MyDrive/MyDrive/NLP Project/test_couplets.txt'

train_lines = randomize_lines(train_data_path)
test_lines = randomize_lines(test_data_path)

train_dataset = Dataset.from_dict({"text": train_lines})
test_dataset = Dataset.from_dict({"text": test_lines})


# Concatenate train and test datasets into a single dataset
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

tokenizer = transformers.AutoTokenizer.from_pretrained("dzionek/distilgpt2-rap")

def tokenize_function(examples):
  return tokenizer(examples["text"])

tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns="text")

del tokenizer
del dataset

832
8


  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/832 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/8 [00:00<?, ? examples/s]

In [14]:
block_size = 128
checkpoint_num = 8000

def group_texts(examples):
  # Concatenate all texts.
  concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
  #print(concatenated_examples)
  total_length = len(concatenated_examples[list(examples.keys())[0]])
  #print(f"Total length: {total_length}")
  total_length = (total_length // block_size) * block_size
  # Split by chunks of max_len.
  result = {
    k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
    for k, t in concatenated_examples.items()
  }
  result["labels"] = result["input_ids"].copy()
  return result

verses_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=500,
    num_proc=8,
)

Map (num_proc=8):   0%|          | 0/832 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/8 [00:00<?, ? examples/s]

In [15]:
from transformers import AutoTokenizer, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("dzionek/distilgpt2-rap")
gpt = add_adapters(model)

pytorch_model.bin:   0%|          | 0.00/334M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [23]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM

training_args = TrainingArguments(
    output_dir="/content", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=100, # number of training epochs
    # evaluation_strategy="steps",
    # per_device_train_batch_size=4, # batch size for training
    # per_device_eval_batch_size=4,  # batch size for evaluation
    # # eval_steps = 50, # Number of update steps between two evaluations.
    # # save_steps = 2000, # after # steps model is saved
    # save_total_limit = 999,# limits total # of checkpoints
    # load_best_model_at_end = True, # save best checkpoint after training complete
    # warmup_steps = 50,# number of warmup steps for learning rate scheduler
    # # prediction_loss_only=True,
    # save_strategy="steps",  #changed from "steps" to "no" to try to use callback to save entire model
    # weight_decay=0.001,
    # resume_from_checkpoint=f"/content-{checkpoint_num}")
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=verses_datasets['train'],
    eval_dataset=verses_datasets['test'],
    )
    #callbacks=[SaveCallback])

trainer.train()
# print(trainer)

Step,Training Loss
500,1.3023
1000,0.7334
1500,0.4702
2000,0.3355
2500,0.257
3000,0.2072
3500,0.1707
4000,0.1471
4500,0.1333


TrainOutput(global_step=4600, training_loss=0.41116010645161505, metrics={'train_runtime': 885.046, 'train_samples_per_second': 40.789, 'train_steps_per_second': 5.197, 'total_flos': 1179101587046400.0, 'train_loss': 0.41116010645161505, 'epoch': 100.0})

In [24]:
torch.save(gpt, "/content/MyDrive/MyDrive/NLP Project/rap_lyrics_model_distilbert.pt")

In [50]:
import re
def generate_verse():
    prompt = "And that ass feel like jello (jello)"
    device = torch.cuda.current_device()

    print("...getting prediction...")
    tokenizer = transformers.AutoTokenizer.from_pretrained("dzionek/distilgpt2-rap")
    # model = model
    with torch.no_grad():
        result_length = 75
        prompt = "~ " + prompt + " =1G->2G= "
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        beam_outputs = model.generate(inputs["input_ids"],
            max_length=result_length,
            top_k=50, top_p=0.95,
            do_sample=True, temperature=0.7, pad_token_id=50256,
            num_return_sequences=10)

        lines = []
        for beam in beam_outputs:
            text = tokenizer.decode(beam, skip_special_tokens=True)
            line = text.split(" =1G->2G= ")[1]
            line = line[:line.find(" ~")]
            if line not in lines:
                lines.append(line.strip("'").strip('"'))

        filtered_lines = [replace_n_word(line) for line in lines] #remove n-word
        # filtered_lines = [line for line in filtered_lines if has_characters(line)]

        if len(filtered_lines) == 0:
            return generate_verse()  # Recursively retry if no suitable lines found

        # Delete unnecessary variables to conserve memory
        del inputs, beam_outputs, lines

        return filtered_lines

def replace_n_word(line):
    # Define a regular expression pattern to match n-word and hyphenated cases
    pattern = r'\b\w*-?nigga\w*\b'
    # Use re.search to find the pattern in the line
    line = re.sub(pattern,"*****", line, flags=re.IGNORECASE)
    # returns line with stars if present or w no modifications if word not found
    return line

def has_characters(self,line):
# Strip removes leading and trailing whitespaces including tabs and newlines
     return False if line.strip() == "" else True


In [51]:
x = generate_verse()

...getting prediction...


In [52]:
print(x)

["iced out, wrapped around your finger ]\n[ i'd rather keep it real with ya =1G->1P= ay-m r-ih|t-ax-n ao-l ow|v-er y-uw ]\n~ yeah ", "\n[ drugs got me sweatin', but the room gettin' colder ]\n[ high-high, get-get, gettin' high, gettin' high =2G->2P= eh-m eh-m ]\n[ but a bitch ca", '\n~ y-ae =1P->2P->2P= y-ae', '\n~ w-er|k-ax-n ax-n ax w-iy|k-eh-n-d l-ay-k y-uw|zh-ax|w-ax-l ]\n~ ay k-ae-n', "\n[ jh-iy y-uw-z-d t-ao-t f-ao-r y-uw aa-n =2P->2G= just take me as i am, it's the same me ]\n[ ", '\n[ jh-ah-s-t d-r-iy-m =2P->2G= just dream ]\n[ ay jh-ah-s-t d-r-iy-m', 'ery-m, ih-n dh-ax m-ey-k ax s-l-ey-m m-iy', '\n[ m-ay g-er-l s-p-ay-t-s m-iy ih-n v-ey-n ao-l ay d-uw ih-z k-ax-m|p-l-e', '\n[ m-ay g-ey-t eh-m ah-p f-ah-k ih-t ay-m aa-n w-ah-n =2P->2G= my leather black jeans on (black) ]\n', 'ery|v-er|iy t-ay-m ay f-ao-l y-uw n-ow n-ow']


In [None]:
# remove whatever is there between equals


In [None]:
# unphonemize and reverse engineer the generated verses


In [49]:

tokenizer = transformers.AutoTokenizer.from_pretrained("dzionek/distilgpt2-rap")
input_ids = tokenizer.encode("take it easy start with a bat", return_tensors='pt')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move the input_ids tensor to the same device as the model
input_ids = input_ids.to(device)
output = model.generate(input_ids, max_length=100, num_return_sequences=5, do_sample=True, top_p=0.9)
print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


take it easy start with a bat and gun ~
[ m-ay g-er-l s-p-ay-t-s ae-t early morning m-ay ih-n v-ey-n ao-l ay d-uw =2P->2G= early in the morning, late at night (i will wait for you) ]
[ early in the morning, late at night (i will wait for you) =2G->


In [None]:

tokenizer = transformers.AutoTokenizer.from_pretrained("dzionek/distilgpt2-rap")
input_ids = tokenizer.encode("from concrete jungles to stars above, this beats the canvas, lets paint with blood", return_tensors='pt')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move the input_ids tensor to the same device as the model
input_ids = input_ids.to(device)
output = model.generate(input_ids, max_length=100, num_return_sequences=5, do_sample=True, top_p=0.9)
print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


take it easy start with a bat and gun ~
[ m-ay g-er-l s-p-ay-t-s ae-t early morning m-ay ih-n v-ey-n ao-l ay d-uw =2P->2G= early in the morning, late at night (i will wait for you) ]
[ early in the morning, late at night (i will wait for you) =2G->


In [None]:

tokenizer = transformers.AutoTokenizer.from_pretrained("dzionek/distilgpt2-rap")
input_ids = tokenizer.encode("Mom's spaghetti knees weak arms are heavy", return_tensors='pt')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move the input_ids tensor to the same device as the model
input_ids = input_ids.to(device)
output = model.generate(input_ids, max_length=100, num_return_sequences=5, do_sample=True, top_p=0.9)
print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


take it easy start with a bat and gun ~
[ m-ay g-er-l s-p-ay-t-s ae-t early morning m-ay ih-n v-ey-n ao-l ay d-uw =2P->2G= early in the morning, late at night (i will wait for you) ]
[ early in the morning, late at night (i will wait for you) =2G->


In [1]:
!pip install eng_to_ipa
import eng_to_ipa as ipa

def is_vow(character):
    '''
    Is the given (lowercase) character a vowel or not.
    '''
    #ipa_vowels = "iɪeɛæuʊoɔɑəʌ"
    ipa_vowels = "yøœɶɒɔoʊuʉiɪeɛæaɐɑʌɤɯɨɜ"
    return character in ipa_vowels

def is_space(character):
    '''
    Is the given character a space or newline (other space characters are
    cleaned in the preprocessing phase).
    '''
    return character==' ' or character=='\n'

def get_phonetic_transcription(lyrics):
    lines = lyrics.splitlines()
    phonetic_lines = [ipa.convert(line) for line in lines]
    phonetic = "\n".join(phonetic_lines)

    return phonetic

Collecting eng_to_ipa
  Downloading eng_to_ipa-0.0.2.tar.gz (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: eng_to_ipa
  Building wheel for eng_to_ipa (setup.py) ... [?25l[?25hdone
  Created wheel for eng_to_ipa: filename=eng_to_ipa-0.0.2-py3-none-any.whl size=2822607 sha256=804d9ccd67821836b60485be58be66b7eb4e0ab2f1a2752b75c6aeda03b4d4c9
  Stored in directory: /root/.cache/pip/wheels/5b/ab/07/fe6722f710d8ef8bd0ccb4eb689ef96f5552f3fc0c80c1aa9c
Successfully built eng_to_ipa
Installing collected packages: eng_to_ipa
Successfully installed eng_to_ipa-0.0.2


In [2]:
import re
import numpy as np



def syllable_similarity(bar1, bar2):
    similarity = abs(syllables.estimate(bar1) - syllables.estimate(bar2))
    return similarity

def get_syllable_count_difference(lyrics):
    lines = lyrics.splitlines()
    i = 0
    similarity = 0
    while i < len(lines)-1:
        similarity += syllable_similarity(lines[i], lines[i+1])
        i += 2
    return similarity/len(lines)

def get_rhyme_density(lyrics, lookback=15):
    bars = Lyrics(lyrics, lookback)
    return bars.avg_rhyme_length

def get_longest_rhyme(lyrics, lookback=15):
    bars = Lyrics(lyrics, lookback)
    return bars.get_longest_rhyme()[0]

def get_unique_words(lyrics):
    words = lyrics.split()
    unique_words = set(words)
    return len(unique_words)/len(words)

def print_lyrics_stats(lyrics, lookback=15, artist=None, title=None):
    bars = Lyrics(lyrics, lookback, artist, title)
    print(bars.title)
    print('------------------------------------------')
    print("Average rhyme length: %.3f\n" % bars.avg_rhyme_length)
    bars.print_rhyme(bars.longest_rhyme)
    print("Average syllable count difference: %.3f\n" % get_syllable_count_difference(lyrics))
    print("Percentage unique words: %.3f\n" % get_unique_words(lyrics))




class Lyrics:
    '''
    This class is used to store and preprocess rap lyrics and calculate
    statistics like average rhyme length out of the lyrics.
    '''

    def __init__(self, text=None, lookback=15, artist=None, title=None):
        '''
        Lyrics can be read from the file (default) or passed directly
        to this constructor.
        '''
        self.text_raw = None
        # How many previous words are checked for a rhyme.
        self.lookback = lookback
        self.text_raw = text
        if artist == None or title == None:
            self.title = "Generated Song"
        else:
            self.title = title + " by " + artist

        if self.text_raw is not None:
            cleaning_ok = self.clean_text(self.text_raw)
            self.compute_vowel_representation()
            self.avg_rhyme_length, self.longest_rhyme = self.rhyme_stats()

    def clean_text(self, text):
        '''
        Preprocess text by removing unwanted characters and duplicate rows.
        '''

        self.text = text
        # If there are more than 2 consecutive newlines, remove some of them
        # (just to make the cleaned text look prettier)
        self.text = re.sub('\n\n+', '\n\n', self.text)
        # Remove duplicate rows
        self.lines = self.text.split('\n')

        uniq_lines = set()
        new_text = ''
        for l in self.lines:
            l = l.strip()
            if len(l) > 0 and l in uniq_lines:
                continue
            # Remove lines that are within brackets/parenthesis
            if len(l) >= 2 and ((l[0]=='[' and l[-1]==']') or (l[0]=='(' and l[-1]==')')):
                continue
            uniq_lines.add(l)
            new_text += l + '\n'

        self.text = new_text

    def compute_vowel_representation(self):
        '''
        Compute a representation of the lyrics where only vowels are preserved.
        '''
        self.vow = [] # Lyrics with all but vowels removed
        self.vow_idxs = [] # Indices of the vowels in self.text list
        self.word_ends = [] # Indices of the last characters of each word
        self.words = [] # List of words in the lyrics
        self.line_idxs = []

        self.text_orig = self.text
        self.text = get_phonetic_transcription(self.text)
        self.word_ends_orig = []
        self.words_orig = []

        prev_space_idx = -1 # Index of the previous space char
        line_idx = 0 # Line index of the current character
        # Go through the lyrics char by char
        for i in range(len(self.text)):
            self.line_idxs.append(line_idx)
            c = self.text[i]
            #c = ph.map_vow(c)
            if is_vow(c):
                # Ignore double vowels
                # (in English this applies probably only to 'aa' as in 'bath'
                # which rhymes with 'trap' that has only 'a')
                if i > 0 and self.text[i-1] == c:
                    # Index of a double vowel points to the latter occurrence
                    self.vow_idxs[-1] = i
                    continue
                # TODO Diftongs should not be split (i.e. "price" should
                # not rhyme with "trap kit"). This has been fixed in BattleBot
                self.vow.append(c)
                self.vow_idxs.append(i)
            elif is_space(c):
                if c in '\n':
                    line_idx += 1
                elif c in '.!?' and i < len(self.text)-1 and self.text[i+1] != '\n':
                    line_idx += 1
                # If previous char was not a space, we've encountered word end
                if len(self.vow) > 0 and not is_space(self.text[i-1]):
                    # Put together the new word. Potential consonants in the
                    # end are ignored
                    new_word = self.text[prev_space_idx+1:self.vow_idxs[-1]+1]
                    # Check that the new word contains at least one vowel
                    no_vowels = True
                    for c2 in new_word:
                        if is_vow(c2):
                            no_vowels = False
                            break
                    if no_vowels:
                        prev_space_idx = i
                        continue
                    self.word_ends.append(len(self.vow)-1)
                    self.words.append(new_word)
                prev_space_idx = i

        self.lines_orig = self.text_orig.split('\n')

    def rhyme_length(self, wpos2):
        '''
        Length of rhyme (in vowels). The latter part of the rhyme ends with
        word self.words[wpos2].

        Input:
            wpos2       Word index of the end of the rhyme.
        '''
        max_length = 0
        max_wpos1 = None
        wpos1 = max(0,wpos2-self.lookback)
        while wpos1 < wpos2:
            rl = self.rhyme_length_fixed(wpos1, wpos2)
            if rl > max_length:
                max_length = rl
                max_wpos1 = wpos1
            wpos1 += 1
        return max_length, max_wpos1

    def rhyme_length_fixed(self, wpos1, wpos2):
        '''
        Length of rhyme (in vowels). The first part of the rhyme ends with
        self.words[wpos1] and the latter part with word self.words[wpos2].

        Input:
            wpos1       Word index of the last word in the first part of the rhyme.
            wpos2       Word index of the end of the rhyme.
        '''
        if wpos1 < 0: # Don't wrap
            return 0
        elif self.words[wpos1] == self.words[wpos2]:
            return 0
        # Indices in the vowel list
        p1 = self.word_ends[wpos1]
        p2 = self.word_ends[wpos2]
        l = 0
        while self.vow[p1-l] == self.vow[p2-l]:
            # Make sure that exactly same words are not used
            if wpos1 > 0 and p1-l <= self.word_ends[wpos1-1] and wpos2 > 0 and p2-l <= self.word_ends[wpos2-1]:
                # Get the first and last character indices of the words surrounding the vowels at p1-l and p2-l
                prev_s1 = self.vow_idxs[p1-l]
                while prev_s1 > 0 and not is_space(self.text[prev_s1-1]):
                    prev_s1 -= 1
                prev_s2 = self.vow_idxs[p2-l]
                while prev_s2 > 0 and not is_space(self.text[prev_s2-1]):
                    prev_s2 -= 1
                next_s1 = self.vow_idxs[p1-l]
                while next_s1 < len(self.text)-1 and not is_space(self.text[next_s1+1]):
                    next_s1 += 1
                next_s2 = self.vow_idxs[p2-l]
                while next_s2 < len(self.text)-1 and not is_space(self.text[next_s2+1]):
                    next_s2 += 1
                if next_s1-prev_s1 == next_s2-prev_s2 and self.text[prev_s1:next_s1+1] ==  self.text[prev_s2:next_s2+1]:
                    break

            l += 1
            if p1-l < 0 or p2-l <= p1:
                break
        # Ignore rhymes with length 1
        if l == 1:
            l = 0
        return l

    def rhyme_stats(self):
        '''
        Compute the average rhyme length of the song and the longest rhyme.

        Output:
            Average rhyme length (float)
            Longest rhyme which is a 3-tuple with:
                (length, word index of the first part of the rhyme,
                         word index of the latter part of the rhyme)
        '''
        # Rhyme length of each word
        rls = []
        # Keep track of the longest rhyme
        max_rhyme = (0,None,None)
        for wpos2 in range(1,len(self.word_ends)):
            (rl, wpos1) = self.rhyme_length(wpos2)
            rls.append(rl)
            if rl > max_rhyme[0]:
                max_rhyme = (rl, wpos1, wpos2)
        rls = np.array(rls)
        # Average rhyme length of the song
        if len(rls) > 0:
            avg_rl = np.mean(rls)
        else:
            avg_rl = 0
        return avg_rl, max_rhyme

    def get_avg_rhyme_length(self):
        return self.avg_rhyme_length

    def print_song_stats(self):
        print('------------------------------------------')
        print("Avg rhyme length: %.3f\n" % self.avg_rhyme_length)

        self.print_rhyme(self.longest_rhyme)


    def print_rhyme(self, rhyme_tuple):
        print(self.get_rhyme_str(rhyme_tuple))

    def get_rhyme_str(self, rhyme_tuple):
        '''
        Construct a string of a given rhyme tuple.
        '''
        ret = ''
        rl, wpos1, wpos2 = rhyme_tuple
        if wpos1 is None or wpos2 is None:
            return ''
        p2 = self.vow_idxs[self.word_ends[wpos2]]
        p2_orig = p2
        # Find the ending of the last word
        while not is_space(self.text[p2]):
            p2 += 1
        p0 = self.vow_idxs[self.word_ends[wpos1]-rl]
        p0_orig = p0
        # Find the beginning of the line
        while self.text[p0] != '\n' and p0 > 0:
            p0 -= 1

        cap_line = ''
        rw1, rw2 = self.get_rhyming_vowels(rhyme_tuple)
        for i in range(p0,p2+1):
            if i == min(rw1) or i == min(rw2):
                cap_line += ' | ' + self.text[i]
            elif i == max(rw1) or i == max(rw2):
                cap_line += self.text[i] + '|'
            else:
                cap_line += self.text[i]
        ret += "Longest rhyme (l=%d): %s\n" % (rl, cap_line)
        # Get the corresponding lines from the original lyrics
        line_beg = self.line_idxs[p0]
        line_end = self.line_idxs[p2]
        for i in range(line_beg, line_end+1):
            if i < len(self.lines_orig):
                ret += self.lines_orig[i] + '\n'
        return ret

    def get_longest_rhyme(self):
        rhyme_str = self.get_rhyme_str(self.longest_rhyme)
        return self.longest_rhyme[0], rhyme_str

    def get_rhyming_vowels(self, rhyme_tuple):
        '''
        Return the indices of the rhyming vowels of the longest rhyme.

        Output:
            Tuple with the indices of the first part and the second part of
            the rhyme separately.
        '''
        rl, wpos1, wpos2 = rhyme_tuple
        if wpos1 is None or wpos2 is None:
            return ([-1],[-1])

        # The first part of the rhyme
        rhyme_idxs1 = [] # Indices of the rhyming vowels
        n_caps = 0
        p = self.vow_idxs[self.word_ends[wpos1]]
        while n_caps < rl:
            if is_vow(self.text[p]):
                rhyme_idxs1.append(p)
                # Increase the counter only if the vowel is not a double vowel
                if self.text[p] != self.text[p+1]:
                    n_caps += 1
            p -= 1

        # The second part of the rhyme
        rhyme_idxs2 = [] # Indices of the rhyming vowels
        n_caps = 0
        p = self.vow_idxs[self.word_ends[wpos2]]
        p_last = p
        while n_caps < rl:
            if is_vow(self.text[p]):
                rhyme_idxs2.append(p)
                # Increase the counter only if the vowel is not a double vowel.
                # The last vowel must be always counted.
                if p == p_last or self.text[p] != self.text[p+1]:
                    n_caps += 1
            p -= 1

        return (rhyme_idxs1, rhyme_idxs2)

In [3]:

import numpy as np
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import bigrams, ngrams, everygrams
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.lm import MLE, KneserNeyInterpolated, Lidstone, Laplace, AbsoluteDiscountingInterpolated


In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
final_verse = "Your finger, wrapped around, out iced. I'd rather keep it real with you, all over me now. Yeah. Colder getting room, but sweatin' me got drugs. High-high, get-get, getting high, getting high. A bitch can. Yeah, yeah. Yeah, yeah. You like walking and working. I know. I'm taking that to heart for you, just as I am, just the same me. Just dream. I'm dreaming, in that make me slim, me. My girl spits me in vain all day in this complex. My leather black jeans got 'em on, every time I fall you now, now."
result_final = re.sub(r'[ ]+\n[ ]+', r'\n', ''.join(final_verse))
print(result_final)


Your finger, wrapped around, out iced. I'd rather keep it real with you, all over me now. Yeah. Colder getting room, but sweatin' me got drugs. High-high, get-get, getting high, getting high. A bitch can. Yeah, yeah. Yeah, yeah. You like walking and working. I know. I'm taking that to heart for you, just as I am, just the same me. Just dream. I'm dreaming, in that make me slim, me. My girl spits me in vain all day in this complex. My leather black jeans got 'em on, every time I fall you now, now.


In [6]:
def get_rhyme_density(lyrics, lookback=15):
    bars = Lyrics(lyrics, lookback)
    return bars.avg_rhyme_length
results_rd = np.array(get_rhyme_density(result_final))

In [7]:
round(np.mean(results_rd),2)

0.99

In [8]:
results_lr = np.array(get_longest_rhyme(result_final))

In [9]:
round(np.mean(results_lr),2)

6.0

In [10]:
results_uw = np.array(get_unique_words(result_final))

In [11]:
print(round(np.mean(results_uw),2))

0.79
