<a href="https://colab.research.google.com/github/EmilisGit/Deep_learning/blob/main/lab2_poetry.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install markovify -q
!pip install num2words -q
!pip install pronouncing -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for markovify (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.5/163.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pronouncing (setup.py) ... [?25l[?25hdone


In [3]:
import os, re, random
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import keras
import markovify
import kagglehub
from num2words import num2words
import pronouncing
from keras.models import load_model

In [4]:
keras.utils.set_random_seed(812)
random.seed(812)

## 1. Duomenų atsiuntimas ir apžiūra

In [5]:
datapath = kagglehub.dataset_download("paultimothymooney/poetry")
data_files = os.listdir(datapath)
print(f'Downloaded {len(data_files)} files:', data_files)

Downloading from https://www.kaggle.com/api/v1/datasets/download/paultimothymooney/poetry?dataset_version_number=16...


100%|██████████| 2.00M/2.00M [00:00<00:00, 103MB/s]

Extracting files...
Downloaded 49 files: ['bob-dylan.txt', 'kanye-west.txt', 'beatles.txt', 'lil-wayne.txt', 'notorious_big.txt', 'nirvana.txt', 'eminem.txt', 'michael-jackson.txt', 'alicia-keys.txt', 'nicki-minaj.txt', 'blink-182.txt', 'disney.txt', 'rihanna.txt', 'r-kelly.txt', 'leonard-cohen.txt', 'nursery_rhymes.txt', 'notorious-big.txt', 'bob-marley.txt', 'missy-elliott.txt', 'lin-manuel-miranda.txt', 'dolly-parton.txt', 'cake.txt', 'kanye.txt', 'bruno-mars.txt', 'amy-winehouse.txt', 'dickinson.txt', 'bieber.txt', 'janisjoplin.txt', 'prince.txt', 'bjork.txt', 'britney-spears.txt', 'dr-seuss.txt', 'adele.txt', 'Lil_Wayne.txt', 'lorde.txt', 'bruce-springsteen.txt', 'joni-mitchell.txt', 'jimi-hendrix.txt', 'paul-simon.txt', 'nickelback.txt', 'Kanye_West.txt', 'dj-khaled.txt', 'drake.txt', 'radiohead.txt', 'lady-gaga.txt', 'al-green.txt', 'johnny-cash.txt', 'ludacris.txt', 'patti-smith.txt']





In [29]:
#@title Funkcijos
def normalize_word_line(line: str) -> list[str]:
  row = [x.lower() for x in re.findall(r"\w+'?\w*", line)]
  new_row = []
  for word in row:
    numbers = re.findall(r"\d+", word)
    for n in numbers:
      word = word.replace(n, num2words(int(n)))
    new_row.append(word)
  return new_row

In [7]:
#@title Ritmo išgavimas
def n_syllables(word_line: list[str]) -> int:
    vowels = 'aeiouy'
    syllable_count = 0

    for word in word_line:
        for i, char in enumerate(word):
            if char in vowels:
                if (i == 0) or (word[i-1] not in vowels):
                    syllable_count += 1
        word_vowels_count = sum([x in vowels for x in word])
        if word_vowels_count == 0:  # abreviaturos
            syllable_count = len(word)  # pvz. "NLP" skaitosi kaip "en-el-pi"
        elif word_vowels_count > 1 and (word[-1] == 'e') and (word[-2] not in vowels):  # paskutinė "e" dažnai nesakoma anglų k., bet "ie", "ee" ištariama
            syllable_count -= 1

    return syllable_count

def get_rhyme(line: list[str]) -> str:
    last_word = re.sub('\W+', '', line[-1])
    all_rhymes = pronouncing.rhymes(last_word)
    if all_rhymes:
        rhyming_ends = [x[-2:] for x in all_rhymes]
        most_common_rhyme = max(set(rhyming_ends), key=rhyming_ends.count)
    else:
        most_common_rhyme = last_word[-2:]
    return most_common_rhyme


def get_rhyme_list(normalized_lyrics: list[list[str]]):
  rhyme_set = set()
  for row in normalized_lyrics:
    most_common_rhyme = get_rhyme(row)
    rhyme_set.add(most_common_rhyme)

  sorted_rhyme_set = sorted(list(rhyme_set), key=lambda x: x[-1])
  return sorted_rhyme_set

In [8]:
print("n_syllables output: ", n_syllables(["good day everyone"]))
print("get_rhyme output: ", get_rhyme(["good day everyone"]))
print(get_rhyme_list([["that's the strat"], ["make it fast"]]))

n_syllables output:  5
get_rhyme output:  ne
['st', 'at']


## 4. Duomenų rinkinio paruošimas

Modelis negalės tiesiogiai operuoti skiemenimis, todėl turime naudoti skaičius:

In [9]:
#@title Duomenų rinkinio paruošimas
def get_rhyme_float(line: list[str], rhyme_list: list[str]) -> float | None:
  rhyme = get_rhyme(line)
  if rhyme in rhyme_list:
    return rhyme_list.index(rhyme) / len(rhyme_list)
  else:
    return None

def length_test(sentence):
    max_words = 8
    return len(sentence.split()) <= max_words


def get_random_lines(markov_model, n_rows: int) -> list[list[str]]:
  lines = []
  last_words = []

  while len(lines) < n_rows:
    line = markov_model.make_short_sentence(max_chars=40)
    # nenorime gauti tuščios eilutės ar jau turimos eilutės
    if (line is not None) and (line not in lines):
      last_word = normalize_word_line(line)[-1]
      # nenorime kad dažnai pasikartotų tas pats žodis eilutės gale
      if last_words.count(last_word) < 3:
        lines.append(normalize_word_line(line))
        last_words.append(last_word)

  return lines


def get_line_features(line: list[str], rhyme_list: list[str]) -> tuple:
  return (line, n_syllables(line), get_rhyme_float(line, rhyme_list))

def build_dataset(lines: list[list[str]], rhyme_list: list[str]):
	features = [get_line_features(x, rhyme_list) for x in lines]
	x_data, y_data = [], []

  # turėsime standartinę struktūrą kai eilutės rimuojasi po 4 grupėje
	# pirmos dvi eilutės bus pradinės savybės, antros dvi eilutės - prognozuojamos
	for i in range(len(features) - 3):
		# duomenyse liks tik eilučių savybes, todėl visur [1:]
		line1, line2 = features[i    ][1:], features[i + 1][1:]
		line3, line4 = features[i + 2][1:], features[i + 3][1:]
		x_data.append(np.array([line1, line2]))
		y_data.append(np.array([line3, line4]))
	return np.array(x_data), np.array(y_data)

## 5. RNN modelio inicializavimas

Mūsų modelis turės prognozuoti naujų 4 eilučių savybes gaunant senas 4 eilutes.

In [11]:
def create_lstm(depth: int):

  keras.backend.clear_session()  # pašaliname tarpinių modelių likučius
  keras.utils.set_random_seed(812)

  model = keras.Sequential(name='LSTM-based_lyrics_generator')
  model.add(keras.layers.Input((2, 2)))
  model.add(keras.layers.LSTM(16, return_sequences=True))
  for i in range(depth):
    model.add(keras.layers.LSTM(16, return_sequences=True))
  model.add(keras.layers.LSTM(2, return_sequences=True))

  model.compile(
      optimizer=keras.optimizers.RMSprop(learning_rate=0.001),
      loss='mse')

  return model

model = create_lstm(depth=2)
model.summary()

In [12]:
#@title Dviejų modelių sistema
def compose(starting_input: np.ndarray, rnn_model, n_line_groups: int):
	final_vectors = []
	starting_vectors = rnn_model.predict(starting_input).flatten().reshape(1, 2, 2)
	final_vectors.append(starting_vectors)
	for i in range(n_line_groups):
		prev_vectors = final_vectors[-1]
		final_vectors.append(rnn_model.predict(prev_vectors).flatten().reshape(1, 2, 2))
	return final_vectors


def last_word_compare(prev_lines: list[list[str]], new_line: list[str], penalty: float = 0.2) -> float:
	sum_penalty = 0.0
	for line in prev_lines:
		if line[-1] == new_line[-1]:
			sum_penalty += penalty
	return sum_penalty


def calculate_score(features, n_syllables, rhyme, penalty: float, rhyme_list, maxsyllables):
	desired_n_syllables = features[0] * maxsyllables
	desired_rhyme = features[1] * len(rhyme_list)
	syllable_score = - abs(float(desired_n_syllables) - float(n_syllables))
	rhyme_score = 2.0 * abs(float(desired_rhyme) - float(rhyme))
	score = 1.0 + syllable_score + rhyme_score - penalty
	return score


def vectors_into_song(vectors, generated_lyrics, rhyme_list, maxsyllables: int):
	song = []
	generated_features = [get_line_features(x, rhyme_list) for x in generated_lyrics]

	vector_halves = []
	for vector in vectors:
		vector_halves.extend(vector[0].tolist())

	for vector in vector_halves:
		scorelist = []

		for (line, n_syllables, rhyme) in generated_features:
			if len(song) != 0:
				penalty = last_word_compare(song, line)
			else:
				penalty = 0

			total_score = calculate_score(vector, n_syllables, rhyme, penalty, rhyme_list, maxsyllables)
			scorelist.append([line, total_score])

		# randame eilutę su aukščiausiu įvertinimu
		best_line_index = np.argmax([float(x[1]) for x in scorelist])
		best_line = scorelist[best_line_index][0]
		song.append(best_line)

		# pašaliname šią eilutę iš likusių eilučių sąrašo
		generated_features = [x for x in generated_features if x[0] != best_line]

	return [' '.join(x) for x in song]

In [23]:
import joblib

class LyricsGenerator:
    def __init__(self, lstm_model, markov_model, lyrics, maxsyllables=12):
        self.lstm_model = lstm_model
        self.markov_model = markov_model
        self.rhyme_list = self.get_rhyme_list(lyrics)
        self.maxsyllables = maxsyllables
        self.x_data = None
        self.y_data = None

    def n_syllables(self, word_line: list[str]) -> int:
        vowels = 'aeiouy'
        syllable_count = 0

        for word in word_line:
            for i, char in enumerate(word):
                if char in vowels:
                    if (i == 0) or (word[i-1] not in vowels):
                        syllable_count += 1
            word_vowels_count = sum([x in vowels for x in word])
            if word_vowels_count == 0:  # abreviaturos
                syllable_count = len(word)  # pvz. "NLP" skaitosi kaip "en-el-pi"
            elif word_vowels_count > 1 and (word[-1] == 'e') and (word[-2] not in vowels):  # paskutinė "e" dažnai nesakoma anglų k., bet "ie", "ee" ištariama
                syllable_count -= 1

        return syllable_count


    def get_rhyme_float(self, line: list[str]) -> float | None:
      rhyme = self.get_rhyme(line)
      if rhyme in self.rhyme_list:
        return self.rhyme_list.index(rhyme) / len(self.rhyme_list)
      else:
        return None

    def length_test(self, sentence):
        max_words = 8
        return len(sentence.split()) <= max_words


    def get_line_features(self, line: list[str]) -> tuple:
      return (line, self.n_syllables(line), self.get_rhyme_float(line))

    def build_dataset(self, lines: list[list[str]]):
      features = [self.get_line_features(x) for x in lines]
      x_data, y_data = [], []

      # turėsime standartinę struktūrą kai eilutės rimuojasi po 4 grupėje
      # pirmos dvi eilutės bus pradinės savybės, antros dvi eilutės - prognozuojamos
      for i in range(len(features) - 3):
        # duomenyse liks tik eilučių savybes, todėl visur [1:]
        line1, line2 = features[i    ][1:], features[i + 1][1:]
        line3, line4 = features[i + 2][1:], features[i + 3][1:]
        x_data.append(np.array([line1, line2]))
        y_data.append(np.array([line3, line4]))
      self.x_data = np.array(x_data)
      self.y_data = np.array(y_data)

    def get_rhyme(self, line: list[str]) -> str:
      last_word = re.sub('\W+', '', line[-1])
      all_rhymes = pronouncing.rhymes(last_word)
      if all_rhymes:
          rhyming_ends = [x[-2:] for x in all_rhymes]
          most_common_rhyme = max(set(rhyming_ends), key=rhyming_ends.count)
      else:
          most_common_rhyme = last_word[-2:]
      return most_common_rhyme

    def normalize_word_line(self, line: str) -> list[str]:
      row = [x.lower() for x in re.findall(r"\w+'?\w*", line)]
      new_row = []
      for word in row:
        numbers = re.findall(r"\d+", word)
        for n in numbers:
          word = word.replace(n, num2words(int(n)))
        new_row.append(word)
      return new_row

    def get_rhyme_list(self, normalized_lyrics: list[list[str]]):
      rhyme_set = set()
      for row in normalized_lyrics:
        most_common_rhyme = get_rhyme(row)
        rhyme_set.add(most_common_rhyme)

      sorted_rhyme_set = sorted(list(rhyme_set), key=lambda x: x[-1])
      return sorted_rhyme_set

    def compose(self, starting_input, n_line_groups):
        final_vectors = []
        starting_vectors = self.lstm_model.predict(starting_input).flatten().reshape(1, 2, 2)
        final_vectors.append(starting_vectors)
        for i in range(n_line_groups):
            prev_vectors = final_vectors[-1]
            final_vectors.append(self.lstm_model.predict(prev_vectors).flatten().reshape(1, 2, 2))
        return final_vectors

    def last_word_compare(self, prev_lines, new_line, penalty=0.2):
        sum_penalty = 0.0
        for line in prev_lines:
            if line[-1] == new_line[-1]:
                sum_penalty += penalty
        return sum_penalty

    def calculate_score(self, features, n_syllables, rhyme, penalty):
        desired_n_syllables = features[0] * self.maxsyllables
        desired_rhyme = features[1] * len(self.rhyme_list)
        syllable_score = -abs(desired_n_syllables - n_syllables)
        rhyme_score = 2.0 * abs(desired_rhyme - rhyme)
        return 1.0 + syllable_score + rhyme_score - penalty

    def vectors_into_song(self, vectors, generated_lyrics):
        song = []
        generated_features = [get_line_features(x, self.rhyme_list) for x in generated_lyrics]
        vector_halves = []
        for vector in vectors:
            vector_halves.extend(vector[0].tolist())
        for vector in vector_halves:
            scorelist = []
            for (line, n_syllables, rhyme) in generated_features:
                penalty = self.last_word_compare(song, line) if song else 0
                total_score = self.calculate_score(vector, n_syllables, rhyme, penalty)
                scorelist.append([line, total_score])
            best_line_index = np.argmax([float(x[1]) for x in scorelist])
            best_line = scorelist[best_line_index][0]
            song.append(best_line)
            generated_features = [x for x in generated_features if x[0] != best_line]
        return [' '.join(x) for x in song]

    def get_random_lines(self, markov_model, n_rows: int) -> list[list[str]]:
      lines = []
      last_words = []

      while len(lines) < n_rows:
        line = markov_model.make_short_sentence(max_chars=40)
        # nenorime gauti tuščios eilutės ar jau turimos eilutės
        if (line is not None) and (line not in lines):
          last_word = self.normalize_word_line(line)[-1]
          # nenorime kad dažnai pasikartotų tas pats žodis eilutės gale
          if last_words.count(last_word) < 3:
            lines.append(self.normalize_word_line(line))
            last_words.append(last_word)

      return lines

    def generate_song(self, start, num_lines=200, n_line_groups=4):
        vectors = self.compose(start, n_line_groups)
        some_lyrics = self.get_random_lines(self.markov_model, num_lines)
        return self.vectors_into_song(vectors, some_lyrics)


Sudedame viską į vientą vietą:

In [25]:
from google.colab import drive
drive.mount('/content/drive')

model = load_model('/content/drive/MyDrive/Colab/NLP_ND/LSTM-based_lyrics_generator.keras')

artist_files = ['nursery_rhymes.txt', 'drake.txt']
raw_lyrics = ""
for file in artist_files:
  with open(os.path.join(datapath, file), 'r') as f:
      contents = f.read()
      raw_lyrics += contents + "\n"

markov_model = markovify.NewlineText(raw_lyrics)

lyrics = [normalize_word_line(x) for x in raw_lyrics.splitlines()]
lyrics = [x for x in lyrics if x]
generator = LyricsGenerator(model, markov_model, lyrics)
generator.generate_song()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
x_data, y_data = build_dataset(lyrics, generator.rhyme_list)
start_i = np.random.choice(range(len(x_data)))
start = np.array([x_data[start_i]])
start

array([[[8.        , 0.87684729],
        [8.        , 0.4679803 ]]])

In [16]:
artist_files = ['nursery_rhymes.txt', 'drake.txt']
raw_lyrics = ""
for file in artist_files:
  with open(os.path.join(datapath, file), 'r') as f:
      contents = f.read()
      raw_lyrics += contents + "\n"

markov_model = markovify.NewlineText(raw_lyrics)

lyrics = [normalize_word_line(x) for x in raw_lyrics.splitlines()]
lyrics = [x for x in lyrics if x]
rhymes = get_rhyme_list(lyrics)

x_data, y_data = build_dataset(lyrics, rhymes)

model = create_lstm(depth=4)
print(model.summary())

model.fit(
    x_data, y_data,
    batch_size=16,
    epochs=10
)

from google.colab import drive
drive.mount('/content/drive')

model.save('/content/drive/MyDrive/Colab/LSTM-based_lyrics_generator.keras')


start_i = np.random.choice(range(len(x_data)))
start = np.array([x_data[start_i]])
vectors = compose(start, model, 4)
some_lyrics = get_random_lines(markov_model, 2000)
vectors_into_song(vectors, some_lyrics, rhymes, maxsyllables=8)

None
Epoch 1/10
[1m483/483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - loss: 47.6122
Epoch 2/10
[1m483/483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 44.1683
Epoch 3/10
[1m483/483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 44.1670
Epoch 4/10
[1m483/483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 44.1669
Epoch 5/10
[1m483/483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - loss: 44.1669
Epoch 6/10
[1m483/483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 44.1669
Epoch 7/10
[1m483/483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 44.1669
Epoch 8/10
[1m483/483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 44.1669
Epoch 9/10
[1m483/483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 44.1669
Epoch 10/10
[1m483/483[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0

['instead of the judah',
 'you know what it was in the mia',
 'put her in the mafia',
 'the hare she loves the high wood',
 'felt like the titanic',
 'i try not to give to the hood',
 'couple days on the music',
 'the things you say you love so bad',
 "and i ain't asthmatic",
 'this is more than i ever had']