In [1]:
import numpy
import sys
import torch
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

# Seed for the CUDA random number generators. The original cell referenced
# SEED without ever defining it.
SEED = 42

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Compare on `.type`: a torch.device does not compare equal to the plain
# string 'cuda', so the original `device == 'cuda'` branch never executed.
# NOTE(review): the model trained below is Keras; confirm these torch settings
# are actually needed in this notebook.
if device.type == 'cuda':
    torch.backends.cudnn.benchmark = False
    torch.cuda.manual_seed_all(SEED)
print(device)

cpu


In [10]:
# Toggle between a local checkout (True) and the Kaggle input directory (False).
local = True
lyrics_path = "data/positive_pop.txt" if local else "../input/lyrics-with-sentiment/positive_pop.txt"

# Read the whole corpus into memory. The context manager closes the handle,
# and the explicit encoding makes both locations decode the text the same way.
with open(lyrics_path, encoding='utf8') as lyrics_file:
    file = lyrics_file.read()

In [11]:
import string

# Whitelist of characters a token may contain: the lowercase ASCII letters,
# apostrophe, comma, period, space, '!', '?' and the marker character 'Ж'.
result = string.ascii_lowercase + '\',. !?Ж' # characters we want to keep

def check_chars(token):
    """Return True when every character of `token` is in the `result` whitelist.

    An empty token yields True because there is no character to reject.
    """
    for ch in token:
        if ch not in result:
            return False
    return True

# Quick sanity checks: a plain word and a lone apostrophe are both accepted.
print(check_chars('hello'))
print(check_chars('\''))
        

True
True


In [12]:
def tokenize_words(text):
    """Lowercase `text`, mark line breaks, tokenize, and keep whitelisted tokens.

    Each newline character is replaced by the standalone marker ' Ж ' before
    tokenizing. Tokens rejected by `check_chars` are dropped; the remaining
    tokens are returned joined by single spaces.
    """
    normalized = text.lower().replace('\n', ' Ж ')

    word_tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    kept_tokens = []
    for token in word_tokenizer.tokenize(normalized):
        if check_chars(token):
            kept_tokens.append(token)

    return " ".join(kept_tokens)

In [13]:
# Clean and tokenize the raw lyrics text loaded earlier.
processed_inputs = tokenize_words(text=file)
# Size of the cleaned corpus, in characters.
print(len(processed_inputs))

19167836


In [14]:
# Keep only the first 2,500,000 characters so experiments run on a smaller corpus.
max_chars = 2500000
processed_inputs = processed_inputs[:max_chars]

In [17]:
# Character vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(processed_inputs))
# Lookup table from each character to its integer index in `chars`.
char_to_num = {ch: idx for idx, ch in enumerate(chars)}

In [18]:
# Corpus length in characters and number of distinct characters.
input_len, vocab_len = len(processed_inputs), len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 2500000
Total vocab: 33


In [19]:
seq_length = 30 # length of each input window, in characters
# Containers for the encoded input windows and for the character that follows each one.
x_data, y_data = [], []

In [20]:
# Build the training pairs: every window of `seq_length` consecutive characters
# is an input, and the character right after the window is its target.
#
# The lists are rebuilt from scratch here instead of being appended to, so
# re-running this cell no longer duplicates every pattern.

# Encode the corpus once; the windows below are slices of this list.
encoded_inputs = [char_to_num[char] for char in processed_inputs]

# One window per start position i in [0, input_len - seq_length).
x_data = [encoded_inputs[i:i + seq_length] for i in range(0, input_len - seq_length, 1)]

# Target for window i is the character at position i + seq_length.
y_data = encoded_inputs[seq_length:input_len]

In [21]:
# Number of (window, next character) training pairs.
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 2499970


In [22]:
# Put the windows into the right format: reshape to (n_patterns, seq_length, 1)
# and scale the integer indices by the vocabulary size.
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [23]:
# One-hot encode the targets. The class count is pinned to the vocabulary size
# so the label width (and the model's output layer, which reads y.shape[1])
# does not depend on the largest index happening to appear in y_data.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [24]:
# Create the LSTM model: three stacked LSTM layers, each followed by dropout
# of 0.2, ending in a softmax layer with one unit per column of y.
lstm_stack = [
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
]

model = Sequential()
for layer in lstm_stack:
    model.add(layer)

In [25]:
# Configure training: categorical cross-entropy loss with the Adam optimizer.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [26]:
# Save checkpoints: overwrite the weights file only when the monitored
# training loss reaches a new minimum.
filepath = "model_weights_saved_pos.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [None]:
# Train the model for 30 epochs in batches of 256, checkpointing through the callbacks.
model.fit(X, y, batch_size=256, epochs=30, callbacks=desired_callbacks)

In [27]:
# Restore the saved weights into the model, then compile it again with the
# same loss and optimizer used for training.
filename = "model_weights_saved_pos.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [28]:
# Inverse lookup: integer index -> character.
num_to_char = dict(enumerate(chars))

In [29]:
# Pick a random training window to use as a seed. numpy.random.randint
# excludes its upper bound, so len(x_data) is passed to keep the last window
# selectable (len(x_data) - 1 made it unreachable).
start = numpy.random.randint(0, len(x_data))
# Copy the window so that in-place edits of `pattern` cannot alter x_data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
"  wish you kissed like him Ж yo "


In [31]:
transkeys = num_to_char.keys()
# Invert num_to_char: map each character back to its integer index.
rev_dict = {num_to_char[idx]: idx for idx in transkeys}

def text_to_char(text):
    """Translate `text` into the list of character indices given by `rev_dict`.

    A character missing from `rev_dict` raises KeyError.
    """
    indices = []
    for char in text:
        indices.append(rev_dict[char])
    return indices

In [32]:
# Look up the index lists of the line-break marker 'Ж' and of the apostrophe.
# Each value is a one-element list because text_to_char returns a list.
newl_index, app_ind = text_to_char('Ж'), text_to_char('\'')
print(newl_index,app_ind)

[32] [2]


In [42]:
import tensorflow as tf
def top_k_sampling(conditional_probability, target_words, k):
    """
    Sample one token index from the k most probable entries of a distribution.

    conditional_probability: 1-D sequence of non-negative scores, one per token index.
    target_words: mapping (or sequence) from token index to token.
    k: how many of the highest-scoring entries to sample from; values larger
       than the number of entries are clamped.

    Returns (sampled_index, sampled_token), where sampled_index is a numpy
    integer and sampled_token is target_words[sampled_index].

    Raises ValueError when the distribution is empty or k is smaller than 1.
    """
    probabilities = numpy.asarray(conditional_probability, dtype="float64").ravel()
    if probabilities.size == 0:
        raise ValueError("conditional_probability must not be empty")
    if k < 1:
        raise ValueError("k must be at least 1")
    k = min(probabilities.size, int(k))

    # Indices of the k largest entries, highest first; the stable sort keeps
    # the lower index first among equal scores.
    top_k_indices = numpy.argsort(-probabilities, kind="stable")[:k].astype("int32")
    top_k_probabilities = probabilities[top_k_indices]

    # Renormalise the kept mass so it sums to one. Dividing by the total gives
    # the same distribution as softmax(log(p)) without taking log(0).
    total = top_k_probabilities.sum()
    if total > 0:
        top_k_redistributed_probability = top_k_probabilities / total
    else:
        # All kept scores are zero: fall back to a uniform choice among them.
        top_k_redistributed_probability = numpy.full(k, 1.0 / k)

    sampled_index = numpy.random.choice(top_k_indices, p=top_k_redistributed_probability)
    sampled_token = target_words[sampled_index]
    return sampled_index, sampled_token

In [43]:
def generate_text(pattern, translate_dict, token_count: int, k = 5):
    """
    Generate text one character at a time with top-k sampling and stream it to stdout.

    pattern: sequence of character indices used as the seed. It is copied, so
        the caller's sequence is never modified.
    translate_dict: mapping from character index to character; used both to
        echo the seed and to decode sampled indices.
    token_count: number of characters to generate.
    k: number of most probable characters to sample from at each step.

    Relies on the notebook globals `model`, `vocab_len` and `seq_length`.
    Returns None; the seed and every generated character are written to sys.stdout.
    Raises ValueError when `pattern` is empty.
    """
    # Work on a private copy: the original appended to the caller's list.
    window = list(pattern)
    if not window:
        raise ValueError("pattern must contain at least one character index")

    sys.stdout.write(''.join(translate_dict[c] for c in window))
    for _ in range(token_count):

        # Same scaling as the training data: indices divided by the vocabulary size.
        x = numpy.reshape(window, (1, len(window), 1))
        x = x / float(vocab_len)
        prediction = model.predict(x, verbose=0).flatten()

        index, token = top_k_sampling(prediction, translate_dict, k)
        # 'Ж' is the marker substituted for line breaks during preprocessing.
        # Checking the decoded token avoids comparing a scalar index with the
        # list stored in `newl_index`.
        if token == 'Ж':
            token = '\n'
        sys.stdout.write(token)

        # Slide the context window, keeping at most `seq_length` indices
        # (the window length used for training) instead of a hard-coded 30.
        window.append(int(index))
        window = window[-seq_length:]


In [44]:
# Example: seed the generator with 'i want' and produce 500 more characters.
generate_text(text_to_char('i want'), num_to_char, token_count=500)

te 
 it 's all around my songs on the sun 
 it 's the reap to turn to the world to give 
 you know , i won 't like it 
 the same , tender 
 will i like it in a stace 
 to tell your loving 
 i still want to say you stopd my hamo 
 i can 't get it all around 
 it 's too more than what you said 
 the street , i 'm not coming , without the backes of mine , 
 so sexy sourd in this way , 
 where 's the story in the ride 
 to stop , save the story that you start 
 i will be start arant 
 and i could g