In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.layers import GRU
from tensorflow.keras.optimizers import Adam
import numpy as np
import random
import sys
import multiprocessing
import requests

In [3]:
import urllib.request
import os

In [7]:
# download the source file
url = "https://raw.githubusercontent.com/torvalds/linux/master/kernel/sched/core.c"
filename = "core.c"
if not os.path.exists(filename):
    urllib.request.urlretrieve(url, filename)

In [9]:
text = requests.get(url).text.lower()
print("corpus length:", len(text))

corpus length: 296004


In [10]:
def preprocess(text):
    chars = set(text)
    print('total chars:', len(chars))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    
    # cut the text in semi-redundant sequences of maxlen characters
    step = 3
    sentences = []
    next_chars = []
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_chars.append(text[i + maxlen])
    print("nb sequences:", len(sentences))
    
    print("Vectorization...")
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1
        
    return X, y, char_indices, indices_char, len(chars)

In [11]:
def get_model(distinct_chars):
    # build the model: 2 stacked GRU
    print("Build model...")
    xi = Input((maxlen, distinct_chars))
    x = GRU(256, return_sequences=True)(xi)
    x = Dropout(0.2)(x)
    x = GRU(256, return_sequences=False)(x)
    x = Dropout(0.2)(x)
    x = Dense(distinct_chars)(x)
    x = Activation("softmax")(x)

    model = Model(inputs=xi, outputs=x)

    model.summary()

    adam = Adam(0.0003)

    model.compile(loss="categorical_crossentropy", optimizer=adam)
    
    return model

In [12]:
def sample(a, temperature=1.0):
    # helper function to sample an index from a probability array
    a = (np.log(a + 1e-8) / temperature).astype(np.float64)
    a = np.exp(a) / np.sum(np.exp(a))
    try:
        sample_result = np.argmax(np.random.multinomial(1, a, 1))
    except ValueError:
        error = 1.0 - np.sum(a)
        a[0] += error
        sample_result = np.argmax(np.random.multinomial(1, a, 1))
    return sample_result

In [13]:
def generate_html(model, X, y, char_indices, indices_char, distinct_chars):
    # train the model, output generated text after each iteration
#     for iteration in range(1, 2):
    for iteration in range(1, 20):
        print()
        print("-" * 50)
        print("Iteration", iteration)

        model.fit(X, y, batch_size=64, epochs=4,
                  workers=(multiprocessing.cpu_count() - 1), use_multiprocessing=True)
        model.save_weights("html_weights.hdf5")

        start_index = random.randint(0, len(text) - maxlen - 1)

        for diversity in [0.2, 0.5, 1.0, 1.2]:
            print()
            print("----- diversity:", diversity)

            generated = ""
            sentence = text[start_index: start_index + maxlen] # Pick a random sentence
            generated += sentence

            print("----- Generating with seed: '" + sentence + "'")
            sys.stdout.write(generated)

            for _ in range(200):
                x = np.zeros((1, maxlen, distinct_chars))
                for t, char in enumerate(sentence):
                    x[0, t, char_indices[char]] = 1.

                # predict next char
                preds = model.predict(x, verbose=0,
                                      workers=(multiprocessing.cpu_count() - 1),
                                      use_multiprocessing=True)[0]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]

                # full sentence being generated
                generated += next_char

                # shift sentence
                sentence = sentence[1:] + next_char

                sys.stdout.write(next_char)
                sys.stdout.flush()

            print()

In [15]:
# cut the text into sequences of a fixed length
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])

print("number of sequences:", len(sentences))

number of sequences: 98655


In [16]:
X, y, char_indices, indices_char, distinct_chars = preprocess(text)
model = get_model(distinct_chars)

total chars: 70
nb sequences: 98655
Vectorization...


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.zeros((len(sentences), len(chars)), dtype=np.bool)


Build model...
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 40, 70)]          0         
                                                                 
 gru (GRU)                   (None, 40, 256)           251904    
                                                                 
 dropout (Dropout)           (None, 40, 256)           0         
                                                                 
 gru_1 (GRU)                 (None, 256)               394752    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense (Dense)               (None, 70)                17990     
                                                                 
 activation (Activation)     (None, 70)       

In [None]:
generate_html(model, X, y, char_indices, indices_char, distinct_chars)


--------------------------------------------------
Iteration 1
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

----- diversity: 0.2
----- Generating with seed: 'hz_full */
static inline void sched_tick'
hz_full */
static inline void sched_tick_resched_core_task() {
																																																																																																																																																																																	

----- diversity: 0.5
----- Generating with seed: 'hz_full */
static inline void sched_tick'
hz_full */
static inline void sched_tick(struct task_group(struct rq *rq);
	return ret;
	}

	return 0;

	if (struct rq *rq, nutt *task_rq_lock() {
						                                                                                       

----- diversity: 1.0
----- Generating with seed: 'hz_full */
static inline void sched_tick'
hz_full */
static inline void sched_tick_inlocily_abcow#en()
		 * mab} gisers = ketnpemend tise_sel _statt
	.
 */
	return an_fr(sched_b