In [1]:
import collections

import numpy as np


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [2]:
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [3]:
sys.path.append('C:/Users/lov/Documents/New folder (3)/')
import project_tests as tests

In [4]:
filename = 'C:/Users/lov/Documents/nlp_trans/new.xlsx'
import pandas as pd
df=pd.read_excel(filename)
df=df.dropna()
df=df.reset_index()
english_sentences=df['English Translated (write translation with Punctuation)']
chn_sentences=df['Mandarin']

In [6]:
import jieba.posseg as pseg
for i in range(0,chn_sentences.shape[0]):
    words = pseg.cut(chn_sentences[i])
    w1=''
    for w in words:
        w1=w1+' '+w.word
    chn_sentences[i]=w1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chn_sentences[i]=w1


In [7]:
for sample_i in range(5):
    print('English sample {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('French sample {}:  {}\n'.format(sample_i + 1, chn_sentences[sample_i]))

English sample 1:  hello
French sample 1:     喂

English sample 2:  hello, hello what is your name?
French sample 2:     喂   ,       喂   你   叫   什么   名

English sample 3:  ah, where do you live ya?
French sample 3:     啊   ,       住   在   哪里   呀

English sample 4:  I ah,I, I, I live on the campus currently
French sample 4:     (   (   那   我   )   )       啊   ,       我   ,       我           我   就   住   在   学校   现在

English sample 5:  campus ya, campus?
French sample 5:     学校   呀   ,       在   什么   学校



In [8]:
chn_words_counter = collections.Counter([word for sentence in chn_sentences for word in sentence.split()])
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])

print('{} English words.'.format(len([word for sentence in chn_sentences for word in sentence.split()])))
print('{} unique chn words.'.format(len(chn_words_counter)))
print('10 Most common words in the chn dataset:')
print('"' + '" "'.join(list(zip(*chn_words_counter.most_common(10)))[0]) + '"')
print()
print('{} english words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique english words.'.format(len(english_words_counter)))
print('10 Most common words in the english dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')

36377 English words.
3795 unique chn words.
10 Most common words in the chn dataset:
"," "的" "-" "我" "你" "啊" "是" "嗯" "了" "%"

34979 english words.
4618 unique english words.
10 Most common words in the english dataset:
"the" "you" "I" "is" "to" "a" "that" "of" "it" "in"


In [9]:
def tokenize(x):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    # TODO: Implement
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer

tests.test_tokenize(tokenize)

# Tokenize Example output
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


In [10]:
def pad(x, length=None):
    """
    Pad x
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    # TODO: Implement
    return pad_sequences(x, maxlen=length, padding='post')

tests.test_pad(pad)

# Pad Tokenized output
test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


In [12]:
def preprocess(x, y):
    """
    Preprocess x and y
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_chn_sentences, preproc_english_sentences, chn_tokenizer, english_tokenizer =\
    preprocess(chn_sentences, english_sentences)
    
max_chn_sequence_length = preproc_chn_sentences.shape[1]
max_english_sequence_length = preproc_english_sentences.shape[1]
chn_vocab_size = len(chn_tokenizer.word_index)
english_vocab_size = len(english_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max chn sentence length:", max_chn_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("chn vocabulary size:", chn_vocab_size)

Data Preprocessed
Max English sentence length: 42
Max chn sentence length: 37
English vocabulary size: 3213
chn vocabulary size: 3781


In [13]:
def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


In [14]:
def model1(input_shape, output_sequence_length, chn_vocab_size, english_vocab_size):

    # TODO: Implement

    # Hyperparameters
    learning_rate = 0.005
    
    # TODO: Build the layers
    model = Sequential()
    model.add(Embedding(chn_vocab_size, 128, input_length=input_shape[1], input_shape=input_shape[1:]))
    model.add(GRU(256, return_sequences=True))    
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(english_vocab_size, activation='softmax'))) 

    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

In [15]:
def model2(input_shape, output_sequence_length, chn_vocab_size, english_vocab_size):
    """
    Build and train an encoder-decoder model on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    # OPTIONAL: Implement
    
    # Hyperparameters
    learning_rate = 0.001
    
    # Build the layers    
    model = Sequential()
    # Encoder
    model.add(GRU(256, input_shape=input_shape[1:], go_backwards=True))
    model.add(RepeatVector(output_sequence_length))
    # Decoder
    model.add(GRU(256, return_sequences=True))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(english_vocab_size, activation='softmax'))) 

    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model

In [16]:
def model3(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a model that incorporates embedding, encoder-decoder, and bidirectional RNN on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    # TODO: Implement

    # Hyperparameters
    learning_rate = 0.003
    
    # Build the layers    
    model = Sequential()
    # Embedding
    model.add(Embedding(english_vocab_size, 128, input_length=input_shape[1],
                         input_shape=input_shape[1:]))
    # Encoder
    model.add(Bidirectional(GRU(128)))
    model.add(RepeatVector(output_sequence_length))
    # Decoder
    model.add(Bidirectional(GRU(128, return_sequences=True)))
    model.add(TimeDistributed(Dense(512, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

In [19]:
tmp_x = pad(preproc_chn_sentences, preproc_english_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[-2]))

# TODO: Train the neural network
model_1 = model1(
    tmp_x.shape,
    preproc_english_sentences.shape[1],
    len(chn_tokenizer.word_index)+1,
    len(english_tokenizer.word_index)+1)

tmp_x = pad(preproc_chn_sentences, preproc_english_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[-2], 1))
model_2 = model2(
    tmp_x.shape,
    preproc_english_sentences.shape[1],
    len(chn_tokenizer.word_index)+1,
    len(english_tokenizer.word_index)+1)

tmp_x = pad(preproc_chn_sentences, preproc_english_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[-2]))
model_3 = model3(
    tmp_x.shape,
    preproc_english_sentences.shape[1],
    len(chn_tokenizer.word_index)+1,
    len(english_tokenizer.word_index)+1)

model_1.summary()
model_2.summary()
model_3.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 42, 128)           484096    
_________________________________________________________________
gru (GRU)                    (None, 42, 256)           296448    
_________________________________________________________________
time_distributed (TimeDistri (None, 42, 1024)          263168    
_________________________________________________________________
dropout (Dropout)            (None, 42, 1024)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 42, 3214)          3294350   
Total params: 4,338,062
Trainable params: 4,338,062
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
Layer (type)            

In [20]:
model_1.fit(tmp_x, preproc_english_sentences, batch_size=100, epochs=10, validation_split=0.2)
print(logits_to_text(model_1.predict(tmp_x[:1])[0], english_tokenizer))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
hello <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [22]:
preproc_english_sentences

array([[[  80],
        [   0],
        [   0],
        ...,
        [   0],
        [   0],
        [   0]],

       [[  80],
        [  80],
        [  24],
        ...,
        [   0],
        [   0],
        [   0]],

       [[   5],
        [ 149],
        [  25],
        ...,
        [   0],
        [   0],
        [   0]],

       ...,

       [[  13],
        [1186],
        [   3],
        ...,
        [   0],
        [   0],
        [   0]],

       [[1299],
        [   5],
        [   3],
        ...,
        [   0],
        [   0],
        [   0]],

       [[  10],
        [  23],
        [   0],
        ...,
        [   0],
        [   0],
        [   0]]])

In [19]:
model_3.fit(tmp_x, preproc_english_sentences, batch_size=100, epochs=10, validation_split=0.2)
print(logits_to_text(model_3.predict(tmp_x[:1])[0], english_tokenizer))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
hello <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [96]:
tmp_x = pad(preproc_chn_sentences, preproc_english_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[-2], 1))
model_2.fit(tmp_x, preproc_english_sentences, batch_size=100, epochs=10, validation_split=0.2)
print(logits_to_text(model_2.predict(tmp_x[:1])[0], english_tokenizer))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
oh <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [21]:
def chrgen(noc,an,chn_vocab_size):
    ga=np.random.randint(an,size=(noc,1))
    ga1=np.zeros((noc,np.shape(tmp_x[0,:])[0]))
    for i in range(0,noc):
        a=np.random.randint(np.shape(tmp_x[0,:])[0],size=(ga[i][0],1))
        for j in range(0,ga[i][0]):
            ga1[i,j]=np.random.randint(english_vocab_size)
    return ga1        

In [22]:
ga1=chrgen(50,np.shape(tmp_x[0,:])[0],chn_vocab_size)

In [55]:
def predict_word(model,ga):
    wn=''
    for i in range(0,np.shape(model.predict(ga))[0]):
        if logits_to_text(model.predict(ga)[i], english_tokenizer)=='<PAD>':
            wn=wn
        else:  
            wn=wn+' '+logits_to_text(model.predict(ga)[i], english_tokenizer)
    unique_words = dict.fromkeys(wn.split()) 
    wn=' '.join(unique_words)
    return wn        