In [1]:
import tensorflow as tf, numpy as np
import random
import pandas as pd
import nltk

## Restore the data in pickle

In [2]:
import pickle
with open('simple2_save.pickle', 'rb') as f:
    save = pickle.load(f)
    adj_pos = save['adj_pos'][:100]
    adj_neg = save['adj_neg'][:100]
    adverb_no_change = save['adverb_no_change']
    adverb_change = save['adverb_change']
    adverb_neg = save['adverb_neg']
    adverb_pos = save['adverb_pos'] 
    dico_embedding = save['dico_embedding'] 
adjectives = [adj_neg, adj_pos]
adverbs = [adverb_no_change, adverb_change, adverb_neg, adverb_pos]

#### Data parsing and cleaning

In [122]:
all_positive = 'positive-words.txt'
with open(all_positive, 'r') as f:
    all_positive = f.read().splitlines()
print('pos contient %d mots' % len(all_positive))
all_positive = [p for p in all_positive if len(p) and not p.startswith(';')]
print('pos contient %d mots' % len(all_positive))

pos contient 2041 mots
pos contient 2006 mots


#### Restriction to the strict adjectives

In [123]:
pos = [p for p in all_positive if nltk.pos_tag([p])[0][1] in ['JJ']] # Pos for Part Of Speech
print('pos contient %d mots' % len(pos))

pos contient 365 mots


#### Same with negative words

In [124]:
all_negative = 'negative-words.txt'
with open(all_negative, 'r') as f:
    all_negative = []
    for _ in range(4000):
        try:
            all_negative.append(f.readline())
        except UnicodeDecodeError:
            pass
        
print('neg contient %d mots' % len(all_negative))
all_negative = [n.strip('\n') for n in all_negative if not n.startswith(';')]
all_negative = [n for n in all_negative if len(n)]
print('neg contient %d mots' % len(all_negative))

neg contient 3999 mots
neg contient 3927 mots


In [125]:
neg = [n for n in all_negative if nltk.pos_tag([n])[0][1] == 'JJ' and 
                          not n.endswith('ly') and '-' not in n]
print('neg contient %d mots' % len(neg))

neg contient 460 mots


In [126]:
adjectives = [neg, pos]

### Ajout de mots qui meublent, qui changent le sens de la phrase ou non

In [127]:
adverb_change = ['never', 'not', 'rarely', 'hardly', 'barely']
adverb_no_change = ['always', 'sometimes', 'just', 'often', 'usually']
adverb_neg = ['rudely', 'unfortunately', 'arrogantly', 'cruelly', 'deceitfully']
adverb_pos = ['beautifully', 'awesomely', 'happily', 'brightly', 'elegantly']
adverbs = [adverb_no_change, adverb_change, adverb_neg, adverb_pos]

### Gather all the words, get them in a dense representation

In [143]:
words_used = set(['this', 'is'] + neg + pos + adverb_neg + adverb_pos + adverb_change
                 + adverb_no_change)
dico_embedding = {}

In [144]:
with open('../glove.6B.50d.txt', 'r') as f:
    for i, line in enumerate(f):
        word, *vector = line.split()
        if word in words_used:
            dico_embedding[word] = vector
    'len(dico_embedding) = %d' % len(dico_embedding)
    'len(words_used) = %d' % len(words_used)
    'The words %s are not in the corpus' % (words_used - set(dico_embedding.keys()))

to_get_rid_of = words_used - set(dico_embedding.keys())
dico_embedding['This'] = dico_embedding['this']

'len(dico_embedding) = 804'

'len(words_used) = 804'

'The words set() are not in the corpus'

In [145]:
data_names = ['pos', 'neg', 'adverb_pos', 'adverb_neg', 'adverb_change', 'adverb_no_change']
for dn in data_names:
    data = eval(dn)
    print('Longueur de %s AVANT élimination des mots rares : %d' % (dn, len(data)))
    data = list(set(data) - to_get_rid_of)
    print('Longueur de %s APRES : %d' % (dn, len(data)))

Longueur de pos AVANT élimination des mots rares : 343
Longueur de pos APRES : 343
Longueur de neg AVANT élimination des mots rares : 440
Longueur de neg APRES : 440
Longueur de adverb_pos AVANT élimination des mots rares : 5
Longueur de adverb_pos APRES : 5
Longueur de adverb_neg AVANT élimination des mots rares : 5
Longueur de adverb_neg APRES : 5
Longueur de adverb_change AVANT élimination des mots rares : 5
Longueur de adverb_change APRES : 5
Longueur de adverb_no_change AVANT élimination des mots rares : 5
Longueur de adverb_no_change APRES : 5


## Save the data in pickle

In [146]:
import pickle
with open('simple2_save.pickle', 'wb') as f:
    pickle.dump({'adj_pos': pos, 
                 'adj_neg': neg,
                 'adverb_no_change': adverb_no_change, 
                 'adverb_change': adverb_change,
                 'adverb_neg': adverb_neg,
                 'adverb_pos': adverb_pos, 
                 'dico_embedding': dico_embedding}, 
                f)

## Restore the data in pickle

In [129]:
import pickle
with open('simple2_save.pickle', 'rb') as f:
    save = pickle.load(f)
    adj_pos = save['adj_pos']
    adj_neg = save['adj_neg']
    adverb_no_change = save['adverb_no_change']
    adverb_change = save['adverb_change']
    adverb_neg = save['adverb_neg']
    adverb_pos = save['adverb_pos'] 
    dico_embedding = save['dico_embedding'] 
adjectives = [adj_neg, adj_pos]
adverbs = [adverb_no_change, adverb_change, adverb_neg, adverb_pos]

In [148]:
for data_name in save.keys():
    print("Longueur de %s : %d" % (data_name, len(eval(data_name))))

Longueur de adverb_no_change : 5
Longueur de dico_embedding : 805
Longueur de adj_pos : 100
Longueur de adverb_neg : 5
Longueur de adj_neg : 100
Longueur de adverb_pos : 5
Longueur de adverb_change : 5


### Routines pour générer les données 

In [11]:
class Example_Generator(object):
    
    def __init__(self, adjectives, adverbs):
        self.adverbs = adverbs
        n = min(len(adjs) for adjs in adjectives)
        permut = random.sample(range(n), 5*n//6)
        self.training_adjectives = [[adjs[i] for i in permut] for adjs in adjectives]
        self.validation_adjectives = [[adjs[i] for i in range(n) if i not in permut] 
                                 for adjs in adjectives]
        self.adjectives = adjectives        
        
    def generate_example(self):
        adv_ind = random.randint(0,3)
        adv = random.choice(self.adverbs[adv_ind])
        if adv_ind in [0, 2]: # on peut choisir parmi tous les adjectifs
            adj_ind = random.randint(0,1)
            adj = random.choice(self.adjectives[adj_ind])
        else: # adv_ind in [1, 3], on ne choisit que parmi les adjectifs de training
            adj_ind = random.randint(0,1)
            adj = random.choice(self.training_adjectives[adj_ind])
         
        sentence = ['This', 'is', adv, adj]
        label = int(adv_ind == 0 and adj_ind or # on NE change PAS le sentiment de l'adjectif
                    adv_ind == 1 and not adj_ind or # on change le sentiment de l'adjectif
                    adv_ind == 2 and 0 or # l'adverbe impose le sentiment négatif
                    adv_ind == 3) # l'adverbe impose le sentiment positif 
        return sentence, label
    
    def generate_validation(self):
        for i, adjs in enumerate(self.validation_adjectives):
            for adj in adjs:
                for adv in self.adverbs[1]:
                    label = int(not i)
                    yield ['This', 'is', adv, adj], label
                for adv in self.adverbs[3]:
                    label = 1
                    yield ['This', 'is', adv, adj], label

### Generate the batch

In [12]:
def generate_batch(example_generator, batch_size, size_embedding=50):
    x = np.empty([4, batch_size, size_embedding])
    y = np.empty([1, batch_size, 1])
    
    for j in range(batch_size):
        sentence_split, label = example_generator.generate_example()
        
        for i, w in enumerate(sentence_split):
            x[i,j,:] = dico_embedding[w]
            
        y[0,j,0] = label
    return x, y

def generate_validation_set(example_generator, size_embedding=50, validation_size=None):
    max_size = (2*len(example_generator.validation_adjectives[0])*
                2*len(adverbs[0]))
    if not validation_size or validation_size > max_size: 
        validation_adjectives = max_size
    x = np.empty([4, validation_size, size_embedding])
    y = np.empty([1, validation_size, 1])
    
    gen = example_generator.generate_validation()
    for j in range(validation_size):
        sentence_split, label = next(gen)
        
        for i, w in enumerate(sentence_split):
            x[i,j,:] = dico_embedding[w]
            
        y[0,j,0] = label
    return x, y

# Graph building and training

In [13]:
INPUT_SIZE = 50
OUTPUT_SIZE = 1
RNN_HIDDEN = 100
LEARNING_RATE = 0.01

DROPOUT = 0

TINY = 1e-7


NUM_EPOCHS = 500
ITERATONS_PER_EPOCH = 20
BATCH_SIZE = 300

In [14]:
#####################################################################
#############             Graph Definition             ##############
#####################################################################


with tf.Graph().as_default() as graph:
    # Definition of the inputs and outputs
    inputs = tf.placeholder(tf.float32, (None, None, INPUT_SIZE))
    labels = tf.placeholder(tf.float32, (None, None, OUTPUT_SIZE))
    
    cell = tf.contrib.rnn.BasicLSTMCell(RNN_HIDDEN)
    
    # maybe add dropout
    if DROPOUT:
        cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=0.5, output_keep_prob=0.8)
    
    # Definition of the initial state
    batch_size = tf.shape(inputs)[1]
    initial_state = cell.zero_state(batch_size, tf.float32)

    # Computation of the outputs and states
    with tf.variable_scope('lstm_weights'):
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, inputs, 
                                                    initial_state=initial_state, 
                                                    time_major=True)
        ## First by taking only the last step
        final_outputs = tf.slice(rnn_outputs, begin=(3,0,0), size=(1,-1,RNN_HIDDEN))
        ## Then by averaging on all states
        final_outputs = tf.reduce_mean(rnn_outputs, axis=0, keep_dims=True)
        _ = tf.summary.histogram('hidden_state', rnn_states)
        

    # Projection of the outputs
    final_projection = lambda x: tf.contrib.layers.linear(x, num_outputs=OUTPUT_SIZE, 
                                                          activation_fn=tf.nn.sigmoid)
    # Application of final projection to the outputs
    logits = tf.map_fn(final_projection, final_outputs)

    # Loss
    loss = -(labels*tf.log(logits + TINY) + (1.0 - labels)*tf.log(1.0 - logits + TINY))
    loss = tf.reduce_mean(loss)

    # train_optimizer
    train_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(loss)

    # For validation purpose
    accuracy = tf.reduce_mean(tf.cast(abs(logits - labels) < 0.5, tf.float32))
    
    # Summaries
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter('/home/louis/python/notebooks/.tensorflow_logs_dir/', 
                                   graph=graph)

In [15]:
###########################################################################
########                         Training Loop                     ########
###########################################################################

eg = Example_Generator(adjectives, adverbs)
valid_x, valid_y = generate_validation_set(eg, validation_size=300)

with tf.Session(graph=graph) as session:
    session.run(tf.global_variables_initializer())
    for i in range(NUM_EPOCHS):
        epoch_loss = 0
        for j in range(ITERATONS_PER_EPOCH):
            x, y = generate_batch(eg, batch_size=BATCH_SIZE)
            epoch_loss, _, summaries = session.run([loss, train_optimizer, merged], 
                                                   feed_dict={inputs:x, 
                                                              labels:y})
            # Summaries
            ind = i*ITERATONS_PER_EPOCH + j
            if ind%10 == 0:
                writer.add_summary(summaries, ind)
        epoch_loss /= ITERATONS_PER_EPOCH
        valid_accuracy = session.run(accuracy, 
                                     feed_dict={inputs:valid_x, labels:valid_y})
        print('Iteration : %d, Epoch Loss = %.8f' % (i, epoch_loss))
        print('Accuracy = %.1f' % (valid_accuracy*100.))
        if valid_accuracy == 1 or i == 30: break

Iteration : 0, Epoch Loss = 0.02386198
Accuracy = 64.0
Iteration : 1, Epoch Loss = 0.01795882
Accuracy = 63.7
Iteration : 2, Epoch Loss = 0.01659119
Accuracy = 76.0
Iteration : 3, Epoch Loss = 0.01122537
Accuracy = 84.7
Iteration : 4, Epoch Loss = 0.00750723
Accuracy = 95.0
Iteration : 5, Epoch Loss = 0.00611988
Accuracy = 95.0
Iteration : 6, Epoch Loss = 0.00372319
Accuracy = 96.7
Iteration : 7, Epoch Loss = 0.00362951
Accuracy = 96.7
Iteration : 8, Epoch Loss = 0.00270680
Accuracy = 96.7
Iteration : 9, Epoch Loss = 0.00281181
Accuracy = 96.7
Iteration : 10, Epoch Loss = 0.00196158
Accuracy = 96.7
Iteration : 11, Epoch Loss = 0.00155939
Accuracy = 96.7
Iteration : 12, Epoch Loss = 0.00118558
Accuracy = 97.0
Iteration : 13, Epoch Loss = 0.00157578
Accuracy = 96.7
Iteration : 14, Epoch Loss = 0.00076268
Accuracy = 96.7


KeyboardInterrupt: 