# Deep Learning for NLP - Project

RULES:

* Do not create any additional cell

* Fill in the blanks

* All cells should be runnable (modulo trivial compatibility bugs that we'd fix)

* 4 / 20 points will be allocated to the clarity of your code

* Efficient code will have a bonus

DELIVERABLE:

* the pdf with your answers
* this notebook
* the predictions of the SST test set

DO NOT INCLUDE THE DATASETS IN THE DELIVERABLE..

In [2]:
# Python 3.6 or above is required
from collections import defaultdict
import gzip
import numpy as np
from pathlib import Path
import urllib
import tqdm
import time
import sklearn

In [3]:
PATH_TO_DATA = Path('data/')
# Download word vectors, might take a few minutes and about ~3GB of storage space
en_embeddings_path = PATH_TO_DATA / 'cc.en.300.vec.gz'
if not en_embeddings_path.exists():
    urllib.request.urlretrieve('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz', en_embeddings_path)
fr_embeddings_path = PATH_TO_DATA / 'cc.fr.300.vec.gz'
if not fr_embeddings_path.exists():
    urllib.request.urlretrieve('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz', fr_embeddings_path)

# 1) Monolingual (English) word embeddings 

In [4]:
class Word2Vec():

    def __init__(self, filepath, vocab_size=50000):
        self.words, self.embeddings = self.load_wordvec(filepath, vocab_size)
        # Mappings for O(1) retrieval:
        self.word2id = {word: idx for idx, word in enumerate(self.words)}
        self.id2word = {idx: word for idx, word in enumerate(self.words)}
    
    def load_wordvec(self, filepath, vocab_size):
        assert str(filepath).endswith('.gz')
        words = []
        embeddings = []
        with gzip.open(filepath, 'rt') as f:  # Read compressed file directly
            next(f)  # Skip header
            for i, line in enumerate(f):
                word, vec = line.split(' ', 1)
                words.append(word)
                embeddings.append(np.fromstring(vec, sep=' '))
                if i == (vocab_size - 1):
                    break
        print('Loaded %s pretrained word vectors' % (len(words)))
        return words, np.vstack(embeddings)
    
    def encode(self, word):
        # Returns the 1D embedding of a given word
        word_id = self.word2id[word]
        return self.embeddings[word_id]
    
    def score(self, word1, word2):
        # Return the cosine similarity: use np.dot & np.linalg.norm
        word1_embedding = self.encode(word1)
        word2_embedding = self.encode(word2)
        cosine_dist = 1 - np.dot(word1_embedding,word2_embedding)/(np.linalg.norm(word1_embedding)*np.linalg.norm(word2_embedding))
        return cosine_dist
    
    def most_similar(self, word, k=5):
        # Returns the k most similar words: self.score & np.argsort
        distances = np.array([self.score(word, other_word) for other_word in self.words])
        distances_int = np.argsort(distances)
        neighbors = []
        for rank in range(k+1):
            neighbor_id = distances_int[rank]
            neighbor_wrd = self.id2word[neighbor_id]
            if (not neighbor_wrd==word) and (len(neighbors)!=k):
                # the word itself isn't considered a neighbor
                # a situation might occur in which k neighbors embeddings are colinear to the word embedding
                # thus, the word itself might not appear at first in the top k neighbors
                neighbors.append(neighbor_wrd)
        return neighbors

In [5]:
word2vec = Word2Vec(en_embeddings_path, vocab_size=50000)

# You will be evaluated on the output of the following:
for word1, word2 in zip(('cat', 'cat', 'cat', 'Paris', 'Paris', 'Paris', 'Paris'), ('tree', 'dog', 'pet', 'France', 'Germany', 'baguette', 'donut')):
    print(word1, word2, word2vec.score(word1, word2))
for word in ['cat', 'dog', 'dogs', 'Paris', 'Germany']:
    print(word2vec.most_similar(word))

Loaded 50000 pretrained word vectors
cat tree 0.7355024533834524
cat dog 0.2921358701457436
cat pet 0.32466866400236194
Paris France 0.31070410741934595
Paris Germany 0.5948757713262451
Paris baguette 0.7060004172219778
Paris donut 1.006588507552348
['cats', 'kitty', 'kitten', 'feline', 'dog']
['dogs', 'puppy', 'pup', 'canine', 'pet']
['dog', 'cats', 'puppies', 'Dogs', 'pets']
['France', 'Parisian', 'Marseille', 'Brussels', 'Strasbourg']
['Austria', 'Europe', 'Berlin', 'Hamburg', 'Bavaria']


In [69]:
class BagOfWords():
    
    def __init__(self, word2vec):
        self.word2vec = word2vec
    
    def build_idf(self, sentences):
        # build the idf dictionary: associate each word to its idf value
        # -> idf = {word: idf_value, ...}
        sentences_nb = len(sentences)
        idf = {word:0 for word in self.word2vec.words}
        for stc in sentences:
            stc_wds = stc.split(' ')
            stc_wds.remove('')
            for word in np.unique(np.array(stc_wds)):
                try:
                    idf[word]+=1
                except:
                    pass # because this means that the word isn't in the known words
        for word in self.word2vec.words:
            try:
                idf[word] = np.log(sentences_nb / idf[word]) if idf[word] > 0 else np.inf
            except:
                pass # because this means that the word isn't in the known words
            
        return idf
    
    def encode(self, sentence, idf=None):
        # Takes a sentence as input, returns the sentence embedding
        sentence = sentence.split(' ')
        try:
            sentence.remove('')
        except:
            pass
        embed_sentence = []
        for word in sentence: # Here I could save the known encodings in a dictionnary
            try:
                embed_sentence.append(self.word2vec.encode(word))
            except:
                pass # because this means that the word isn't in the known words
        embed_sentence = np.array(embed_sentence)
        if idf is None:
            # mean of word vectors
            return np.mean(embed_sentence, axis=0)
        else:
            # idf-weighted mean of word vectors
            sentence_idf = []
            for word in sentence:
                try:
                    sentence_idf.append(idf[word])
                except:
                    pass # because this means that the word isn't in the known words
            sentence_idf = np.array(sentence_idf).reshape(-1,1)
            return np.sum(np.multiply(embed_sentence,sentence_idf),axis=0)/np.sum(sentence_idf)

    def score(self, sentence1, sentence2, idf=None, encoded=False):
        # cosine similarity: use np.dot & np.linalg.norm 
        if not encoded:
            sentence1_embedding = self.encode(sentence1, idf)
            sentence2_embedding = self.encode(sentence2, idf)
        else:
            sentence1_embedding, sentence2_embedding = sentence1, sentence2
        cosine_dist = 1 - np.dot(sentence1_embedding,sentence2_embedding)/(np.linalg.norm(sentence1_embedding)*np.linalg.norm(sentence2_embedding))
        return cosine_dist
    
    def most_similar(self, sentence, sentences, idf=None, k=5):
        # Return most similar sentences
        query = self.encode(sentence, idf)
        keys = np.vstack([self.encode(stc, idf) for stc in sentences])
        similarities = [self.score(query, other_stc, idf, encoded=True) for other_stc in tqdm.tqdm(keys)]
        similarities_int = np.argsort(similarities)
        neighbors = []
        for rank in range(k+1):
            neighbor_id = similarities_int[rank]
            neighbor_stc = sentences[neighbor_id]
            if (not neighbor_stc==sentence) and (len(neighbors)!=k):
                # the sentence itself isn't considered a neighbor
                # a situation might occur in which k neighbors embeddings are colinear to the sentence embedding
                # thus, the sentence itself might not appear at first in the top k neighbors
                neighbors.append(neighbor_stc)
        return neighbors

In [20]:
word2vec = Word2Vec(en_embeddings_path, vocab_size=50000)
sentence2vec = BagOfWords(word2vec)

# Load sentences in "PATH_TO_DATA/sentences.txt"
filepath = PATH_TO_DATA / 'sentences.txt'
with open(filepath, 'r') as f:
    sentences = [line.strip('\n') for line in f]

# You will be evaluated on the output of the following:
print('\n\tAverage of word embeddings')
sentence1 = sentences[7]
sentence2 = sentences[13]
print(sentence1)
print(sentence2)
print(sentence2vec.score(sentence1, sentence2))
sentence = sentences[10]
similar_sentences = sentence2vec.most_similar(sentence, sentences)  # BagOfWords-mean
print(sentence)
for i, sentence_nghbr in enumerate(similar_sentences):
    print(str(i+1) + ')', sentence_nghbr)

# Build idf scores for each word
idf = sentence2vec.build_idf(sentences)

print('\n\tidf weighted average of word embeddings')
print(sentence1)
print(sentence2)
print(sentence2vec.score(sentence1, sentence2, idf))
similar_sentences = sentence2vec.most_similar(sentence, sentences, idf)  # BagOfWords-idf
print(sentence)
for i, sentence_nghbr in enumerate(similar_sentences):
    print(str(i+1) + ')', sentence_nghbr)

Loaded 50000 pretrained word vectors

	Average of word embeddings
1 man singing and 1 man playing a saxophone in a concert . 
10 people venture out to go crosscountry skiing . 
0.29347793517485266


100%|██████████| 150736/150736 [00:05<00:00, 26256.21it/s]


1 smiling african american boy . 
1) 2 woman dancing while pointing .  0.11918872091755883
2) 5 women and 1 man are smiling for the camera .  0.13189610776220573
3) a small boy following 4 geese .  0.14198493500551967
4) 2 female babies eating chips .  0.14372575224275697
5) a young boy and 2 girls open christmas presents .  0.14384643872782843

	idf weighted average of word embeddings
1 man singing and 1 man playing a saxophone in a concert . 
10 people venture out to go crosscountry skiing . 
0.35992000606536156


100%|██████████| 150736/150736 [00:05<00:00, 27123.59it/s]

1 smiling african american boy . 
1) 1 man singing and 1 man playing a saxophone in a concert .  0.06042886498533295
2) two women and 1 man walking across the street .  0.07802236018566411
3) 3 males and 1 woman enjoying a sporting event  0.08281333893571019
4) 5 women and 1 man are smiling for the camera .  0.0833433555931602
5) 2 guys facing away from camera , 1 girl smiling at camera with blue shirt , 1 guy with a beverage with a jacket on .  0.08696551884189918





# 2) Multilingual (English-French) word embeddings

Let's consider a bilingual dictionary of size V_a (e.g French-English).

Let's define **X** and **Y** the **French** and **English** matrices.

They contain the embeddings associated to the words in the bilingual dictionary.

We want to find a **mapping W** that will project the source word space (e.g French) to the target word space (e.g English).

Procrustes : **W\* = argmin || W.X - Y ||  s.t  W^T.W = Id**
has a closed form solution:
**W = U.V^T  where  U.Sig.V^T = SVD(Y.X^T)**

In what follows, you are asked to: 

In [54]:
class MultilingualWordAligner:
    
    def __init__(self, fr_word2vec, en_word2vec):
        self.fr_word2vec = fr_word2vec
        self.en_word2vec = en_word2vec
        self.aligned_fr_embeddings = self.get_aligned_fr_embeddings()
        
    def get_aligned_fr_embeddings(self):
        # 1 - Get words that appear in both vocabs (= identical character strings)
        #     Use it to create the matrix X (emb_dim, vocab_size) and Y (emb_dim, vocab_size) (of embeddings for these words)
        X = Y = []
        for fr_word in self.fr_word2vec.words:
            for en_word in self.en_word2vec.words:
                
                if fr_word==en_word:
                    X.append(self.fr_word2vec.encode(fr_word))
                    Y.append(self.en_word2vec.encode(en_word))
            
        X, Y = np.array(X).transpose(), np.array(Y).transpose()
        assert X.shape[0] == 300 and Y.shape[0] == 300
        
        # 2 - Solve the Procrustes using the numpy package and: np.linalg.svd() and get the optimal W
        #     Now self.fr_word2vec.embeddings * W.transpose() is in the same space as en_word2vec.embeddings
        u, s, vh = np.linalg.svd(np.matmul(Y,X.transpose()))
        W = np.matmul(u, vh)
        assert W.shape == (300, 300)
        return np.matmul(self.fr_word2vec.embeddings, W.transpose())
    
    def get_closest_english_words(self, fr_word, k=3):
        # 3 - Return the top k English nearest neighbors to the input French word
        fr_wrd_id = self.fr_word2vec.word2id[fr_word]
        fr2en_wrd_emb = self.aligned_fr_embeddings[fr_wrd_id]
        scores = 1 - np.divide(np.dot(fr2en_wrd_emb,self.en_word2vec.embeddings.transpose()),
                               np.linalg.norm(fr2en_wrd_emb)*np.linalg.norm(self.en_word2vec.embeddings,axis=1))
        scores_int = np.argsort(scores)
        neighbors = []
        for rank in range(k):
            neighbor_id = scores_int[rank]
            neighbor_wrd = self.en_word2vec.id2word[neighbor_id]
            neighbors.append(neighbor_wrd)
            
        return neighbors

In [55]:
fr_word2vec = Word2Vec(fr_embeddings_path, vocab_size=50000)
en_word2vec = Word2Vec(en_embeddings_path, vocab_size=50000)
multilingual_word_aligner = MultilingualWordAligner(fr_word2vec, en_word2vec)

# You will be evaluated on the output of the following:
fr_words = ['finance', 'mission', 'attention', 'chat', 'chien', 'voiture', 'zut']
k = 10
for fr_word in fr_words:
    print('-' * 10)
    print(f'fr: "{fr_word}"')
    en_words = multilingual_word_aligner.get_closest_english_words(fr_word, k)
    for en_word in en_words:
        print(f'en: "{en_word}"')

Loaded 50000 pretrained word vectors
Loaded 50000 pretrained word vectors
----------
fr: "finance"
en: "cohort"
en: "Carlo"
en: "positives"
en: "referrals"
en: "diligence"
en: "trials"
en: "Crowe"
en: "Leopold"
en: "Rudy"
en: "filings"
----------
fr: "mission"
en: "HomeFollow"
en: "Olympian"
en: "Aussie"
en: "build-up"
en: "Rookie"
en: "STAR"
en: "drop-off"
en: "subtype"
en: "Manly"
en: "defender"
----------
fr: "attention"
en: "humour"
en: "humor"
en: "nationality"
en: "wit"
en: "uniformed"
en: "surrender"
en: "grace"
en: "Ship"
en: "Leisure"
en: "majesty"
----------
fr: "chat"
en: "device"
en: "WISE"
en: "Siding"
en: "Aspire"
en: "Seagate"
en: "Prairie"
en: "Forrester"
en: "Petition"
en: "Fitbit"
en: "NIH"
----------
fr: "chien"
en: "Nest"
en: "BlackBerry"
en: "Smartphones"
en: "device"
en: "carrier"
en: "devices"
en: "pacemaker"
en: "Baton"
en: "Networks"
en: "Blackberry"
----------
fr: "voiture"
en: "Active"
en: "CASH"
en: "T3"
en: "SAFE"
en: "inactive"
en: "deployments"
en: "HMS"


If you want to dive deeper on this subject: https://github.com/facebookresearch/MUSE

# 3) Sentence classification with BoV and scikit-learn

In [213]:
# 1 - Load train/dev/test of Stanford Sentiment TreeBank (SST)
#     (https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf)
train_filepath = PATH_TO_DATA / 'SST/stsa.fine.train'
dev_filepath = PATH_TO_DATA / 'SST/stsa.fine.dev'
test_filepath = PATH_TO_DATA / 'SST/stsa.fine.test.X'

# Load training sentences
train_sentences = []
train_labels = []
with open(train_filepath, 'r') as f:
    for line in f:
        data = line.strip('\n')
        train_labels.append(data[0])
        train_sentences.append(data[2:])
        
# Load dev sentences
dev_sentences = []
dev_labels = []
with open(dev_filepath, 'r') as f:
    for line in f:
        data = line.strip('\n')
        dev_labels.append(data[0])
        dev_sentences.append(data[2:])
        
# Load test sentences
test_sentences = []
test_labels = []
with open(test_filepath, 'r') as f:
    for line in f:
        data = line.strip('\n')
        test_sentences.append(data)

In [216]:
# 2 - Encode sentences with the BoV model above
word2vec = Word2Vec(en_embeddings_path, vocab_size=50000)
sentence2vec = BagOfWords(word2vec)
    
idf = None
#idf = sentence2vec.build_idf(sentences)

### Training
train_embedding = []
for stc in train_sentences:
    train_embedding.append(sentence2vec.encode(stc, idf))  # BagOfWords-idf

train_sentences_embedding = np.array(train_embedding)
train_labels = np.array(train_labels)

### Dev
dev_embedding = []
for stc in dev_sentences:
    dev_embedding.append(sentence2vec.encode(stc, idf))  # BagOfWords-idf

dev_sentences_embedding = np.array(dev_embedding)
dev_labels = np.array(dev_labels)

### Test
test_embedding = []
for stc in test_sentences:
    test_embedding.append(sentence2vec.encode(stc, idf))  # BagOfWords-idf

test_sentences_embedding = np.array(test_embedding)


### Let's have a first glance at the labels distributions in each dataset

#print('-'*50, 'Training Set')
#for occ in np.unique(train_labels, return_counts=True)[1]:
#    print(occ/len(train_labels))
    
#print('-'*50, 'Dev Set')
#for occ in np.unique(dev_labels, return_counts=True)[1]:
#    print(occ/len(dev_labels))

# The distributions are very close to each other

Loaded 50000 pretrained word vectors


In [92]:
# 3 - Learn Logistic Regression on top of sentence embeddings using scikit-learn
#     (consider tuning the L2 regularization on the dev set)
#     In the paper, the accuracy for average of word vectors is 32.7%
#     (VecAvg, table 1, https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf)

from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

log_reg = RidgeClassifier(alpha=1.0, fit_intercept=True, max_iter=None, tol=0.001)
log_reg.fit(train_sentences_embedding,train_labels)
dev_pred_labels = log_reg.predict(dev_sentences_embedding)
accuracy = accuracy_score(dev_labels, dev_pred_labels)
print(accuracy)

Accuracy = cross_val_score(log_reg, train_sentences_embedding, train_labels,
                           cv=6,scoring='accuracy')
print(Accuracy)

# TO DO : GRID SEARCH (or golden section) and ERROR ANALYSIS
# Attention à ne pas overfitter sur le validation set avec la cross val !!
# Apply PCA

0.39691189827429607
[0.41684211 0.41333333 0.39929825 0.41263158 0.41983122 0.39592124]


In [86]:
# 4 - Produce 2210 predictions for the test set (in the same order). One line = one prediction (=0,1,2,3,4).
#     Attach the output file "logreg_bov_y_test_sst.txt" to your deliverable.
#     You will be evaluated on the results of the test set.

test_pred_labels = log_reg.predict(test_sentences_embedding)
with open(r'test.txt', 'w') as f:
    f.write("\n".join(" ".join(map(str, pred)) for pred in test_pred_labels))

In [95]:
# BONUS!
# 5 - Try to improve performance with another classifier
#     Attach the output file "XXX_bov_y_test_sst.txt" to your deliverable (where XXX = the name of the classifier)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score#roc_auc_score

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3, min_samples_leaf=5),
                                   n_estimators=100,algorithm="SAMME.R", learning_rate=0.2)

fancy_accuracy = cross_val_score(ada_clf, train_sentences_embedding, train_labels,
                           cv=6,scoring='accuracy')
print(fancy_accuracy, np.mean(fancy_accuracy))

# TO DO : GRID SEARCH (or golden section) and ERROR ANALYSIS

[0.37684211 0.37614035 0.36701754 0.37263158 0.38396624 0.37271449] 0.3748853850519407


# 4) Sentence classification with LSTMs in Keras

## 4.1 - Preprocessing

In [97]:
import tensorflow as tf

In [190]:
# 1 - Using the same dataset, transform text to integers using tf.keras.preprocessing.text.one_hot function
#     https://keras.io/preprocessing/text/

text = ''
for stc in train_sentences:
    text += stc + ' '
one_hot_dimension = len(set(tf.keras.preprocessing.text.text_to_word_sequence(text)))
one_hot_dimension = round(one_hot_dimension)#*1.3) # Hashing trick might collide words
encoding = tf.keras.preprocessing.text.one_hot(text, one_hot_dimension)

### Create the training set
train_encoded_data = []
old_nb_words = 0
for stc in train_sentences:
    nb_words = len(tf.keras.preprocessing.text.text_to_word_sequence(stc))
    train_encoded_data.append(encoding[old_nb_words:old_nb_words+nb_words])
    old_nb_words += nb_words
    
### Create the dev and test sets
# First, create a mapping of the words to their encoded value
mapping = {}
for wrd_idx, word in enumerate(tf.keras.preprocessing.text.text_to_word_sequence(text)):
    mapping[word] = encoding[wrd_idx]

# Dev set
dev_encoded_data = []
for stc in dev_sentences:
    local_encoding = []
    for word in tf.keras.preprocessing.text.text_to_word_sequence(stc):
        try:
            local_encoding.append(mapping[word])
        except:
            pass
    dev_encoded_data.append(local_encoding)
    
# Test set
test_encoded_data = []
for stc in test_sentences:
    local_encoding = []
    for word in tf.keras.preprocessing.text.text_to_word_sequence(stc):
        try:
            local_encoding.append(mapping[word])
        except:
            pass
    test_encoded_data.append(local_encoding)

**Padding input data**

Models in Keras (and elsewhere) take batches of sentences of the same length as input. It is because Deep Learning framework have been designed to handle well Tensors, which are particularly suited for fast computation on the GPU.

Since sentences have different sizes, we "pad" them. That is, we add dummy "padding" tokens so that they all have the same length.

The input to a Keras model thus has this size : (batchsize, maxseqlen) where maxseqlen is the maximum length of a sentence in the batch.

In [197]:
# 2 - Pad your sequences using tf.keras.preprocessing.sequence.pad_sequences
#     https://keras.io/preprocessing/sequence/

# padding='pre' or 'post', whether to add zeros at the beginning or at the end of the sequence
# First, get the maximal length on the total dataset to padd equally all of them
max_length = 0
for stc_encoded in train_encoded_data+dev_encoded_data+test_encoded_data:
    local_length = len(stc_encoded)
    if local_length > max_length:
        max_length = local_length
        
x_train = tf.keras.preprocessing.sequence.pad_sequences(train_encoded_data, maxlen=max_length)
x_dev = tf.keras.preprocessing.sequence.pad_sequences(dev_encoded_data, maxlen=max_length)
x_test = tf.keras.preprocessing.sequence.pad_sequences(test_encoded_data, maxlen=max_length)

## 4.2 - Design and train your model

In [200]:
# 3 - Design your encoder + classifier using tensorflow.keras.layers
#     In Keras, Torch and other deep learning framework, we create a "container" which is the Sequential() module.
#     Then we add components to this container : the lookup-table, the LSTM, the classifier etc.
#     All of these components are contained in the Sequential() and are trained together.
#     Note that the embedding layer is initialized randomly and does not take advantage of pre-trained word embeddings.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Activation

embed_dim  = 32  # word embedding dimension
nhid       = 64  # number of hidden units in the LSTM
vocab_size = 50000  # size of the vocabulary
n_classes  = 5

model = Sequential()
model.add(Embedding(vocab_size, embed_dim)) # to adapt using our embedding
model.add(LSTM(nhid, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(n_classes, activation='sigmoid'))

In [201]:
# 4 - Define your loss/optimizer/metrics

loss_classif     =  'categorical_crossentropy' # find the right loss for multi-class classification
optimizer        =  tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False) # find the right optimizer
metrics_classif  =  ['accuracy']

# Observe how easy (but blackboxed) this is in Keras
model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          1600000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 325       
Total params: 1,625,157
Trainable params: 1,625,157
Non-trainable params: 0
_________________________________________________________________
None


In [217]:
# 5 - Train your model and find the best hyperparameters for your dev set
#     you will be evaluated on the quality of your predictions on the test set
#     Keras expects y_train and y_dev to be one-hot encodings of the labels, i.e. with shape=(n_samples, 5)

### Training set
int_train_labels = train_labels.astype('int')
y_train = np.zeros((int_train_labels.shape[0], n_classes))
for index, value in enumerate(int_train_labels):
    y_train[index,value] = 1
    
### Dev set
int_dev_labels = dev_labels.astype('int')
y_dev = np.zeros((int_dev_labels.shape[0], n_classes))
for index, value in enumerate(int_dev_labels):
    y_dev[index,value] = 1

bs = 64
n_epochs = 6

history = model.fit(x_train, y_train, batch_size=bs, epochs=n_epochs, validation_data=(x_dev, y_dev))

# Improvements : 
#   - Grid Search of parameters
#   - Attention mechanism ? (Transformers)
#   - Better Embedding

Train on 8544 samples, validate on 1101 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
# 6 - Generate your predictions on the test set using model.predict(x_test)
#     https://keras.io/models/model/
#     Log your predictions in a file (one line = one integer: 0,1,2,3,4)
#     Attach the output file "logreg_lstm_y_test_sst.txt" to your deliverable.

# TYPE CODE HERE

## 4.3 - innovate !

In [None]:
# 7 - Open question: find a model that is better on your dev set
#     (e.g: use a 1D ConvNet, use a better classifier, pretrain your lookup tables ..)
#     you will get point if the results on the test set are better: be careful of not overfitting your dev set too much..
#     Attach the output file "XXX_XXX_y_test_sst.txt" to your deliverable.

# The longest length of the sentences is very low (49) : no need for a Transformer ?
# Using RNN should probably yield good results