# Kaggle Dataset downloading procedure

If you don't have the GloVe word-vector files downloaded yet, you can get them from Kaggle. Any Kaggle API tutorial describes the procedure. You can skip this section if you already have GloVe downloaded.

In [None]:
import os 
os.getcwd()

In [None]:
# create a dir first
!mkdir .kaggle

In [None]:
os.listdir()

In [None]:
import getpass
import json
import os

# SECURITY: API credentials must never be hard-coded in a notebook that is
# shared or committed. Any key that was previously published in this cell
# should be revoked from the Kaggle account settings.
# The credentials are read from the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables when they are set; otherwise the user is prompted for them
# (the key is read with getpass so it is not echoed into the cell output).
kaggle_username = os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: ")
kaggle_key = os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: ")
token = {"username": kaggle_username, "key": kaggle_key}

# Create the target directory if needed so this cell does not depend on the
# separate mkdir cell having been run from the right working directory.
os.makedirs('/content/.kaggle', exist_ok=True)
with open('/content/.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)

In [None]:
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json

In [None]:
!kaggle config set -n path -v{/content}

In [None]:
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
# download glove word to vec from kaggle
!kaggle datasets download -d rtatman/glove-global-vectors-for-word-representation

In [None]:
# The Kaggle download is a zip archive, so it has to be extracted before use.
import zipfile

# NOTE(review): the literal "{/content}" path component presumably comes from
# the `kaggle config set -n path -v{/content}` cell above — confirm the
# archive really lands in this directory.
test_file_path = '/content/{/content}/datasets/rtatman/glove-global-vectors-for-word-representation/glove-global-vectors-for-word-representation.zip'


# Extract every member of the archive into /content/sample_data/datasets.
with zipfile.ZipFile(test_file_path,"r") as zip_ref:
    zip_ref.extractall("/content/sample_data/datasets")

In [None]:
# Copy the downloaded archive to a path under /content/gdrive.
# NOTE(review): no Drive mount call is visible in this notebook — presumably
# Google Drive has to be mounted at /content/gdrive before this cell runs.
# NOTE(review): this writes to '.../My Drive/...' while later cells read from
# '.../MyDrive/...' — confirm both resolve to the same folder.
import shutil
file_path = r'/content/{/content}/datasets/rtatman/glove-global-vectors-for-word-representation/glove-global-vectors-for-word-representation.zip'
target_path = r'/content/gdrive/My Drive/Datasets/global-vectors-for-word-representation.zip'
shutil.copyfile(file_path, target_path)

# Now that everything is set, we are ready to go!!

# Data Preparation
First we need to do some preparation: some of our models require the sentences to be tokenized, some do not. For that reason we'll make a simple Sentence class where we keep both the raw sentence and the tokenized sentence. The individual methods below will then pick the input they need.

# Ultimate Session

In [None]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

In [None]:
def read_glove_vecs(glove_file):
    """
    Load a GloVe text file into lookup dictionaries.

    Every non-blank line of the file is expected to hold a word followed by
    the whitespace-separated components of its vector.

    Arguments:
    glove_file -- path to the GloVe text file (read as UTF-8)

    Returns:
    words_to_index -- dictionary mapping each word to its 1-based position in the sorted vocabulary
    index_to_words -- dictionary mapping each 1-based index back to its word
    word_to_vec_map -- dictionary mapping each word to its vector (numpy float64 array)
    """
    word_to_vec_map = {}
    # Be explicit about the encoding so reading does not depend on the
    # platform's default locale.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line) instead of
                # failing with an IndexError on fields[0].
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # Indices are assigned in sorted word order and start at 1, so index 0 is
    # never given to any word.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map


In [None]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('/content/sample_data/datasets/glove.6B.200d.txt')

Let's test it now with the word 'cucumber'

In [None]:
# Sanity check of the lookup tables: word -> index and index -> word.
word = "cucumber"
# Hard-coded vocabulary index used for the reverse lookup below.
idx = 289846
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(idx) + "th word in the vocabulary is", index_to_word[idx])

In [None]:
 import nltk
 nltk.download('stopwords')


In [None]:
# NOTE(review): inverse_vocabulary is not referenced anywhere else in the
# visible code — confirm it is still needed.
inverse_vocabulary = ['<unk>'] 
# Module-level NLTK Treebank tokenizer; sentences_to_indices uses it to split
# each sentence into tokens.
tokenizer = nltk.tokenize.TreebankWordTokenizer()
def sentences_to_indices(X, word_to_index, max_len, tokenize=None):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape is such that it can be given to `Embedding()`.

    Tokens that are missing from `word_to_index` are skipped (they do not
    occupy a position), sentences with more than `max_len` known tokens are
    truncated, and shorter sentences are right-padded with zeros.

    Arguments:
    X -- array of sentences (strings), of shape (m,)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of word indices kept per sentence
    tokenize -- optional callable turning one sentence into a list of tokens;
                defaults to the module-level `tokenizer.tokenize`

    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    if tokenize is None:
        # Fall back to the shared Treebank tokenizer defined next to this function.
        tokenize = tokenizer.tokenize

    m = X.shape[0]                                   # number of sentences

    # Zero-initialised, so every unused trailing position stays 0.
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        # NOTE: the sentence is tokenized as-is; no lower-casing is applied.
        j = 0                                        # next free column in row i
        for w in tokenize(X[i]):
            if j >= max_len:
                # Row is full: truncate explicitly instead of relying on a
                # swallowed IndexError.
                break
            try:
                index = word_to_index[w]
            except KeyError:
                # Out-of-vocabulary token: skip it without consuming a column.
                continue
            X_indices[i, j] = index
            j += 1

    return X_indices

In [None]:
X1 = np.array(["hey dude", "how are you", "how is it going"])
X1_indices = sentences_to_indices(X1,word_to_index, max_len = 5)
print("X1 =", X1)
print("X1_indices =\n", X1_indices)

In [None]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a non-trainable Keras Embedding() layer and loads in pre-trained GloVe vectors.

    The embedding dimension is taken from the vectors in `word_to_vec_map`,
    so the function works with GloVe files of any dimensionality.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    embedding_layer -- pretrained layer Keras instance

    Raises:
    ValueError -- if word_to_vec_map is empty, since the embedding dimension cannot be inferred
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")

    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    # Read the dimensionality from an arbitrary vector rather than from one
    # specific word that may be absent from the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Step 1
    # Initialize the embedding matrix as a numpy array of zeros.
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Step 2
    # Set each row "idx" of the embedding matrix to be
    # the word vector representation of the idx'th word of the vocabulary
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    # Step 3
    # Define Keras embedding layer with the correct input and output sizes.
    # trainable=False keeps the pre-trained vectors fixed during training.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # Step 4
    # Build the embedding layer, it is required before setting the weights of the embedding layer.
    embedding_layer.build((None,))

    # Set the weights of the embedding layer to the embedding matrix. The layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

Let's test this function

In [None]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

In [None]:
def convert_to_one_hot(Y, C):
    """
    One-hot encode integer class labels.

    Arguments:
    Y -- numpy array of integer class indices (any shape; it is flattened)
    C -- number of classes, i.e. the width of each one-hot row

    Returns:
    numpy array of shape (Y.size, C) whose i-th row is the one-hot vector of the i-th label
    """
    # Row k of the identity matrix is exactly the one-hot vector of class k.
    class_rows = np.eye(C)
    flat_labels = Y.reshape(-1)
    return class_rows[flat_labels]

**Read the data**

In [None]:
import csv
def read_csv(filename = '/content/train_command.csv'):
    """
    Read a two-column CSV file of (phrase, integer label) rows.

    Blank lines in the file are ignored.

    Arguments:
    filename -- path to the CSV file; column 0 holds the phrase, column 1 the label

    Returns:
    X -- numpy array holding the phrases (strings)
    Y -- numpy array holding the labels converted to int
    """
    phrase = []
    labels = []

    # newline='' is what the csv module documents for files handed to
    # csv.reader, so that newlines inside quoted fields are parsed correctly.
    with open(filename, newline='') as csvDataFile:
        for row in csv.reader(csvDataFile):
            if not row:
                # csv.reader yields an empty list for a blank line; skip it
                # instead of failing on row[0].
                continue
            phrase.append(row[0])
            labels.append(row[1])

    X = np.asarray(phrase)
    Y = np.asarray(labels, dtype=int)

    return X, Y


In [None]:
X_train, Y_train = read_csv('train_command.csv')
X_test, Y_test= read_csv('validation command.csv')

In [None]:
# NOTE(review): text_to_word_list is not defined anywhere in the visible code,
# and its return value is discarded here — confirm this cell is still needed.
for command in X_train:
  text_to_word_list(command)

In [None]:
# Picks the training sentence with the most characters and counts its
# whitespace-separated words.
# NOTE(review): the longest sentence by characters is not necessarily the one
# with the most words, and sentences_to_indices splits with the Treebank
# tokenizer rather than str.split — confirm this is the intended maximum.
maxLen = len(max(X_train, key=len).split())

In [None]:
print(maxLen)

In [None]:
def similarity_model(input_shape, word_to_vec_map, word_to_index, num_classes=15):
    """
    Function creating the graph of a two-layer LSTM sentence classifier on top of pre-trained GloVe embeddings.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its GloVe vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary
    num_classes -- number of output classes, i.e. units of the final softmax layer (default 15)

    Returns:
    model -- a model instance in Keras
    """

    # Define sentence_indices as the input of the graph.
    # It is of shape input_shape and dtype 'int32' (as it contains indices, which are integers).
    sentence_indices = Input(input_shape, dtype='int32')

    # Create the embedding layer pretrained with GloVe vectors
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

    # Propagate sentence_indices through the embedding layer
    embeddings = embedding_layer(sentence_indices)

    # First LSTM layer with 128-dimensional hidden state; returns the full
    # sequence so that the second LSTM can consume it.
    X = LSTM(128, return_sequences=True)(embeddings)
    # Dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Second LSTM layer with 128-dimensional hidden state; returns only the
    # final hidden state, not a batch of sequences.
    X = LSTM(128, return_sequences=False)(X)
    # Dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Dense layer with one unit per output class
    X = Dense(units=num_classes)(X)
    # Softmax activation turns the class scores into probabilities
    X = Activation('softmax')(X)

    # Create Model instance which converts sentence_indices into X.
    model = Model(sentence_indices, X)

    return model

In [None]:
model = similarity_model((maxLen,), word_to_vec_map, word_to_index)
model.summary()

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
'''
X_train_indices = []
for command in X_train:
  temp = sentences_to_indices(command,word_to_index,maxLen)
  X_train_indices.append(temp)'''

In [None]:
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)


In [None]:

def convert_to_one_hot(Y, C):
    """
    Turn an array of integer class indices into one-hot rows.

    Arguments:
    Y -- numpy array of integer class indices; flattened before encoding
    C -- total number of classes (length of every one-hot row)

    Returns:
    numpy array of shape (Y.size, C) with a single 1.0 per row
    """
    identity = np.eye(C)
    indices = Y.reshape(-1)
    # Fancy-indexing the identity matrix selects one one-hot row per label.
    one_hot = identity[indices]
    return one_hot


In [None]:
Y_train_oh = convert_to_one_hot(Y_train,15)

In [None]:
X_train_indices.shape

In [None]:
model.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 20, shuffle=True)

In [None]:
# Now test the model

X_test_indices = sentences_to_indices(X_test, word_to_index, maxLen)
Y_test_oh = convert_to_one_hot(Y_test, C = 15)


In [None]:
loss, acc = model.evaluate(X_test_indices, Y_test_oh)
print()
print("Test accuracy = ", acc)

In [None]:
sen = np.array(['tell me a jock dude can you'])
sen_idx = sentences_to_indices(sen,word_to_index,max_len = maxLen)


In [None]:
print(np.argmax(model.predict(sen_idx)))

In [None]:
model.save('/content/gdrive/MyDrive/Datasets/Ultimate_Dude_Model.h5')

# Great Job Dude!!

# Test area

In [None]:
from tensorflow.keras.models import load_model
import nltk
import numpy as np

ultimate_galvin_model = load_model('/content/gdrive/MyDrive/Datasets/Ultimate_Dude_Model.h5')

**prerequisites for prediction**

In [None]:
# The GloVe data is stored as a zip archive, so it has to be extracted before use.
import zipfile

# NOTE(review): presumably Google Drive has to be mounted at /content/gdrive
# for this path to exist — no mount call is visible in this notebook.
test_file_path = '/content/gdrive/MyDrive/Datasets/global-vectors-for-word-representation.zip'


# Extract every member of the archive into /content/sample_data/datasets.
with zipfile.ZipFile(test_file_path,"r") as zip_ref:
    zip_ref.extractall("/content/sample_data/datasets")

In [None]:
def read_glove_vecs(glove_file):
    """
    Load a GloVe text file into lookup dictionaries.

    Every non-blank line of the file is expected to hold a word followed by
    the whitespace-separated components of its vector.

    Arguments:
    glove_file -- path to the GloVe text file (read as UTF-8)

    Returns:
    words_to_index -- dictionary mapping each word to its 1-based position in the sorted vocabulary
    index_to_words -- dictionary mapping each 1-based index back to its word
    word_to_vec_map -- dictionary mapping each word to its vector (numpy float64 array)
    """
    word_to_vec_map = {}
    # Be explicit about the encoding so reading does not depend on the
    # platform's default locale.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line) instead of
                # failing with an IndexError on fields[0].
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # Indices are assigned in sorted word order and start at 1, so index 0 is
    # never given to any word.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map


In [None]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('/content/sample_data/datasets/glove.6B.200d.txt')

In [None]:

# NOTE(review): inverse_vocabulary is not referenced anywhere else in the
# visible code — confirm it is still needed.
inverse_vocabulary = ['<unk>'] 
# Module-level NLTK Treebank tokenizer; sentences_to_indices uses it to split
# each sentence into tokens.
tokenizer = nltk.tokenize.TreebankWordTokenizer()
def sentences_to_indices(X, word_to_index, max_len, tokenize=None):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape is such that it can be given to `Embedding()`.

    Tokens that are missing from `word_to_index` are skipped (they do not
    occupy a position), sentences with more than `max_len` known tokens are
    truncated, and shorter sentences are right-padded with zeros.

    Arguments:
    X -- array of sentences (strings), of shape (m,)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of word indices kept per sentence
    tokenize -- optional callable turning one sentence into a list of tokens;
                defaults to the module-level `tokenizer.tokenize`

    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    if tokenize is None:
        # Fall back to the shared Treebank tokenizer defined next to this function.
        tokenize = tokenizer.tokenize

    m = X.shape[0]                                   # number of sentences

    # Zero-initialised, so every unused trailing position stays 0.
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        # NOTE: the sentence is tokenized as-is; no lower-casing is applied.
        j = 0                                        # next free column in row i
        for w in tokenize(X[i]):
            if j >= max_len:
                # Row is full: truncate explicitly instead of relying on a
                # swallowed IndexError.
                break
            try:
                index = word_to_index[w]
            except KeyError:
                # Out-of-vocabulary token: skip it without consuming a column.
                continue
            X_indices[i, j] = index
            j += 1

    return X_indices

In [None]:
# NOTE(review): maxLen is hard-coded here; presumably it has to equal the
# input length the saved model was built with — confirm against the training run.
maxLen = 11
test_sen = np.array(['hey whats your name dude'])
# Convert the single test sentence into a (1, maxLen) array of word indices.
sen_idx = sentences_to_indices(test_sen,word_to_index,maxLen)

# Print the position of the largest value in the model's prediction output.
print(np.argmax(ultimate_galvin_model.predict(sen_idx)))

# That's it