## Setup File for Keras Models
Use `%run Setup.ipynb` in another notebook to perform all these tasks automatically.

Parameters that can be re-configured:

In [3]:
# Tunable parameters for tokenisation, padding, the train/validation split
# and the pre-trained word embeddings.
MAX_NB_WORDS = 40000  # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = 30  # fixed length of each text in tokens, padding included; longer texts are truncated
VALIDATION_SPLIT = 0.2  # fraction of the samples held out for validation
EMBEDDING_DIM = 200  # embedding dimensions for word vectors (word2vec/GloVe)
# The path is derived from EMBEDDING_DIM so the GloVe file that gets loaded
# always has the dimensionality the embedding matrix is allocated with.
GLOVE_DIR = "dataset/glove/glove.twitter.27B."+str(EMBEDDING_DIM)+"d.txt"
# NOTE(review): the banner reports MAX_SEQUENCE_LENGTH+4 rather than
# MAX_SEQUENCE_LENGTH itself; the output is kept unchanged -- confirm the +4 is intended.
print("[i] Loaded Parameters:\n",
      MAX_NB_WORDS,MAX_SEQUENCE_LENGTH+4,
      VALIDATION_SPLIT,EMBEDDING_DIM,"\n",
      GLOVE_DIR)

[i] Loaded Parameters:
 40000 34 0.2 200 
 dataset/glove/glove.twitter.27B.200d.txt


Imports:

In [4]:
print("[i] Importing Modules...")
import numpy as np
import pandas as pd
import re, sys, os, csv, keras, pickle

[i] Importing Modules...


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
from keras import regularizers, initializers, optimizers, callbacks
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten, Concatenate
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras.utils import plot_model
print("[+] Using Keras version",keras.__version__)

[+] Using Keras version 2.2.2


In [6]:
# Progress banner printed once the import cells have run.
print("[+] Finished Importing Modules")

[+] Finished Importing Modules


In [7]:
# Load the corpus: column 0 of every CSV row is the text, column 1 its label.
texts, labels = [], []
print("[i] Reading from csv file...", end="")
# newline='' lets the csv module handle line endings itself (needed for quoted
# fields that contain newlines). The encoding is pinned to UTF-8, the same
# encoding the GloVe file is read with, instead of the platform default.
with open('data.csv', newline='', encoding="utf8") as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        if not row:
            # csv.reader yields [] for a blank line; indexing it would raise IndexError.
            continue
        texts.append(row[0])
        labels.append(row[1])
print("Done!")

[i] Reading from csv file...Done!


Convert text to word tokens (numbers that refer to the words)

In [8]:
# Disabled one-off step: fits a Tokenizer (limited to MAX_NB_WORDS words) on
# `texts` and pickles it to tokenizer.pickle. The code is wrapped in a string
# literal so it does not execute; being the cell's only expression, the string
# itself is echoed as the cell output.
"""
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("[i] Saved word tokenizer to file: tokenizer.pickle")
"""

'\ntokenizer = Tokenizer(num_words=MAX_NB_WORDS)\ntokenizer.fit_on_texts(texts)\nwith open(\'tokenizer.pickle\', \'wb\') as handle:\n    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)\nprint("[i] Saved word tokenizer to file: tokenizer.pickle")\n'

In [9]:
# Reuse the cached tokenizer when tokenizer.pickle exists; otherwise fit a new
# one on `texts` and cache it, so the notebook also runs from a fresh checkout
# where the pickle has not been created yet.
if os.path.exists('tokenizer.pickle'):
    # NOTE: pickle.load can execute arbitrary code -- only load a file you created yourself.
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
else:
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("[i] Saved word tokenizer to file: tokenizer.pickle")

Convert tweets to sequences of word tokens with zero padding at the front and back

In [10]:
# Vocabulary held by the tokenizer (dictionary: word to integer index).
word_index = tokenizer.word_index
print('[i] Found %s unique tokens.' % len(word_index))

# Turn every text into its list of word indices.
sequences = tokenizer.texts_to_sequences(texts)

# Two-step zero padding to a fixed width of MAX_SEQUENCE_LENGTH:
#   1. pad on the left up to MAX_SEQUENCE_LENGTH-4; longer sequences are cut
#      at the front (truncating='pre' is the pad_sequences default, spelled
#      out here), i.e. their last tokens are the ones kept;
#   2. pad on the right up to MAX_SEQUENCE_LENGTH, which appends 4 zeros.
data_int = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH - 4,
    padding='pre',
    truncating='pre',
)
data = pad_sequences(data_int, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

[i] Found 34359 unique tokens.


In [11]:
# One-hot encode the label column read from the CSV and report tensor shapes.
# NOTE: `labels` is overwritten here, so re-running this cell on its own would
# feed the already one-hot array back into to_categorical.
labels = to_categorical(np.asarray(labels)) # convert to one-hot encoding vectors
print('[+] Shape of data tensor:', data.shape)
print('[+] Shape of label tensor:', labels.shape)

[+] Shape of data tensor: (47288, 30)
[+] Shape of label tensor: (47288, 5)


In [12]:
# Shuffle samples and labels with one shared permutation so they stay aligned,
# then work out how many samples to hold out for validation.
# A dedicated, seeded generator makes the train/validation split identical on
# every run and leaves NumPy's global random state untouched.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [14]:
# Hold out the last nb_validation_samples rows for validation, train on the rest.
# Slicing at an explicit boundary instead of a negative offset keeps the split
# correct when nb_validation_samples is 0: data[:-0] would be empty and
# data[-0:] would be the whole array, i.e. the two sets would be swapped.
# max(..., 0) keeps the old result (empty training set) if more validation
# samples are requested than there are rows.
split_at = max(data.shape[0] - nb_validation_samples, 0)
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

# Column sums of the one-hot labels give the number of samples per class.
print('[i] Number of entries in each category:')
print("[+] Training:\n",y_train.sum(axis=0))
print("[+] Validation:\n",y_val.sum(axis=0))

[i] Number of entries in each category:
[+] Training:
 [ 7705. 13044. 12716.  3476.   890.]
[+] Validation:
 [1938. 3253. 3222.  825.  219.]


### Preparing the Embedding layer

Compute an index mapping words to known embeddings, by parsing the data dump of pre-trained embeddings.

We use pre-trained [GloVe](https://nlp.stanford.edu/projects/glove/) vectors from Stanford NLP. Words that do not appear in GloVe keep an all-zero embedding vector.
می خواهد دیکشنری از کلمه به بردار بسازد که بعدا از این دیکشنری برای ساخت ماتریس استفاده می کند 
در واقع می خواهد محتوای فایل متنی گلاو را خط به خط بخواند و برای هر خطی  کلمه ی مربوطه کلید عنصر در دیکشنری و بردار آن کلمه ولیو آن کلمه بشود ***********
بعد از ساخت دیکشنری می خواهیم ماتریس را بسازیم که سطرهای ماتریس کلمات و ستون ها به تعداد ابعاد، بردارهای کلمه هستند که به این ماتریس 
embedding matrix 
می گویند، دلیل ساخت این ماتریس این است که وزن لایه ی 
embedding 
را نشان می دهد و این ماتریس را درون لایه لود می کنیم و بعد فریز می کنیم تا 
train
نشود


In [15]:
# Step 1: parse the GloVe text dump into a word-to-vector dictionary.
# Step 2: build the embedding matrix whose row i holds the vector of the word
#         that the tokenizer maps to index i.
embeddings_index = {}  # dictionary: word to embedding vector
print("[i] Loading GloVe from:",GLOVE_DIR,"...",end="")
# `with` closes the file even if parsing a line raises.
with open(GLOVE_DIR, encoding="utf8") as f:
    for line in f:  # one entry per line: the word followed by its coefficients
        # Split on single spaces only. A bare split() treats every Unicode
        # whitespace character as a separator, so a token that is itself such
        # a character would vanish and a number would land in the word slot.
        values = line.rstrip().split(" ")
        word = values[0]  # the first field is the word itself
        if len(values) - 1 != EMBEDDING_DIM:
            # Blank or malformed line: a vector of the wrong length could not
            # be copied into the embedding matrix below.
            continue
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
if not embeddings_index:
    # Nothing matched, e.g. a GloVe file of another dimensionality was given.
    raise ValueError("no %d-dimensional vectors found in %s" % (EMBEDDING_DIM, GLOVE_DIR))
print("Done.\n[+] Proceeding with Embedding Matrix...", end="")

# One row per entry of word_index plus one extra row, because the indices from
# word_index are used directly as row numbers (presumably they start at 1 and
# row 0 is left for the padding value -- confirm against the tokenizer).
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    # .get returns None when GloVe has no vector for this word.
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # words not found in the embedding index stay all-zero
print("[i] Completed!")

[i] Loading GloVe from: dataset/glove/glove.twitter.27B.200d.txt ...Done.
[+] Proceeding with Embedding Matrix...[i] Completed!


In [16]:
# Final progress banner, printed after all setup cells above have run.
print("[i] Finished running setup.")

[i] Finished running setup.
