## Functional API Tutorial

#### Day 1
Preliminary of the model. 
Different approaches

In [19]:
import numpy as np
import keras
import imdb_functions
from keras.layers import Input, Dense
from keras.models import Model # For functional API
from sklearn.feature_extraction.text import CountVectorizer

##### One other option to use keras is through TF
import tensorflow as tf
from tensorflow import keras #import tensorflow.keras as keras
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model # For functional API
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
def load_imdb(path, shuffle=True, random_state=42):
    import glob 
    print("Loading the imdb data")
    
    train_neg_files = glob.glob(path+"/train/neg/*.txt")
    train_pos_files = glob.glob(path+"/train/pos/*.txt")
    
    X_train_corpus = []
    y_train = []
    
    for tnf in train_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        line = f.read()
        X_train_corpus.append(line)
        y_train.append(0)
        f.close()
    
    for tpf in train_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        line = f.read()
        X_train_corpus.append(line)
        y_train.append(1)
        f.close()
    
    print("Train Data loaded.")
    
    test_neg_files = glob.glob(path+"/test/neg/*.txt")
    test_pos_files = glob.glob(path+"/test/pos/*.txt")
    
    X_test_corpus = []
    y_test = []
    
    for tnf in test_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(0)
        f.close()
    
    for tpf in test_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(1)
        f.close()
    
    print("Test Data loaded.")
    
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    if shuffle:
        np.random.seed(random_state)
        indices = np.random.permutation(len(y_train))       
        
        X_train_corpus = [X_train_corpus[i] for i in indices]
        y_train = y_train[indices]
        
        indices = np.random.permutation(len(y_test))
        
        X_test_corpus = [X_test_corpus[i] for i in indices]
        y_test = y_test[indices]
        
    return X_train_corpus, y_train, X_test_corpus , y_test

In [21]:
path = r"/Users/ekremguzelyel/Desktop/Assignments/Research/aclImdb"
X_train_corpus , y_train, X_test_corpus , y_test = load_imdb(path)

Loading the imdb data
Train Data loaded.
Test Data loaded.


In [22]:
print('len(X_train_corpus)', len(X_train_corpus),
      '\ny_train.shape', y_train.shape,
      '\nlen(X_test_corpus)', len(X_test_corpus),
      '\ny_test.shape', y_test.shape)

len(X_train_corpus) 25000 
y_train.shape (25000,) 
len(X_test_corpus) 25000 
y_test.shape (25000,)


### Vectorizer

In [23]:
token = r"(?u)\b[\w\'/]+\b"
# Use corpus here. Dk y.
#mind
vectorizer = CountVectorizer(token_pattern=token, min_df=5, stop_words=["the","a","of","and","br","to"])
X_train_vector = vectorizer.fit_transform(X_train_corpus)
X_test_vector = vectorizer.transform(X_test_corpus)

In [24]:
X_train_vector.shape

(25000, 28237)

In [25]:
input_nodes = Input(shape=(X_train_vector.shape[1],)) # Returns a Tensor

In [26]:
type(input_nodes)

tensorflow.python.framework.ops.Tensor

In [42]:
# a layer instance is callable on a tensor, and returns a tensor
x = Dense(30, activation='relu')(input_nodes)
x = Dense(30, activation='relu')(x)
output_nodes = Dense(1, activation='sigmoid')(input_nodes) ## it was (x) before

In [44]:
# This creates a model that includes
# the Input layer and three Dense layers
model = Model(inputs=input_nodes, outputs=output_nodes)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(X_train_vector, y_train, epochs=2)  # starts training

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1239b9be0>

In [45]:
loss, accuracy = model.evaluate(X_test_vector, y_test)



In [46]:
loss, accuracy

(0.2805835545945167, 0.89344)

In [47]:
score = model.evaluate(X_train_vector, y_train)



In [48]:
score

[0.14032445752620698, 0.96548]

__Question:__ The output of model.predict happened to be a numpy array. Why this wasn't tha case for logistic regression in SkLearn?

In [33]:
prediction_result= model.predict(X_test_vector, batch_size=None)

In [34]:
prediction_result[:50]

array([[0.47845063],
       [0.836907  ],
       [0.9691989 ],
       [0.01076878],
       [0.12572308],
       [0.15509939],
       [0.7812954 ],
       [0.93552536],
       [0.9683479 ],
       [0.864189  ],
       [0.68588114],
       [0.6666814 ],
       [0.43990278],
       [0.28491402],
       [0.85630757],
       [0.7985754 ],
       [0.01198623],
       [0.02258616],
       [0.6514368 ],
       [0.34247124],
       [0.7071451 ],
       [0.46450958],
       [0.03223292],
       [0.5372605 ],
       [0.00122444],
       [0.18969762],
       [0.9825509 ],
       [0.22159268],
       [0.37487566],
       [0.15257847],
       [0.03898124],
       [0.10919598],
       [0.47213268],
       [0.861729  ],
       [0.04137871],
       [0.08974812],
       [0.9277585 ],
       [0.8711819 ],
       [0.89770776],
       [0.00331046],
       [0.99497014],
       [0.872597  ],
       [0.07298311],
       [0.9538998 ],
       [0.8940372 ],
       [0.05027941],
       [0.07586839],
       [0.797

--------
__Task:__ Map prediction results with documents. Print out best n document. (Same as in IMDB_SkLearn)

In [35]:
len(prediction_result), X_train_vector.shape

(25000, (25000, 28237))

In [36]:
type(prediction_result)

numpy.ndarray

Turns out np.column_stack cannot be used, since we want to keep the pair
relationship between the vector and its prediction results.
Stacking of them will result adding prediction into vector as the last
element..

In [37]:
# This does the job, but find a better way to zip two numpy arrays.
vc_pred_pair = list(zip(X_train_vector.toarray(), prediction_result))

#np.column_stack((X_train_vector.toarray(), prediction_result))

In [38]:
vc_pred_pair

[(array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0.47845063], dtype=float32)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0.836907], dtype=float32)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0.9691989], dtype=float32)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0.01076878], dtype=float32)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0.12572308], dtype=float32)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0.15509939], dtype=float32)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0.7812954], dtype=float32)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0.93552536], dtype=float32)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0.9683479], dtype=float32)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0.864189], dtype=float32)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0.68588114], dtype=float32)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0.666

In [39]:
vc_pred_pair_sorted = sorted(vc_pred_pair, key= lambda x:x[1], reverse=True)

#### All in one

In [40]:
import keras
import numpy as np
from keras.layers import Input, Dense
from keras.activations import sigmoid,relu
from keras.models import Model
from sklearn.feature_extraction.text import CountVectorizer

# X_train_vector , y_train, X_test_vector , y_test = imdb_functions.load_and_vectorize()
# import load_imdb() to your notebook
path = r"/Users/ekremguzelyel/Desktop/Assignments/Research/aclImdb"
X_train_corpus , y_train, X_test_corpus , y_test = load_imdb(path)

token = r"(?u)\b[\w\'/]+\b"
vectorizer = CountVectorizer(token_pattern=token, min_df=5, stop_words=["the","a","of","and","br","to"])
X_train_vector = vectorizer.fit_transform(X_train_corpus)
X_test_vector = vectorizer.transform(X_test_corpus)

# Create tensor/model background.
input_nodes = Input(shape=(X_train_vector.shape[1],))
x = Dense(64, activation='relu', kernel_initializer='random_uniform')(input_nodes)
# x = Dense(64, activation='relu')(x)
output_nodes = Dense(1, activation='sigmoid')(x) ##  (input_nodes) if there's no hidden layer

# Create the model with layers specified above.
model = Model(inputs=input_nodes, outputs=output_nodes)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(X_train_vector, y_train, epochs=2)  # starts training

loss, accuracy = model.evaluate(X_test_vector, y_test)
print(loss, accuracy)
prediction_result= model.predict(X_test_vector, batch_size=None)

Loading the imdb data
Train Data loaded.
Test Data loaded.
Epoch 1/2
Epoch 2/2
0.3504314879465103 0.86964


In [41]:
loss, accuracy

(0.3504314879465103, 0.86964)