<a href="https://colab.research.google.com/github/CianOSull/AutoML_With_SA_FYP_2021/blob/AutoKeras/MainNotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generic Notebook for running all the libraries

How this notebook works is that it contains the code
for loading and cleaning the dataset.

Then there is multiple branches created on the
Github that include the code for running each library.

E.g. MLBox branch has the code for running MLBox.

# CURRENT BRANCH = AUTOKERAS

# Install the necessary library
Run the install code in the code cell below.

In [1]:
# Insert any install comamnds in this cell
!pip3 install autokeras

Collecting autokeras
[?25l  Downloading https://files.pythonhosted.org/packages/09/12/cf698586ccc8245f08d1843dcafb65b064a2e9e2923b889dc58e1019f099/autokeras-1.0.12-py3-none-any.whl (164kB)
[K     |██                              | 10kB 13.7MB/s eta 0:00:01[K     |████                            | 20kB 13.9MB/s eta 0:00:01[K     |██████                          | 30kB 9.8MB/s eta 0:00:01[K     |████████                        | 40kB 8.6MB/s eta 0:00:01[K     |██████████                      | 51kB 5.5MB/s eta 0:00:01[K     |████████████                    | 61kB 5.7MB/s eta 0:00:01[K     |██████████████                  | 71kB 6.1MB/s eta 0:00:01[K     |████████████████                | 81kB 6.5MB/s eta 0:00:01[K     |██████████████████              | 92kB 5.8MB/s eta 0:00:01[K     |████████████████████            | 102kB 5.3MB/s eta 0:00:01[K     |██████████████████████          | 112kB 5.3MB/s eta 0:00:01[K     |████████████████████████        | 122kB 5.3MB/s 

# Preprocessing Section

In [2]:
# Import the necessary modules for cleaning
import math
import time 
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Create the set of stopwords for cleaning text
stopwords = set(w.rstrip() for w in open('/content/drive/MyDrive/CIT/FYP/ImplementationFiles/stopwords.txt'))

In [4]:
# Download the necessary parts for the NLTK module
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [5]:
# This funciton handles celaning text
def clean_text(text):
    # Create the lemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    
    # Get rid of non alpha characters except "'" as it is needed for the lemment
    text = "".join(c for c in text if c.isalnum() or c == " " or "'")
    
    # Get rid of capitals
    text = text.lower()
    
    # Tokenize the words    
    # Create tokens of each word
    token_text = word_tokenize(text)
    
    # Get rid of any piece of text that isn't over 2 characters
    token_text = [t for t in token_text if len(t) > 2] 
    
    # Put words in base form by doing lemmatization
    token_text = [wordnet_lemmatizer.lemmatize(t) for t in token_text]

    # Remove stopwords
    token_text = [t for t in token_text if t not in stopwords]
    
    # Return the tokens
    return token_text

In [6]:
# This function will get the term frequencies for word in the review
# TF = Term I frequency in document/total words in document
def calc_tf(term_count, review_corpus):
    # A dictionary of all the term frequencies found
    tf_freq = dict.fromkeys(term_count.keys(), 0)   
    
    # Review corpus is a tokenized list so the total words iteh length
    total_words = len(review_corpus)
    
    # Calculate the term frequency for each word
    for word, count in term_count.items():
        tf_freq[word] = count/total_words
        
    return tf_freq

In [7]:
# This calcualtes the idf
# IDF = log(2)*(Total number of Documents/documents frequency or documents with term)
def calc_idf(unique_terms, list_doc_terms):   
    # A dicitonary of all the inverse document frequencies
    idf = dict.fromkeys(unique_terms, 0)
    
    # Basically list_doc_terms has all the documents with the term count for each word
    # You go through each document count the terms where they occured
    for doc_terms in list_doc_terms:  
        # This for loop is counting the amount of document a word was in
        for word, value in doc_terms.items():
            if 0 < value:
                idf[word] += 1
        
    # Now we calculate idf
    for word, value in idf.items():
        idf[word] = math.log10(10 / float(value))
    
    return idf

In [8]:
# Modified this function to return a list as dictionaries arn't needed anymore
def calc_tf_idf(tf, idf, n_terms):
    # Create an array that is of length of the number of unique terms
    tf_idf_array = np.zeros(n_terms)
    
    for index, (word, value) in enumerate(tf.items()):
        # Add the tfidf to the array
        tf_idf_array[index] = value*idf[word]
    
    return tf_idf_array

In [9]:
def process_text(text_data):
    # A list of all the cleaned reviews
    doc_list = []
    
    # List of all the unique terms
    unique_terms = []
    
    # A list of all the term frequencies
    tf_list = []
    
    for review in text_data:
        # First clean the review
        clean_review = clean_text(review)
        
        # Keeps track of the term counts for each word
        count_dict = {}
        
        # Now lets find the total count for each word
        for token in clean_review:
            if token not in count_dict:
                count_dict[token] = 1
            else:
                count_dict[token] += 1
        
        # Caclulate the term frequencies for each document
        tf_list.append(calc_tf(count_dict, clean_review))
        
        # Then add the dictionary of counts for each document to the list
        doc_list.append(count_dict)
        
        # Then add the new unique terms
        unique_terms = set(unique_terms).union(set(clean_review))
    
    # Calculate the inverse document frequency value
    idf = calc_idf(unique_terms, doc_list)
    
    # This array will contain the tfidf values for each term in each review
    tfidf_values = np.zeros((len(tf_list), len(unique_terms)))
    
    # Now we can get the TFIDF for each document
    for index, term_freq in enumerate(tf_list):
        # This will return an array of the tfidf values calculated.
        # The length of the unique terms list is passed in so that the 
        # Array that is returned matches the tfidf array
        tf_idf_array = calc_tf_idf(term_freq, idf, len(unique_terms))
        # Add this to the overall tfidf values calculated
        tfidf_values[index,:] = tf_idf_array
    
    return tfidf_values

In [10]:
# Prepare the data
def prepare_data():
    print("="*50)

    # Load the dataset
    # imdb_df = pd.read_csv("IMDB Dataset.csv")
    imdb_df = pd.read_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/IMDB_Dataset.csv")
    print("Dataset loaded")
    print("="*50)

    
    # Change each positive and negative value to 1 and 0 respectively    
    imdb_df['sentiment'] = imdb_df['sentiment'].map({'negative' : 0, 'positive' : 1})
    
    # For testing, a much smaller dataset is going to be used
    imdb_df = imdb_df.head(5000)

    # Group all the negative reviews and get the first 2500
    imdb_df_neg = (imdb_df[imdb_df['sentiment'] == 0])[0:2500]
    # Group all the positive and get the first 2500
    imdb_df_pos = imdb_df[imdb_df['sentiment'] == 1]
    
    test_df = pd.concat([imdb_df_neg, imdb_df_pos]) 
    # print(test_df)
    
    # .values on a column of a dataframe returns a numpy array
    # This is a numpy array of all the reviews
    # initial_reviews = imdb_df['review'].values
    initial_reviews = test_df['review'].values
    
    # This is a numpy array of all the positive and negativelabels
    # labels = imdb_df['sentiment'].values
    labels = test_df['sentiment'].values
    
    print("Creating Feature Vector")
    print("="*50)
    start = time.time()
    # Process the text data and create teh feature vector
    feature_vector = process_text(initial_reviews)
    end = time.time()
    print("Feature Vector Created")
    print(f"Execution time is {end - start} secs")
    print("="*50)
    
    # Shuffle the labesl and feature vector using sklearn shuffle
    feature_vector, labels = shuffle(feature_vector, labels)
    
    # Creating train and test data
    # The splits will be 80:20 
    no_samples = 0.8
    
    # This gets the percentage of indexes from feature vector and uses those for training
    X_train = feature_vector[0:int(no_samples*len(feature_vector))]
    y_train = labels[0:int(no_samples*len(labels))]
    
    # Go from the index that was used for training to the final
    X_test = feature_vector[int(no_samples*len(feature_vector)):len(feature_vector)]
    y_test = labels[int(no_samples*len(labels)):len(labels)]

    return X_train, y_train, X_test, y_test

# Create Model Section 

**Exporting models:**

https://autokeras.com/tutorial/export/

Auto Keras has a built in text classifier that takes in strings so going to test using that?



In [11]:
import autokeras as ak

In [12]:
# Load the dataset
imdb_df = pd.read_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/IMDB_Dataset.csv")
# print(imdb_df.head())

# For testing, a much smaller dataset is going to be used
# imdb_df = imdb_df.head(4000)
# print(imdb_df.head())

# Group all the negative reviews and get the first 25000
imdb_df_neg = (imdb_df[imdb_df['sentiment'] == "negative"])[0:2500]
# Group all the positive and get the first 25000
imdb_df_pos = imdb_df[imdb_df['sentiment'] == "positive"][0:2500]

# Combine the two split positives and negatives into one dataframe
imdb_df = pd.concat([imdb_df_neg, imdb_df_pos]) 

# .values on a column of a dataframe returns a numpy array
# This is a numpy array of all the reviews
# initial_reviews = imdb_df['review'].values
feature_vector = imdb_df['review'].values
# print(feature_vector)

# This is a numpy array of all the positive and negativelabels
labels = imdb_df['sentiment'].values

# Shuffle the labesl and feature vector using sklearn shuffle
feature_vector, labels = shuffle(feature_vector, labels)

# Creating train and test data
# The splits will be 80:20 
no_samples = 0.8

# This gets the percentage of indexes from feature vector and uses those for training
X_train = feature_vector[0:int(no_samples*len(feature_vector))]
y_train = labels[0:int(no_samples*len(labels))]

# Go from the index that was used for training to the final
X_test = feature_vector[int(no_samples*len(feature_vector)):len(feature_vector)]
y_test = labels[int(no_samples*len(labels)):len(labels)]

print(X_train.shape)
print(type(X_train))

(4000,)
<class 'numpy.ndarray'>


In [None]:
# Create the text classifier
# It is only going to do 1 model but change max trials for more
# Has a built in project name, default is text_classifier
# Trials is the amount of keras models to use, default is 1000 and might stop before reaching that number
# Directory sets where search outputs will be saved
# Objective is what it needs to prioritise, default is val_loss but val_accuracy can be used
# Naming convention = ak_epochs_dataNo_trials
clf = ak.TextClassifier(overwrite=True, project_name="ak_100_5000_2", max_trials=2, directory="/content/drive/MyDrive/CIT/FYP/ImplementationFiles/ExportedModels/AutoKeras/SearchOuputs", objective="val_loss")

In [None]:
# Call the fit function and train the models for only 2 epochs
# Leave empty for adaptive epochs
# By default, auto keras will use the last 20% of training data for validation
# Setting validation to 0.2 in this case to just to demonstrate
print("Creating Fit the model")
print("="*50)
start = time.time()
# Default epochs is 1000
clf.fit(X_train, y_train, validation_split=0.2, epochs=100)
end = time.time()
print("Model Fitting Finished")
print(f"Execution time is {end - start} secs")
print("="*50)

In [None]:
# Create some predictions
predicted_y = clf.predict(X_test)

# Evaluate the model used
print(clf.evaluate(X_test, y_test))

In [None]:
best_model = clf.export_model()

# Naming convenction = ModelAutoKeras_epochs_dataNo_trials
try:
    best_model.save("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/ExportedModels/AutoKeras/Models/mak_100_5000_2", save_format="tf")
except:
    best_model.save("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/ExportedModels/AutoKeras/Models/mak_100_5000_2.h5")

In [None]:
# Test loading a model
import tensorflow as tf
import tensorflow.keras

In [29]:
loaded_model = load_model("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/ExportedModels/AutoKeras/Models/mak_100_5000_2", custom_objects=ak.CUSTOM_OBJECTS)



In [None]:
predicted_y = loaded_model.predict(tf.expand_dims(X_test, -1))
print(predicted_y[:5])

In [31]:
print(loaded_model.predict(X_test)[:5])

[[1.0000000e+00]
 [4.3246618e-20]
 [3.5085404e-10]
 [1.0000000e+00]
 [2.4857304e-11]]


In [40]:
for i, v in enumerate(y_test):
  if y_test[i] == "positive":
    y_test[i] = 1.0
  else:
    y_test[i] = 0.0

In [41]:
print(len(X_test))
print(type(y_test))
print(y_test[:5])
score = loaded_model.evaluate(X_test, y_test, verbose=0)

1000
<class 'numpy.ndarray'>
[0.0 0.0 0.0 0.0 0.0]


ValueError: ignored

# Old Funciton Section

Now split into individual cells for ram testing

In [None]:

def main():
  # Since this is deep learning this part isn't needed
  # X_train, y_train, X_test, y_test = prepare_data()

  # print(len(X_train))

  #==========================================
  # Insert the code for running the libraries in here
  import autokeras as ak

  # Load the dataset
  imdb_df = pd.read_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/IMDB_Dataset.csv")
  # print(imdb_df.head())

  # For testing, a much smaller dataset is going to be used
  # imdb_df = imdb_df.head(4000)
  # print(imdb_df.head())

  # Group all the negative reviews and get the first 25000
  imdb_df_neg = (imdb_df[imdb_df['sentiment'] == "negative"])[0:2500]
  # Group all the positive and get the first 25000
  imdb_df_pos = imdb_df[imdb_df['sentiment'] == "positive"][0:2500]
  
  # Combine the two split positives and negatives into one dataframe
  imdb_df = pd.concat([imdb_df_neg, imdb_df_pos]) 

  # .values on a column of a dataframe returns a numpy array
  # This is a numpy array of all the reviews
  # initial_reviews = imdb_df['review'].values
  feature_vector = imdb_df['review'].values
  # print(feature_vector)

  # This is a numpy array of all the positive and negativelabels
  labels = imdb_df['sentiment'].values

  # Shuffle the labesl and feature vector using sklearn shuffle
  feature_vector, labels = shuffle(feature_vector, labels)

  # Creating train and test data
  # The splits will be 80:20 
  no_samples = 0.8

  # This gets the percentage of indexes from feature vector and uses those for training
  X_train = feature_vector[0:int(no_samples*len(feature_vector))]
  y_train = labels[0:int(no_samples*len(labels))]

  # Go from the index that was used for training to the final
  X_test = feature_vector[int(no_samples*len(feature_vector)):len(feature_vector)]
  y_test = labels[int(no_samples*len(labels)):len(labels)]

  print(X_train.shape)
  print(type(X_train))

  # Create the text classifier
  # It is only going to do 1 model but change max trials for more
  # Has a built in project name, default is text_classifier
  # Trials is the amount of keras models to use, default is 1000 and might stop before reaching that number
  # Directory sets where search outputs will be saved
  # Objective is what it needs to prioritise, default is val_loss but val_accuracy can be used
  # Naming convention = ak_epochs_dataNo_trials
  clf = ak.TextClassifier(overwrite=True, project_name="ak_100_5000_2", max_trials=2, directory="/content/drive/MyDrive/CIT/FYP/ImplementationFiles/ExportedModels/AutoKeras/SearchOuputs", objective="val_loss")

  # Call the fit function and train the models for only 2 epochs
  # Leave empty for adaptive epochs
  # By default, auto keras will use the last 20% of training data for validation
  # Setting validation to 0.2 in this case to just to demonstrate
  print("Creating Fit the model")
  print("="*50)
  start = time.time()
  # Default epochs is 1000
  clf.fit(X_train, y_train, validation_split=0.2, epochs=100)
  end = time.time()
  print("Model Fitting Finished")
  print(f"Execution time is {end - start} secs")
  print("="*50)

  # Create some predictions
  predicted_y = clf.predict(X_test)

  # Evaluate the model used
  print(clf.evaluate(X_test, y_test))

  # Save the model:
  best_model = clf.export_model()

  # Naming convenction = ModelAutoKeras_epochs_dataNo_trials
  try:
      best_model.save("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/ExportedModels/AutoKeras/Models/mak_100_5000_2", save_format="tf")
  except:
      best_model.save("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/ExportedModels/AutoKeras/Models/mak_100_5000_2.h5")

  #==========================================

main()

Trial 2 Complete [00h 02m 06s]
val_loss: 0.4370107650756836

Best val_loss So Far: 0.3781603276729584
Total elapsed time: 00h 07m 23s
INFO:tensorflow:Oracle triggered exit
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch

First sucesssful output:
Trial 2 Complete [00h 18m 08s]
val_loss: 0.33026546239852905

Best val_loss So Far: 0.26305750012397766
Total elapsed time: 01h 01m 52s
INFO:tensorflow:Oracle triggered exit
Epoch 1/20
1250/1250 [==============================] - 257s 205ms/step - loss: 0.5032 - accuracy: 0.7166
Epoch 2/20
1250/1250 [==============================] - 258s 206ms/step - loss: 0.2453 - accuracy: 0.8989
Epoch 3/20
1250/1250 [==============================] - 257s 206ms/step - loss: 0.1807 - accuracy: 0.9318
Epoch 4/20
1250/1250 [==============================] - 257s 206ms/step - loss: 0.1339 - accuracy: 0.9517
Epoch 5/20
1250/1250 [==============================] - 257s 206ms/step - loss: 0.1052 - accuracy: 0.9628
Epoch 6/20
1250/1250 [==============================] - 258s 206ms/step - loss: 0.0887 - accuracy: 0.9683
Epoch 7/20
1250/1250 [==============================] - 258s 206ms/step - loss: 0.0741 - accuracy: 0.9731
Epoch 8/20
1250/1250 [==============================] - 258s 206ms/step - loss: 0.0560 - accuracy: 0.9802
Epoch 9/20
1250/1250 [==============================] - 261s 208ms/step - loss: 0.0481 - accuracy: 0.9829
Epoch 10/20
1250/1250 [==============================] - 259s 207ms/step - loss: 0.0449 - accuracy: 0.9835
Epoch 11/20
1250/1250 [==============================] - 260s 208ms/step - loss: 0.0448 - accuracy: 0.9833
Epoch 12/20
1250/1250 [==============================] - 261s 209ms/step - loss: 0.0401 - accuracy: 0.9852
Epoch 13/20
1250/1250 [==============================] - 261s 209ms/step - loss: 0.0390 - accuracy: 0.9861
Epoch 14/20
1250/1250 [==============================] - 260s 208ms/step - loss: 0.0326 - accuracy: 0.9879
Epoch 15/20
1250/1250 [==============================] - 257s 206ms/step - loss: 0.0329 - accuracy: 0.9889
Epoch 16/20
1250/1250 [==============================] - 259s 207ms/step - loss: 0.0308 - accuracy: 0.9886
Epoch 17/20
1250/1250 [==============================] - 260s 208ms/step - loss: 0.0301 - accuracy: 0.9880
Epoch 18/20
1250/1250 [==============================] - 259s 207ms/step - loss: 0.0253 - accuracy: 0.9912
Epoch 19/20
1250/1250 [==============================] - 260s 208ms/step - loss: 0.0210 - accuracy: 0.9923
Epoch 20/20
1250/1250 [==============================] - 259s 207ms/step - loss: 0.0258 - accuracy: 0.9908
INFO:tensorflow:Assets written to: /content/drive/MyDrive/CIT/FYP/ImplementationFiles/ExportedModels/AutoKeras/SearchOuputs/ak_20_50000_2/best_model/assets
Model Fitting Finished
Execution time is 8901.862411737442 secs
==================================================
313/313 [==============================] - 18s 56ms/step - loss: 0.6138 - accuracy: 0.8907
[0.6138280630111694, 0.8906999826431274]

In [None]:
# import os
# import numpy as np
# import tensorflow as tf
# from tensorflow.keras.datasets import imdb
# from sklearn.datasets import load_files

# dataset = tf.keras.utils.get_file(
#     fname="aclImdb.tar.gz",
#     origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
#     extract=True,
# )

# # set path to dataset
# IMDB_DATADIR = os.path.join(os.path.dirname(dataset), 'aclImdb')

# classes = ['pos', 'neg']
# train_data = load_files(os.path.join(IMDB_DATADIR, 'train'), shuffle=True, categories=classes)
# test_data = load_files(os.path.join(IMDB_DATADIR,  'test'), shuffle=False, categories=classes)

# x_train = np.array(train_data.data)
# y_train = np.array(train_data.target)
# x_test = np.array(test_data.data)
# y_test = np.array(test_data.target)

# print(x_train.shape)  # (25000,)
# print(y_train.shape)  # (25000, 1)
# print(x_train[0][:50])  # this film was just brilliant casting


In [None]:
# print(train_data.data[0])