<a href="https://colab.research.google.com/github/CianOSull/AutoML_With_SA_FYP_2021/blob/H2O/MainNotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generic Notebook for running all the libraries

How this notebook works is that it contains the code
for loading and cleaning the dataset.

Then there is multiple branches created on the
Github that include the code for running each library.

E.g. MLBox branch has the code for running MLBox.

# CURRENT BRANCH: H2O

# Install Library

In [1]:
# Insert any install comamnds in this cell
!apt-get install default-jre
!java -version
!pip install h2o

Reading package lists... Done
Building dependency tree       
Reading state information... Done
default-jre is already the newest version (2:1.11-68ubuntu1~18.04.1).
default-jre set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 13 not upgraded.
openjdk version "11.0.10" 2021-01-19
OpenJDK Runtime Environment (build 11.0.10+9-Ubuntu-0ubuntu1.18.04)
OpenJDK 64-Bit Server VM (build 11.0.10+9-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
Collecting h2o
[?25l  Downloading https://files.pythonhosted.org/packages/d4/d4/5c07504a392e94786e7cf33554d961ac4b2863aa22a07b8579940ea1f6b5/h2o-3.32.0.4.tar.gz (164.6MB)
[K     |████████████████████████████████| 164.6MB 55kB/s 
Collecting colorama>=0.3.8
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o:

# Preprocessing Section

In [2]:
# Import the necessary modules for cleaning
import math
import time 
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Create the set of stopwords for cleaning text
stopwords = set(w.rstrip() for w in open('/content/drive/MyDrive/CIT/FYP/ImplementationFiles/stopwords.txt'))

In [4]:
# Download the necessary parts for the NLTK module
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [5]:
# This funciton handles celaning text
def clean_text(text):
    # Create the lemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    
    # Get rid of non alpha characters except "'" as it is needed for the lemment
    text = "".join(c for c in text if c.isalnum() or c == " " or "'")
    
    # Get rid of capitals
    text = text.lower()
    
    # Tokenize the words    
    # Create tokens of each word
    token_text = word_tokenize(text)
    
    # Get rid of any piece of text that isn't over 2 characters
    token_text = [t for t in token_text if len(t) > 2] 
    
    # Put words in base form by doing lemmatization
    token_text = [wordnet_lemmatizer.lemmatize(t) for t in token_text]

    # Remove stopwords
    token_text = [t for t in token_text if t not in stopwords]
    
    # Return the tokens
    return token_text

In [6]:
# This function will get the term frequencies for word in the review
# TF = Term I frequency in document/total words in document
def calc_tf(term_count, review_corpus):
    # A dictionary of all the term frequencies found
    tf_freq = dict.fromkeys(term_count.keys(), 0)   
    
    # Review corpus is a tokenized list so the total words iteh length
    total_words = len(review_corpus)
    
    # Calculate the term frequency for each word
    for word, count in term_count.items():
        tf_freq[word] = count/total_words
        
    return tf_freq

In [7]:
# This calcualtes the idf
# IDF = log(2)*(Total number of Documents/documents frequency or documents with term)
def calc_idf(unique_terms, list_doc_terms):   
    # A dicitonary of all the inverse document frequencies
    idf = dict.fromkeys(unique_terms, 0)
    
    # Basically list_doc_terms has all the documents with the term count for each word
    # You go through each document count the terms where they occured
    for doc_terms in list_doc_terms:  
        # This for loop is counting the amount of document a word was in
        for word, value in doc_terms.items():
            if 0 < value:
                idf[word] += 1
        
    # Now we calculate idf
    for word, value in idf.items():
        idf[word] = math.log10(10 / float(value))
    
    return idf

In [8]:
# Modified this function to return a list as dictionaries arn't needed anymore
def calc_tf_idf(tf, idf, n_terms):
    # Create an array that is of length of the number of unique terms
    tf_idf_array = np.zeros(n_terms)
    
    for index, (word, value) in enumerate(tf.items()):
        # Add the tfidf to the array
        tf_idf_array[index] = value*idf[word]
    
    return tf_idf_array

In [9]:
def process_text(text_data):
    # A list of all the cleaned reviews
    doc_list = []
    
    # List of all the unique terms
    unique_terms = []
    
    # A list of all the term frequencies
    tf_list = []
    
    for review in text_data:
        # First clean the review
        clean_review = clean_text(review)
        
        # Keeps track of the term counts for each word
        count_dict = {}
        
        # Now lets find the total count for each word
        for token in clean_review:
            if token not in count_dict:
                count_dict[token] = 1
            else:
                count_dict[token] += 1
        
        # Caclulate the term frequencies for each document
        tf_list.append(calc_tf(count_dict, clean_review))
        
        # Then add the dictionary of counts for each document to the list
        doc_list.append(count_dict)
        
        # Then add the new unique terms
        unique_terms = set(unique_terms).union(set(clean_review))
    
    # Calculate the inverse document frequency value
    idf = calc_idf(unique_terms, doc_list)
    
    # This array will contain the tfidf values for each term in each review
    tfidf_values = np.zeros((len(tf_list), len(unique_terms)))
    
    # Now we can get the TFIDF for each document
    for index, term_freq in enumerate(tf_list):
        # This will return an array of the tfidf values calculated.
        # The length of the unique terms list is passed in so that the 
        # Array that is returned matches the tfidf array
        tf_idf_array = calc_tf_idf(term_freq, idf, len(unique_terms))
        # Add this to the overall tfidf values calculated
        tfidf_values[index,:] = tf_idf_array
    
    return tfidf_values

In [10]:
# Prepare the data
def prepare_data(num):
    print("="*50)

    # Load the dataset
    # imdb_df = pd.read_csv("IMDB Dataset.csv")
    imdb_df = pd.read_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/IMDB_Dataset.csv")
    print("Dataset loaded")
    print("="*50)

    
     # Change each positive and negative value to 1 and 0 respectively    
    # imdb_df['sentiment'] = imdb_df['sentiment'].map({'negative' : 0, 'positive' : 1})
    
    # For testing, a much smaller dataset is going to be used
    # imdb_df = imdb_df.head(25000)

    # Group all the negative reviews and get the first 2500
    # imdb_df_neg = (imdb_df[imdb_df['sentiment'] == 0])[0:num]
    imdb_df_neg = (imdb_df[imdb_df['sentiment'] == "negative"])[0:num]
    
    # Group all the positive and get the first 2500
    # imdb_df_pos = imdb_df[imdb_df['sentiment'] == 1][0:num]
    imdb_df_pos = imdb_df[imdb_df['sentiment'] == "positive"][0:num]
    
    # Combine the two split positives and negatives into one dataframe
    imdb_df = pd.concat([imdb_df_neg, imdb_df_pos]) 
    # print(test_df)
    
    # .values on a column of a dataframe returns a numpy array
    # This is a numpy array of all the reviews
    # initial_reviews = imdb_df['review'].values
    initial_reviews = imdb_df['review'].values
    
    # This is a numpy array of all the positive and negativelabels
    # labels = imdb_df['sentiment'].values
    labels = imdb_df['sentiment'].values
    
    print("Creating Feature Vector")
    print("="*50)
    start = time.time()
    # Process the text data and create teh feature vector
    feature_vector = process_text(initial_reviews)
    end = time.time()
    print("Feature Vector Created")
    print(len(feature_vector))
    print(f"Execution time is {end - start} secs")
    print("="*50)
    
    # Shuffle the labesl and feature vector using sklearn shuffle
    feature_vector, labels = shuffle(feature_vector, labels)
    
    return feature_vector, labels

    # # Creating train and test data
    # # The splits will be 80:20 
    # no_samples = 0.8
    
    # # This gets the percentage of indexes from feature vector and uses those for training
    # X_train = feature_vector[0:int(no_samples*len(feature_vector))]
    # y_train = labels[0:int(no_samples*len(labels))]
    
    # # Go from the index that was used for training to the final
    # X_test = feature_vector[int(no_samples*len(feature_vector)):len(feature_vector)]
    # y_test = labels[int(no_samples*len(labels)):len(labels)]

    # return X_train, y_train, X_test, y_test

# Create Model Section

**Documentation on manipulating data for h2o**

http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-munging.html

http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html#automl-interface

# Youtube tutorial
https://github.com/srivatsan88/YouTubeLI/blob/master/H2O_AutoML.ipynb

**BIG NOTE**

It seems because of how much H2O automates everything, it only takes in files. Thus a possible solution for running it on the tf idf values is to make a pandas dataframe using the numpy values, then convert it to a csv file and then feed it into the h2o.

There is also h2o.sklearn which allows h2o to work with sklearn and does allow it to work with numpy so maybe test that out.

In [11]:
# Start up h2o
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.10" 2021-01-19; OpenJDK Runtime Environment (build 11.0.10+9-Ubuntu-0ubuntu1.18.04); OpenJDK 64-Bit Server VM (build 11.0.10+9-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpf8w6i7_t
  JVM stdout: /tmp/tmpf8w6i7_t/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpf8w6i7_t/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.4
H2O_cluster_version_age:,23 days
H2O_cluster_name:,H2O_from_python_unknownUser_j3xh90
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.180 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [12]:
# # Load the dataset
# # imdb_df = pd.read_csv("IMDB Dataset.csv")
# imdb_df = pd.read_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/IMDB_Dataset.csv")
# print("Dataset loaded")
# print("="*50)

# # Change each positive and negative value to 1 and 0 respectively    
# # imdb_df['sentiment'] = imdb_df['sentiment'].map({'negative' : 0, 'positive' : 1})

# # For testing, a much smaller dataset is going to be used
# # imdb_df = imdb_df.head(25000)

# # Group all the negative reviews and get the first 2500
# # imdb_df_neg = (imdb_df[imdb_df['sentiment'] == 0])[0:100]
# imdb_df_neg = (imdb_df[imdb_df['sentiment'] == "negative"])[0:100]

# # Group all the positive and get the first 2500
# # imdb_df_pos = imdb_df[imdb_df['sentiment'] == 1][0:100]
# imdb_df_pos = imdb_df[imdb_df['sentiment'] == "positive"][0:100]

# # Combine the two split positives and negatives into one dataframe
# imdb_df = pd.concat([imdb_df_neg, imdb_df_pos]) 

# print(len(imdb_df))
# print("="*50)
# # Shuffle dataframe
# print(len(imdb_df.sample(frac=1)))

# print(imdb_df.columns.to_list())
# print(type(imdb_df.columns.to_list()))
# h2o_imdb_df = h2o.H2OFrame(imdb_df)
# h2o_imdb_df.describe()

In [13]:
# Create a train, test and validation split
# This will create a train split of 70% and test and validation split of 15% each
# imdb_train, imdb_test, imdb_valid = h2o_imdb_df.split_frame(ratios=[0.7, 0.15])

In [14]:
# X_train, y_train, X_test, y_test = prepare_data()
# num = 100 worked
feature_vector, labels = prepare_data(500)

# Convert feature_vectors into a pandas dataframe of 
# term frequency inverse document frequency of each word
tfidf_tf = pd.DataFrame(feature_vector)

# Add the labels
tfidf_tf['labels'] = labels

#==========================================
# Insert the code for running the libraries in here
h2o_tfidf = h2o.H2OFrame(tfidf_tf)
# Set labels to be a categorical field
h2o_tfidf['labels'] = h2o_tfidf['labels'].asfactor()
# print(h2o_tfidf.head())

Dataset loaded
Creating Feature Vector
Feature Vector Created
1000
Execution time is 5.413161039352417 secs
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [15]:
y = "labels"
x = h2o_tfidf.columns
x.remove(y)

h2o_tfidf_train, h2o_tfidf_test, h2o_tfidf_valid = h2o_tfidf.split_frame(ratios=[0.7, 0.15])

# exceptions_split = []

# for i in range(3):
#   # if (!success):
#   try:   
#     h2o_tfidf_train, h2o_tfidf_test, h2o_tfidf_valid = h2o_tfidf.split_frame(ratios=[0.7, 0.15])

#     # Break is here because if it doesnt finish with an error, the models are done    
#     break
#     # success = True
#   except Exception as e:
#     # success = False
#     exceptions.append(e)

# print(len(exceptions_model))

print("Splits created")

Splits created


In [16]:
from h2o.automl import H2OAutoML

# This is using 10 max models, so it will only create 10
# You can also tell it to not try to use stacked ensemble.
# In this initial test stacked ensemble will be removed to keep it simple
# though deep learning will be allowed since it is good on text data
# You can also set timeboxes for how long models can run
# Verbosity sets whetehr it gives out information while running
# nfolds is for validation, by default it is set to 5 and will create validation splits
# set nfolds to 0 if you have created your own validation split
# Model names go like this h2o_datasize_MaxModels_Time_Seed
h2o_model = H2OAutoML(max_models = 10, seed = 5, exclude_algos = ["StackedEnsemble", "DeepLearning"], verbosity="info", nfolds=0, project_name="h2o_1000_10_na_5")

In [18]:
exceptions_model = []

# success = False

# The server seems to fail sometiems but saw a stack overflow which I think
# said that by running it 3 times, it just works
# Tried it once and now and succeded but then ran again so set it to only run once
for i in range(3):
  # if (!success):
  try:   
    h2o_model.train(x = x, y = y, training_frame = h2o_tfidf_train, validation_frame=h2o_tfidf_valid)

    # Break is here because if it doesnt finish with an error, the models are done    
    break
    # success = True
  except Exception as e:
    # success = False
    exceptions_model.append(e)

print(len(exceptions_model))
# print(exceptions_model[0])

AutoML progress: |
22:18:44.533: Project: h2o_1000_10_na_5
22:18:44.537: Cross-validation disabled by user: no fold column nor nfolds > 1.
22:18:44.557: Setting stopping tolerance adaptively based on the training frame: 0.037582301400141446
22:18:44.558: Build control seed: 5
22:18:44.558: Since cross-validation is disabled, and no leaderboard frame was provided, automatically split the training data into training and leaderboard frames in the ratio 90/10
22:19:00.463: training frame: Frame key: automl_training_py_3_sid_8a30    cols: 18198    rows: 631  chunks: 18    size: 26768161  checksum: 343775478583807757
22:19:00.530: validation frame: Frame key: py_5_sid_8a30    cols: 18198    rows: 141  chunks: 18    size: 26320998  checksum: 4702055585537513231
22:19:03.355: leaderboard frame: Frame key: automl_leaderboard_py_3_sid_8a30    cols: 18198    rows: 77  chunks: 18    size: 26271768  checksum: 3592934798136410200
22:19:03.355: blending frame: NULL
22:19:03.355: response column: labe

In [20]:
lb = h2o_model.leaderboard
  
lb.head()

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
XGBoost_2_AutoML_20210224_221844,0.573187,0.775198,0.581637,0.452804,0.523931,0.274503
XGBoost_2_AutoML_20210224_222031,0.573187,0.775198,0.581637,0.452804,0.523931,0.274503
XGBoost_1_AutoML_20210224_222031,0.550616,0.804332,0.582954,0.402531,0.532957,0.284043
XGBoost_1_AutoML_20210224_221844,0.550616,0.804332,0.582954,0.402531,0.532957,0.284043
GBM_1_AutoML_20210224_221844,0.52052,0.713749,0.584197,0.455882,0.510511,0.260621
GBM_1_AutoML_20210224_222031,0.52052,0.713749,0.584197,0.455882,0.510511,0.260621
GBM_3_AutoML_20210224_221844,0.519836,0.726438,0.536688,0.423393,0.515188,0.265419
GBM_3_AutoML_20210224_222031,0.519836,0.726438,0.536688,0.423393,0.515188,0.265419
GBM_5_AutoML_20210224_221844,0.5171,0.70004,0.547834,0.482216,0.503217,0.253227
GBM_5_AutoML_20210224_222031,0.5171,0.70004,0.547834,0.482216,0.503217,0.253227




In [22]:
# Using the best model make predictions
h2o_tfidf_pred = h2o_model.leader.predict(h2o_tfidf_test)

xgboost prediction progress: |████████████████████████████████████████████| 100%


In [23]:
# First 10 predictions
h2o_tfidf_pred.head()
# Shows the probabilities for each one

predict,negative,positive
positive,0.871038,0.128962
positive,0.81946,0.18054
positive,0.531022,0.468978
positive,0.57719,0.42281
positive,0.727484,0.272516
positive,0.733237,0.266763
positive,0.352137,0.647863
positive,0.708516,0.291484
positive,0.501457,0.498543
positive,0.396234,0.603766




In [24]:
# This is a performance report
h2o_model.leader.model_performance(h2o_tfidf_test)


ModelMetricsBinomial: xgboost
** Reported on test data. **

MSE: 0.31978510463995274
RMSE: 0.565495450591738
LogLoss: 0.8671292979918205
Mean Per-Class Error: 0.45664534470504625
AUC: 0.4566453447050462
AUCPR: 0.47080266284168487
Gini: -0.0867093105899076

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.056861039251089096: 


Unnamed: 0,Unnamed: 1,negative,positive,Error,Rate
0,negative,1.0,83.0,0.9881,(83.0/84.0)
1,positive,0.0,67.0,0.0,(0.0/67.0)
2,Total,1.0,150.0,0.5497,(83.0/151.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.056861,0.617512,149.0
1,max f2,0.056861,0.801435,149.0
2,max f0point5,0.056861,0.502249,149.0
3,max accuracy,0.857004,0.589404,12.0
4,max precision,0.916136,1.0,0.0
5,max recall,0.056861,1.0,149.0
6,max specificity,0.916136,1.0,0.0
7,max absolute_mcc,0.497644,0.209723,82.0
8,max min_per_class_accuracy,0.507783,0.404762,77.0
9,max mean_per_class_accuracy,0.857004,0.543355,12.0



Gains/Lift Table: Avg response rate: 44.37 %, avg score: 51.79 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.013245,0.91068,2.253731,2.253731,1.0,0.913557,1.0,0.913557,0.029851,0.029851,125.373134,125.373134,0.029851
1,2,0.02649,0.909937,1.126866,1.690299,0.5,0.910159,0.75,0.911858,0.014925,0.044776,12.686567,69.029851,0.032871
2,3,0.033113,0.903634,0.0,1.352239,0.0,0.905928,0.6,0.910672,0.0,0.044776,-100.0,35.223881,0.020967
3,4,0.046358,0.898739,1.126866,1.287846,0.5,0.900039,0.571429,0.907634,0.014925,0.059701,12.686567,28.784648,0.023987
4,5,0.05298,0.892521,2.253731,1.408582,1.0,0.896967,0.625,0.906301,0.014925,0.074627,125.373134,40.858209,0.038913
5,6,0.10596,0.82525,1.408582,1.408582,0.625,0.85927,0.625,0.882785,0.074627,0.149254,40.858209,40.858209,0.077825
6,7,0.152318,0.790667,0.643923,1.17586,0.285714,0.800652,0.521739,0.857788,0.029851,0.179104,-35.607676,17.585983,0.048152
7,8,0.205298,0.743851,0.845149,1.090515,0.375,0.761889,0.483871,0.83304,0.044776,0.223881,-15.485075,9.051517,0.033404
8,9,0.304636,0.651912,0.751244,0.979883,0.333333,0.695316,0.434783,0.78813,0.074627,0.298507,-24.875622,-2.011681,-0.011016
9,10,0.403974,0.600331,0.450746,0.849768,0.2,0.625274,0.377049,0.748083,0.044776,0.343284,-54.925373,-15.023244,-0.109097







In [26]:
best_model = h2o.get_model(h2o_model.leader.model_id)
best_model.confusion_matrix()


Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.46533888578414917: 


Unnamed: 0,Unnamed: 1,negative,positive,Error,Rate
0,negative,266.0,46.0,0.1474,(46.0/312.0)
1,positive,27.0,292.0,0.0846,(27.0/319.0)
2,Total,293.0,338.0,0.1157,(73.0/631.0)




In [27]:
# This outputs the model as a mojo file which is meant to be a binary object 
# that can work in other languages
h2o_model.leader.download_mojo(path = "/content/drive/MyDrive/CIT/FYP/ImplementationFiles/ExportedModels/H2O")

'/content/drive/MyDrive/CIT/FYP/ImplementationFiles/ExportedModels/H2O/XGBoost_2_AutoML_20210224_221844.zip'

In [None]:

# def main():
#   # X_train, y_train, X_test, y_test = prepare_data()
#   feature_vector, labels = prepare_data()

#   # Convert feature_vectors into a pandas dataframe of 
#   # term frequency inverse document frequency of each word
#   tfidf_tf = pd.DataFrame(feature_vector)
  
#   # Add the labels
#   tfidf_tf['labels'] = labels

#   #==========================================
#   # Insert the code for running the libraries in here
#   h2o_tfidf = h2o.H2OFrame(tfidf_tf)
#   # print(h2o_tfidf.head())

#   y = "labels"
#   x = h2o_tfidf.columns
#   x.remove(y)

#   h2o_tfidf_train, h2o_tfidf_test, h2o_tfidf_valid = h2o_tfidf.split_frame(ratios=[0.7, 0.15])
 
#   from h2o.automl import H2OAutoML
#   aml = H2OAutoML(max_models = 10, seed = 10, exclude_algos = ["StackedEnsemble", "DeepLearning"], verbosity="info", nfolds=0)
  
#   aml.train(x = x, y = y, training_frame = h2o_tfidf_train, validation_frame=h2o_tfidf_valid)

#   lb = aml.leaderboard
  
#   lb.head()


#   # Train the automl, x sets the training columns, y is the y columns
#   # automodel.train(x = x, y = y, training_frame = imdb_train, validation_frame=imdb_valid)
#   # aml = H2OAutoML(max_models=20, seed=1)
  

#   #==========================================

# main()