<a href="https://colab.research.google.com/github/CianOSull/AutoML_With_SA_FYP_2021/blob/MLBox/Copy_of_MLBox_MainNotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generic Notebook for running all the libraries

This notebook contains the shared code
for loading and cleaning the dataset.

Separate branches of the GitHub repository
then add the code for running each library.

E.g. MLBox branch has the code for running MLBox.

# CURRENT BRANCH: MLBox

# Install the necessary library
Run the install code in the code cell below.

In [None]:
# Insert any install comamnds in this cell
!pip install mlbox

Collecting mlbox
  Downloading https://files.pythonhosted.org/packages/7a/26/6236ca21e762067fbb7a6cd388fc9812380af88ae007ca42da9ef6384ed8/mlbox-0.8.5.tar.gz
Collecting numpy==1.18.2
[?25l  Downloading https://files.pythonhosted.org/packages/b7/ce/d0b92f0283faa4da76ea82587ff9da70104e81f59ba14f76c87e4196254e/numpy-1.18.2-cp37-cp37m-manylinux1_x86_64.whl (20.2MB)
[K     |████████████████████████████████| 20.2MB 1.4MB/s 
Collecting matplotlib==3.0.3
[?25l  Downloading https://files.pythonhosted.org/packages/83/2a/e47bbd9396af32376863a426baed62d9bf3091f81defd1fe81c5f33b11a3/matplotlib-3.0.3-cp37-cp37m-manylinux1_x86_64.whl (13.0MB)
[K     |████████████████████████████████| 13.0MB 31.6MB/s 
[?25hCollecting hyperopt==0.2.3
[?25l  Downloading https://files.pythonhosted.org/packages/7e/11/8bbbb5edb78c40a2bd0f6b730e3dc0f29ffbaea9a59520eb9622951e9151/hyperopt-0.2.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 47.6MB/s 
[?25hCollecting pandas==0.25.3
[?25l  Dow

# Preprocessing Section

In [None]:
# Import the necessary modules for cleaning
import math
import time 
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Download the NLTK resources this notebook relies on:
#   - 'punkt'   : tokenizer data (the notebook tokenizes with word_tokenize)
#   - 'wordnet' : corpus data (the notebook lemmatizes with WordNetLemmatizer)
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Model Section

# Example code:

https://www.kaggle.com/axelderomblay/running-mlbox-auto-ml-package-on-titanic

# Youtube code:
https://www.youtube.com/watch?v=omd8SazsHaI&t=106s

https://github.com/sergeiissaev/youtube/blob/master/mlbox.ipynb

In [None]:
from mlbox.preprocessing import *
from mlbox.optimisation import *
from mlbox.prediction import *

# From Docs

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# A dense numpy array of the sparse matrix outputted can be made by using:
# X_train.toarray()
def create_data(num, process, random_state=None):
  """Load IMDB reviews and return a vectorised 80/20 train/test split.

  Args:
    num: Number of reviews to keep after the rows have been shuffled.
    process: 0 selects CountVectorizer; any other value selects
      TfidfVectorizer.
    random_state: Optional seed forwarded to the row shuffle. The default
      (None) keeps the previous behaviour of a fresh shuffle on every call;
      pass an integer to get the same split again.

  Returns:
    Tuple (X_train, y_train, X_test, y_test). X_* are dense numpy arrays with
    one row per review and one column per vocabulary term; y_* hold the
    sentiment labels (0 = negative, 1 = positive).
  """
  imdb_df = pd.read_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/IMDB_Dataset.csv")

  # Randomise the row order before taking the first `num` reviews
  imdb_df = imdb_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

  imdb_df['sentiment'] = imdb_df['sentiment'].map({'negative' : 0, 'positive' : 1})

  features = imdb_df.review.values[0:num]
  labels = imdb_df.sentiment.values[0:num]

  # First 80% of the selected rows are used for training, the rest for testing
  split_at = int(0.8 * len(features))
  train_txt, test_txt = features[:split_at], features[split_at:]
  y_train, y_test = labels[:split_at], labels[split_at:]

  # Both vectorizers take the same arguments, so only the class differs
  vectorizer_cls = CountVectorizer if process == 0 else TfidfVectorizer
  vectorizer = vectorizer_cls(tokenizer=word_tokenize, token_pattern=None)

  # Fit on the training text only. Fitting on train + test (as before) lets
  # the test reviews shape the vocabulary and the IDF weights, which leaks
  # test information into the features the model is trained on.
  X_train_sparse = vectorizer.fit_transform(train_txt)
  X_test_sparse = vectorizer.transform(test_txt)

  # Converting to dense numpy arrays for a more generic format
  X_train = X_train_sparse.toarray()
  X_test = X_test_sparse.toarray()

  return X_train, y_train, X_test, y_test

In [None]:
# Dataset-size notes from earlier runs:
# Size = 10,000 failed
# Size = 3,000 failed
# Size 2,500 seems to be limit
X_train, y_train, X_test, y_test = create_data(500, 1)

# =================================================================
# Multiple column dataframe way: one column per vocabulary term.
# (The dual-column alternative is built in the next cell.)
tfidf_train = pd.DataFrame(X_train)
tfidf_test = pd.DataFrame(X_test)

# Only the training file gets the target column. The MLBox Reader log shows
# that a test file containing 'labels' is merged into the training data
# ("You have no test dataset !", 500 training samples / 0 test samples), which
# leaves the Predictor with nothing to predict on. y_test stays in memory and
# is used later to score the predictions.
tfidf_train['labels'] = y_train

print(tfidf_train.head())
print(tfidf_test.head())
print(len(tfidf_train))
print(len(tfidf_test))

tfidf_train.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_train.csv", index=False)
tfidf_test.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_test.csv", index=False)

          0    1    2    3    4  ...     13565  13566  13567  13568  labels
0  0.000000  0.0  0.0  0.0  0.0  ...  0.000000    0.0    0.0    0.0       0
1  0.000000  0.0  0.0  0.0  0.0  ...  0.000000    0.0    0.0    0.0       0
2  0.044442  0.0  0.0  0.0  0.0  ...  0.000000    0.0    0.0    0.0       1
3  0.000000  0.0  0.0  0.0  0.0  ...  0.287005    0.0    0.0    0.0       1
4  0.000000  0.0  0.0  0.0  0.0  ...  0.000000    0.0    0.0    0.0       1

[5 rows x 13570 columns]
     0    1    2    3    4    5  ...  13564  13565  13566  13567  13568  labels
0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0       0
1  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0       1
2  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0       1
3  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0       0
4  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0       1

[5 rows x 13570 columns]
400
100


In [None]:
# Dataset-size notes from earlier runs:
# Size = 10,000 failed
# Size = 3,000 failed
# Size 2,500 seems to be limit
# NOTE(review): create_data shuffles the dataset without a seed, so running
# this cell replaces the split (including y_test) produced by the previous
# cell; the CSV files written there no longer match these arrays.
X_train, y_train, X_test, y_test = create_data(500, 1)

# Dual Column dataframe way:
# each row stores the whole TF-IDF vector of one review as a list in the
# 'features' column. Both frames are built in a single constructor call;
# appending one row at a time with .loc[i] re-allocates the frame on every
# iteration.
tfidf_train = pd.DataFrame({'features': X_train.tolist(), 'labels': y_train})

# The test frame carries the feature lists only (no target column)
tfidf_test = pd.DataFrame({'features': X_test.tolist()})

print(tfidf_train.head())
print(tfidf_test.head())
print(len(tfidf_train))
print(len(tfidf_test))

# tfidf_train.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_train.csv", index=False)
# tfidf_test.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_test.csv", index=False)

                                            features labels
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...      1
1  [0.13991044080497225, 0.0, 0.0, 0.0, 0.0, 0.0,...      0
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...      1
3  [0.04381093567293872, 0.0, 0.0, 0.0, 0.0, 0.0,...      1
4  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0668575653297...      0
                                            features
0  [0.09827054713613577, 0.0, 0.0, 0.0, 0.0, 0.0,...
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0693357444139...
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0817385506729...
3  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0434669084241377, ...
4  [0.04637218434805809, 0.0, 0.0, 0.0, 0.0, 0.0,...
400
100


In [None]:
# Name of the target column in the training CSV
target_name = "labels"

# CSV files handed to the MLBox Reader: training file first, then test file
paths = [
    "/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_train.csv",
    "/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_test.csv",
]

In [None]:
# Read the CSV files listed in `paths` with MLBox's Reader and split them
# around the `target_name` column. `data` is what the Optimiser and Predictor
# cells below consume.
data = Reader(sep=",").train_test_split(paths, target_name)
# Drift thresholding is currently disabled:
# data = Drift_thresholder().fit_transform(data)


reading csv : tfidf_train.csv ...
cleaning data ...
CPU time: 35.53465175628662 seconds

reading csv : tfidf_test.csv ...
cleaning data ...
CPU time: 27.13295078277588 seconds

You have no test dataset !

> Number of common features : 13569

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 0
> Number of numerical features: 13569
> Number of training samples : 500
> Number of test samples : 0

> You have no missing values on train set...

> Task : classification
0.0    258
1.0    242
Name: labels, dtype: int64

encoding target ...



# Basic with params optimise possible pipeline
---

# BIG NOTE
Dual Column dataframe works.

But the predictor doesn't. Look into a way of fixing this; one idea is to copy the parameters found by the optimiser and recreate the pipeline myself by setting those parameters as the search space.

In [None]:
# Example search space. Key prefixes select the pipeline step being tuned,
# matching the sections of the optimiser log:
#   ne__  -> NA encoder, ce__ -> categorical encoder,
#   fs__  -> feature selector, est__ -> estimator
space = {
    # Missing-value handling for numerical features
    'ne__numerical_strategy': {"space": [0, 'mean']},
    # Encoding of categorical features
    'ce__strategy': {
        "space": ["label_encoding", "random_projection", "entity_embedding"],
    },
    # Feature selection
    'fs__strategy': {"space": ["variance", "rf_feature_importance"]},
    'fs__threshold': {"search": "choice", "space": [0.1, 0.2, 0.3]},
    # Estimator and its hyper-parameters
    'est__strategy': {"space": ["LightGBM"]},
    'est__max_depth': {"search": "choice", "space": [5, 6]},
    'est__subsample': {"search": "uniform", "space": [0.6, 0.9]},
}

# Titanic example
# space = {
    
#         'est__strategy':{"search":"choice",
#                                   "space":["LightGBM"]},    
#         'est__n_estimators':{"search":"choice",
#                                   "space":[150]},    
#         'est__colsample_bytree':{"search":"uniform",
#                                   "space":[0.8,0.95]},
#         'est__subsample':{"search":"uniform",
#                                   "space":[0.8,0.95]},
#         'est__max_depth':{"search":"choice",
#                                   "space":[5,6,7,8,9]},
#         'est__learning_rate':{"search":"choice",
#                                   "space":[0.07]} 
    
#         }

# Example found on kaggle
# space = {
    
#         'est__strategy':{"search":"choice",
#                                   "space":["LightGBM"]},    
#         'est__n_estimators':{"search":"choice",
#                                   "space":[700]},    
#         'est__colsample_bytree':{"search":"uniform",
#                                   "space":[0.77,0.82]},
#         'est__subsample':{"search":"uniform",
#                                   "space":[0.73,0.8]},
#         'est__max_depth':{"search":"choice",
#                                   "space":[5,6,7]},
#         'est__learning_rate':{"search":"uniform",
#                                   "space":[0.008, 0.02]} 
    
#         }


# Optimiser that scores each candidate configuration by accuracy over 5 folds
opt = Optimiser(scoring = "accuracy", n_folds = 5)

# Try 5 configurations drawn from `space`; the result (`best`) is handed to
# the Predictor in the next cell
best = opt.optimise(space, data, max_evals = 5)

##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}
>>> CA ENCODER :{'strategy': 'label_encoding'}
>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.3}
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 5, 'subsample': 0.7525603611553028, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'nthread': -1, 'seed': 0}
  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

  +str(self.to_path)+"/joblib'. Please clear it regularly.")
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])



MEAN SCORE : accuracy = 0.7220000000000001
VARIANCE : 0.059464274989273994 (fold 1 = 0.66, fold 2 = 0.68, fold 3 = 0.82, fold 4 = 0.69, fold 5 = 0.76)
CPU time: 26.13408851623535 seconds
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': '<NULL>'}
>>> CA ENCODER :{'strategy': 'random_projection'}
>>> FEATURE SELECTOR :{'strategy': 'variance', 'threshold': 0.2}
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 6, 'subsample': 0.6215741815476972, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample_for_bin': 200000, 'subsample_freq': 

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])



MEAN SCORE : accuracy = 0.722
VARIANCE : 0.04874423042781576 (fold 1 = 0.68, fold 2 = 0.67, fold 3 = 0.79, fold 4 = 0.7, fold 5 = 0.77)
CPU time: 25.007959127426147 seconds
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}
>>> CA ENCODER :{'strategy': 'entity_embedding'}
>>> FEATURE SELECTOR :{'strategy': 'variance', 'threshold': 0.3}
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 5, 'subsample': 0.8964094191540564, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'nthread': -1, 's

In [None]:
# Fit the pipeline with the parameters returned by the optimiser and predict.
# The predictions are read back from the "save" folder in a later cell.
# NOTE(review): the captured output shows a ValueError while rendering a very
# tall figure after this call — presumably a per-feature plot; confirm.
Predictor().fit_predict(best, data)


fitting the pipeline ...


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


CPU time: 6.131330251693726 seconds




<mlbox.prediction.predictor.Predictor at 0x7f7e29bdb410>

Error in callback <function install_repl_displayhook.<locals>.post_execute at 0x7f7e614badd0> (for post_execute):


ValueError: ignored

ValueError: ignored

<Figure size 1440x293112 with 1 Axes>

# How to get predictions:

Predictions are saved to a folder called `save`, so we need to read them back in.

In [None]:
# Load the predictions file from the "save" folder
preds = pd.read_csv("save/labels_predictions.csv")

In [None]:
# Keep the predicted labels as an array for the metrics below, then show them
y_pred = preds["labels_predicted"].values
print(y_pred)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 

In [None]:
# Score the predictions against the held-out labels: per-class
# precision/recall/F1 followed by the confusion matrix.
# NOTE(review): y_pred needs one entry per element of y_test, in the same
# order — confirm the predictions file was produced from the current split.
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))



---
---
---
---









# OLD CODE (IGNORE)



---

---

---

---

---

In [None]:
# Create the set of stopwords used when cleaning text: one word per line of
# the file, with trailing whitespace/newlines stripped. The file is opened in
# a `with` block so the handle is closed as soon as the set has been built
# (the previous bare open() never closed it).
with open('/content/drive/MyDrive/CIT/FYP/ImplementationFiles/stopwords.txt') as stopwords_file:
    stopwords = set(w.rstrip() for w in stopwords_file)

In [None]:
# This function handles cleaning text
def clean_text(text):
    """Normalise one raw review and return its list of cleaned tokens.

    Steps, in order: drop every character that is not alphanumeric, a space
    or an apostrophe; lower-case; tokenize with word_tokenize; drop tokens of
    two characters or fewer; lemmatize; drop stopwords.

    Args:
        text: The raw review as a string.

    Returns:
        List of token strings that survived all the filters.
    """
    # Create the lemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()

    # Keep alphanumerics, spaces and "'" (the apostrophe is kept for the
    # tokenizer/lemmatizer). The last test must compare the character: the
    # previous `or "'"` was a bare non-empty string, which is always truthy,
    # so no character was ever removed.
    text = "".join(c for c in text if c.isalnum() or c == " " or c == "'")

    # Get rid of capitals
    text = text.lower()

    # Create tokens of each word
    token_text = word_tokenize(text)

    # Get rid of any piece of text that isn't over 2 characters
    token_text = [t for t in token_text if len(t) > 2]

    # Put words in base form by doing lemmatization
    token_text = [wordnet_lemmatizer.lemmatize(t) for t in token_text]

    # Remove stopwords (`stopwords` is the notebook-level set)
    token_text = [t for t in token_text if t not in stopwords]

    # Return the tokens
    return token_text

In [None]:
# Term frequency of every word in one review:
# TF(term) = occurrences of the term in the document / total words in the document
def calc_tf(term_count, review_corpus):
    """Return the term frequency of each counted term in a single review.

    Args:
        term_count: Mapping of term -> number of occurrences in the review.
        review_corpus: The tokenized review; its length is the total word count.

    Returns:
        Dict with the same keys (and key order) as `term_count`, mapping each
        term to count / len(review_corpus).
    """
    # The review is a token list, so its length is the total number of words
    n_tokens = len(review_corpus)

    return {term: occurrences / n_tokens for term, occurrences in term_count.items()}

In [None]:
# This calculates the inverse document frequency:
# IDF(term) = log10(total number of documents / number of documents containing the term)
def calc_idf(unique_terms, list_doc_terms):
    """Return the inverse document frequency of every term in `unique_terms`.

    Args:
        unique_terms: Iterable of all distinct terms; its order becomes the
            key order of the returned dict.
        list_doc_terms: Iterable with one {term: count} mapping per document.
            Every term that appears in a mapping must be in `unique_terms`.

    Returns:
        Dict mapping term -> log10(n_documents / n_documents_containing_term).
        A term that occurs in no document gets 0.0 instead of raising
        ZeroDivisionError.
    """
    # Number of documents each term appears in
    doc_freq = dict.fromkeys(unique_terms, 0)

    # Count the documents while iterating so the numerator is the real corpus
    # size (it used to be a hard-coded 10, which gives wrong and, for terms in
    # more than 10 documents, negative weights)
    n_docs = 0
    for doc_terms in list_doc_terms:
        n_docs += 1
        for word, value in doc_terms.items():
            if 0 < value:
                doc_freq[word] += 1

    # Now we calculate idf
    idf = {}
    for word, n_containing in doc_freq.items():
        if n_containing:
            idf[word] = math.log10(n_docs / float(n_containing))
        else:
            # Term never seen in any document: no information, weight 0
            idf[word] = 0.0

    return idf

In [None]:
# Returns an array rather than a dictionary, as dictionaries aren't needed anymore
def calc_tf_idf(tf, idf, n_terms):
    """Return the TF-IDF vector of one document.

    Args:
        tf: Mapping of term -> term frequency for this document. Every term
            must also be a key of `idf`.
        idf: Mapping of term -> inverse document frequency for the whole
            corpus. Its key order defines the column of each term.
        n_terms: Length of the returned vector (the number of unique terms).

    Returns:
        1-D numpy array of length `n_terms`; position k holds the TF-IDF of
        the k-th key of `idf`, and 0 for terms absent from the document.
    """
    # Fixed term -> column mapping shared by every document. Indexing by the
    # position of the term inside this document's own `tf` dict (as before)
    # put different terms into the same column for different documents.
    column_of = {term: column for column, term in enumerate(idf)}

    # Create an array that is of length of the number of unique terms
    tf_idf_array = np.zeros(n_terms)

    for word, value in tf.items():
        # Add the tfidf to the term's own column
        tf_idf_array[column_of[word]] = value * idf[word]

    return tf_idf_array

In [None]:
def process_text(text_data):
    """Turn raw reviews into a dense TF-IDF matrix.

    Each review is cleaned with clean_text, its tokens are counted, term
    frequencies come from calc_tf, corpus-wide weights from calc_idf, and the
    per-review vectors from calc_tf_idf.

    Args:
        text_data: Iterable of raw review strings.

    Returns:
        numpy array of shape (number of reviews, number of unique terms).
    """
    # Per-review {token: count} dictionaries
    doc_list = []

    # Per-review {token: term frequency} dictionaries
    tf_list = []

    # Every distinct token seen so far. It is updated in place; rebuilding
    # the whole set for each review made this step scale with
    # reviews x vocabulary size.
    vocabulary = set()

    for review in text_data:
        # First clean the review
        clean_review = clean_text(review)

        # Count the occurrences of each token in this review
        count_dict = {}
        for token in clean_review:
            count_dict[token] = count_dict.get(token, 0) + 1

        # Calculate the term frequencies for this document
        tf_list.append(calc_tf(count_dict, clean_review))

        # Then add the dictionary of counts for this document to the list
        doc_list.append(count_dict)

        # The keys of count_dict are exactly the distinct tokens of the review
        vocabulary.update(count_dict)

    # Sorted so the term order handed to calc_idf does not depend on set
    # iteration order, which for strings can change between interpreter runs
    unique_terms = sorted(vocabulary)

    # Calculate the inverse document frequency value
    idf = calc_idf(unique_terms, doc_list)

    # This array will contain the tfidf values for each term in each review
    tfidf_values = np.zeros((len(tf_list), len(unique_terms)))

    # Now we can get the TFIDF for each document
    for index, term_freq in enumerate(tf_list):
        # The number of unique terms is passed in so that the returned
        # vector has the same length as a row of the tfidf matrix
        tfidf_values[index, :] = calc_tf_idf(term_freq, idf, len(unique_terms))

    return tfidf_values

In [None]:
# Build the train/test arrays from the IMDB reviews
def prepare_data(num):
    """Load the IMDB dataset, vectorise it with process_text and split it 80/20.

    Args:
        num: Number of reviews taken from each sentiment class (the first
            `num` negative and the first `num` positive rows).

    Returns:
        Tuple (X_train, y_train, X_test, y_test) taken from the shuffled
        feature matrix and label array.
    """
    separator = "="*50
    print(separator)

    # Load the dataset
    imdb_df = pd.read_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/IMDB_Dataset.csv")
    print("Dataset loaded")
    print(separator)

    # Change each positive and negative value to 1 and 0 respectively
    imdb_df['sentiment'] = imdb_df['sentiment'].map({'negative' : 0, 'positive' : 1})

    # Balanced subset: the first `num` negative rows followed by the first
    # `num` positive rows
    per_class = [imdb_df[imdb_df['sentiment'] == label][0:num] for label in (0, 1)]
    test_df = pd.concat(per_class)

    print("Creating Feature Vector")
    print(separator)
    start = time.time()
    # .values on a dataframe column returns a numpy array of the reviews,
    # which is processed into the feature vector
    feature_vector = process_text(test_df['review'].values)
    end = time.time()
    print("Feature Vector Created")
    print(len(feature_vector))
    print(f"Execution time is {end - start} secs")
    print(separator)

    # Shuffle the feature vector and its labels together using sklearn shuffle
    feature_vector, labels = shuffle(feature_vector, test_df['sentiment'].values)

    # The split is 80:20 — everything before `split_at` is used for training,
    # everything from it onwards for testing
    split_at = int(0.8 * len(feature_vector))
    X_train, X_test = feature_vector[:split_at], feature_vector[split_at:]
    y_train, y_test = labels[:split_at], labels[split_at:]

    return X_train, y_train, X_test, y_test

# Examples

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# A dense numpy array of the sparse matrix outputted can be made by using:
# X_train.toarray()
def create_data(num, process, random_state=None):
  """Load IMDB reviews and return a vectorised 80/20 train/test split.

  Args:
    num: Number of reviews to keep after the rows have been shuffled.
    process: 0 selects CountVectorizer; any other value selects
      TfidfVectorizer.
    random_state: Optional seed forwarded to the row shuffle. The default
      (None) keeps the previous behaviour of a fresh shuffle on every call;
      pass an integer to get the same split again.

  Returns:
    Tuple (X_train, y_train, X_test, y_test). X_* are dense numpy arrays with
    one row per review and one column per vocabulary term; y_* hold the
    sentiment labels (0 = negative, 1 = positive).
  """
  imdb_df = pd.read_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/IMDB_Dataset.csv")

  # Randomise the row order before taking the first `num` reviews
  imdb_df = imdb_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

  imdb_df['sentiment'] = imdb_df['sentiment'].map({'negative' : 0, 'positive' : 1})

  features = imdb_df.review.values[0:num]
  labels = imdb_df.sentiment.values[0:num]

  # First 80% of the selected rows are used for training, the rest for testing
  split_at = int(0.8 * len(features))
  train_txt, test_txt = features[:split_at], features[split_at:]
  y_train, y_test = labels[:split_at], labels[split_at:]

  # Both vectorizers take the same arguments, so only the class differs
  vectorizer_cls = CountVectorizer if process == 0 else TfidfVectorizer
  vectorizer = vectorizer_cls(tokenizer=word_tokenize, token_pattern=None)

  # Fit on the training text only. Fitting on train + test (as before) lets
  # the test reviews shape the vocabulary and the IDF weights, which leaks
  # test information into the features the model is trained on.
  X_train_sparse = vectorizer.fit_transform(train_txt)
  X_test_sparse = vectorizer.transform(test_txt)

  # Converting to dense numpy arrays for a more generic format
  X_train = X_train_sparse.toarray()
  X_test = X_test_sparse.toarray()

  return X_train, y_train, X_test, y_test

In [None]:
# Dataset-size notes from earlier runs:
# Size = 10,000 failed
# Size = 3,000 failed
# Size 2,500 seems to be limit
X_train, y_train, X_test, y_test = create_data(2500, 1)

# Dual-column layout: each row stores the whole feature vector of one review
# as a list in the 'features' column. Both frames are built in a single
# constructor call; appending one row at a time with .loc[i] re-allocates the
# frame on every iteration.
tfidf_train = pd.DataFrame({'features': X_train.tolist(), 'labels': y_train})

# The test frame carries the feature lists only (no target column)
tfidf_test = pd.DataFrame({'features': X_test.tolist()})

print(tfidf_train.head())
print(tfidf_test.head())
print(len(tfidf_train))
print(len(tfidf_test))

tfidf_train.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_train.csv", index=False)
tfidf_test.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_test.csv", index=False)

# Old Text

In [None]:
# Features from the hand-written TF-IDF pipeline (500 reviews per class)
X_train, y_train, X_test, y_test = prepare_data(500)

# Dual-column layout: each row stores the whole TF-IDF vector of one review
# as a list in the 'features' column. Both frames are built in a single
# constructor call; appending one row at a time with .loc[i] re-allocates the
# frame on every iteration.
tfidf_train = pd.DataFrame({'features': X_train.tolist(), 'labels': y_train})

# The test frame carries the feature lists only (no target column)
tfidf_test = pd.DataFrame({'features': X_test.tolist()})

print(tfidf_train.head())
print(tfidf_test.head())
print(len(tfidf_train))
print(len(tfidf_test))

tfidf_train.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_train.csv", index=False)
tfidf_test.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_test.csv", index=False)

In [None]:
# Name of the target column in the training CSV
target_name = "labels"

# CSV files handed to the MLBox Reader: training file first, then test file
paths = [
    "/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_train.csv",
    "/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_test.csv",
]

# Create model based on Titanic example

In [None]:
# Read and clean files.
# `rd` parses comma-separated files; `df` is the train/test structure built
# from `paths` around the `target_name` column and consumed by the optimiser
# and predictor cells below.
rd = Reader(sep = ",")
df = rd.train_test_split(paths, target_name)

In [None]:
# This can remove non stable features but probably shouldn't run at all for now
# dft = Drift_thresholder()
# df = dft.fit_transform(df)

In [None]:
# Optimiser that scores each candidate configuration by accuracy over 5 folds
opt = Optimiser(scoring = "accuracy", n_folds = 5)

In [None]:
# LightGBM-only search space; every key tunes the estimator step (est__)
space = {
    'est__strategy': {"search": "choice", "space": ["LightGBM"]},
    'est__n_estimators': {"search": "choice", "space": [150]},
    'est__colsample_bytree': {"search": "uniform", "space": [0.8, 0.95]},
    'est__subsample': {"search": "uniform", "space": [0.8, 0.95]},
    'est__max_depth': {"search": "choice", "space": [5, 6, 7, 8, 9]},
    'est__learning_rate': {"search": "choice", "space": [0.07]},
}

# Run the search (15 evaluations) and report how long it took
print("Creating Fit the model")
print("="*50)
start = time.time()
params = opt.optimise(space, df, max_evals=15)
end = time.time()
print("="*50)
print("Model Fitting Finished")
print(f"Execution time is {end - start} secs")
print("="*50)

In [None]:
# Fit the pipeline with the parameters found by the optimiser (`params`) and
# predict; later cells read the predictions from save/labels_predictions.csv
prd = Predictor()
prd.fit_predict(params, df)

In [None]:
from sklearn.metrics import accuracy_score

# Load the predicted labels here instead of relying on `predict_output_vals`,
# which is only created by a cell further down the notebook (a top-to-bottom
# run would stop with a NameError at this point).
y_pred = pd.read_csv("save/labels_predictions.csv")["labels_predicted"].values

# Fraction of test reviews whose predicted label matches the true label
testing_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy score {0}".format(testing_accuracy))

In [None]:
from sklearn import metrics

# Parameters are (y_true, y_pred). C[i, j] counts the samples whose true label
# is i and whose predicted label is j; labels=[0, 1] pins the 2x2 layout even
# when one class is missing from the data.
C = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])

# With 1 = positive and 0 = negative the flattened matrix reads
# [TN, FP, FN, TP]. (The cells were previously swapped: C[0,0] was reported as
# the true positives, C[1,0] as the false positives, and so on.)
true_negative, false_postiive, false_negatives, true_positive = C.ravel()

print(C)

print(("="*50))

# Share of the test set in each cell, as a percentage (the raw fraction used
# to be printed next to a "%" sign)
n_test_samples = len(y_test)
print("True positives:", round(100 * true_positive / n_test_samples, 5), "%")
print("True negatives:", round(100 * true_negative / n_test_samples, 5), "%")
print("False positives:", round(100 * false_postiive / n_test_samples, 5), "%")
print("False negatives:", round(100 * false_negatives / n_test_samples, 5), "%")

In [None]:
# C[i, j] = number of samples with true label i and predicted label j, so with
# 1 = positive and 0 = negative:
true_negative = C[0,0]
false_postiive = C[0,1]
false_negatives = C[1,0]
true_positive = C[1,1]

# precision = TP / (TP + FP): how many predicted positives are really positive
precision = true_positive/(true_positive+false_postiive)
# recall = TP / (TP + FN): how many real positives were found
# (the denominator used to be TP / FN, which is not a recall)
recall = true_positive/(true_positive+false_negatives)

In [None]:
# F1 is the harmonic mean of precision and recall: 2 * P * R / (P + R).
# The factor 2 was missing, which halved the reported score.
f1_score = 2*(precision*recall)/(precision+recall)

print(f1_score)

In [None]:
# ROC curve points (false/true positive rates per threshold) for the test set
# NOTE(review): y_pred appears to hold hard 0/1 labels rather than scores, in
# which case the curve has a single operating point — confirm.
fpr, tpr, thresh = metrics.roc_curve(y_test, y_pred)

# Area under that curve
auc = metrics.auc(fpr, tpr)

print("AUC:", auc)

In [None]:
import matplotlib.pyplot as plt

# Draw the ROC curve against the diagonal "random guess" baseline.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %.2f)' %auc)
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random guess')
ax.set_title('ROC curve MLBox')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.grid()
ax.legend()
plt.show()

# ===================================================

# Old

In [None]:
# Read the saved prediction file and keep the predicted-label column as an array.
predict_output = pd.read_csv("save/labels_predictions.csv")
print(predict_output.head())

predict_output_vals = predict_output["labels_predicted"].values
print(predict_output_vals)

# ==============================================================

# Text data values

In [None]:
# Build the train/test splits and print the size of each piece, in the order
# X_train, y_train, X_test, y_test.
X_train, y_train, X_test, y_test = prepare_data(1000)
for split in (X_train, y_train, X_test, y_test):
    print(len(split))

# MLBox Example

In [None]:
# Wide layout: one column per feature value. Only the training frame gets a
# trailing 'labels' column; the test frame is written without labels.
tfidf_train = pd.DataFrame(X_train).assign(labels=y_train)
tfidf_test = pd.DataFrame(X_test)

for frame in (tfidf_train, tfidf_test):
    print(frame.head())

# Persist both frames to Drive without the index column.
tfidf_train.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_train.csv", index=False)
tfidf_test.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_test.csv", index=False)

In [None]:
# NOTE(review): this cell repeats the preceding cell statement for statement;
# running it rewrites the same two CSV files. Consider deleting one copy.

# Training frame: one column per feature value, then the 'labels' column.
tfidf_train = pd.DataFrame(X_train).assign(labels=y_train)
# Test frame: feature columns only (no labels attached).
tfidf_test = pd.DataFrame(X_test)

print(tfidf_train.head())
print(tfidf_test.head())

# Same output paths as the preceding cell; index column is omitted.
tfidf_train.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_train.csv", index=False)
tfidf_test.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_test.csv", index=False)

In [None]:
# Read the comma-separated files listed in `paths` and split them around the
# `target_name` column.
reader = Reader(sep=",")
data = reader.train_test_split(paths, target_name)
# Drift thresholding is left disabled:
# data = Drift_thresholder().fit_transform(data)

In [None]:
# Keep the Optimiser instance itself in `opt`: the next cell calls
# opt.optimise(...). Optimiser.evaluate returns the score, so chaining it onto
# the constructor bound `opt` to a number instead of the optimiser.
opt = Optimiser()
# Passing None as params evaluates the default pipeline configuration.
baseline_score = opt.evaluate(None, data)

In [None]:
# Search space for the example run. Each key names a pipeline step and
# parameter; "space" lists the candidate values (or the bounds when "search"
# is "uniform").
space = {

        'ne__numerical_strategy' : {"space" : [0, 'mean']},

        'ce__strategy' : {"space" : ["label_encoding", "random_projection", "entity_embedding"]},

        'fs__strategy' : {"space" : ["variance", "rf_feature_importance"]},
        'fs__threshold': {"search" : "choice", "space" : [0.1, 0.2, 0.3]},

        'est__strategy' : {"space" : ["LightGBM"]},
        'est__max_depth' : {"search" : "choice", "space" : [5,6]},
        'est__subsample' : {"search" : "uniform", "space" : [0.6,0.9]}

        }

# NOTE(review): this call searches on `df`, while the rest of this section
# works on `data`; it also overwrites `params`. It looks like a pasted
# leftover — confirm whether it is still needed.
params = opt.optimise(space, df, 15)

# Store the result as `best`, the name the following cell passes to
# Predictor().fit_predict. (It was previously assigned to the misspelled
# `bestsd`, leaving `best` undefined.)
best = opt.optimise(space, data, max_evals = 5)

In [None]:
# Fit a fresh Predictor with the `best` hyper-parameters and predict on `data`.
Predictor().fit_predict(best, data)

# Output Pandas DataFrame

In [None]:
# Importing data using pandas
import pandas as pd

# NOTE(review): another cell in this notebook unpacks four values from
# prepare_data(1000); confirm prepare_data still returns
# (feature_vector, labels) when this cell is run.
feature_vector, labels = prepare_data(500)

# Two-column layout: each row holds the full feature list and its label.
# The frame is built in one construction instead of appending rows with
# .loc[i], which re-allocates the frame on every insert.
n_rows = len(feature_vector)
tfidf_tf_2col = pd.DataFrame(
    {
        'features': [feature_vector[i].tolist() for i in range(n_rows)],
        'labels': [labels[i] for i in range(n_rows)],
    },
    columns=['features', 'labels'],
)

# Wide layout: one column per feature value, plus a trailing 'labels' column.
tfidf_tf_mult = pd.DataFrame(feature_vector)
tfidf_tf_mult['labels'] = labels

#==========================================

In [None]:
# Save the two-column frame. The frame built in the previous cell is named
# tfidf_tf_2col; `tfidf_tf` is never defined, so the old reference raised
# a NameError.
print(tfidf_tf_2col.head())
tfidf_tf_2col.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_2col.csv", index=False)

# Save the wide (one column per feature) frame.
print(tfidf_tf_mult.head())
tfidf_tf_mult.to_csv("/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_tf_mult.csv", index=False)

In [None]:
# Reload the two-column CSV from Drive and preview its first rows.
two_col_csv_path = "/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_2col.csv"
load_ouput_csv_2col = pd.read_csv(two_col_csv_path)
print(load_ouput_csv_2col.head())

In [None]:
# Reload the wide (one column per feature) CSV from Drive and preview it.
wide_csv_path = "/content/drive/MyDrive/CIT/FYP/ImplementationFiles/tfidf_tf_mult.csv"
load_ouput_csv_mult = pd.read_csv(wide_csv_path)
print(load_ouput_csv_mult.head())

In [None]:
# Prints a marker string and does nothing else.
# NOTE(review): presumably run by hand to keep the Colab VM from idling — confirm.
print("Keep VM")