In [1]:
# ---------------------------------------------------------------------------- #
# --------------------------------- IMPORTS ---------------------------------- #
# ---------------------------------------------------------------------------- #
# Standard Library Imports
import io
import json
import math
import os
import pickle
import re
import time as time
import xml.etree.ElementTree as et

from collections import Counter
from datetime import datetime
from pathlib import Path

# Dependency Imports
import nltk
import numpy as np
import pandas as pd
import scipy.io as sio
import xgboost as xgb

from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import roc_auc_score, f1_score, make_scorer, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier







In [None]:
# ---------------------------------------------------------------------------- #
# ------------------------------ CONFIGURATIONS ------------------------------ #
# ---------------------------------------------------------------------------- #
ROOT = "C:/Users/TheLunes/Documents/masters_thesis/TaCLE/"

config = {
    "package-info" : {
        "Project" : "Tool-assisted Classification using Lexical Elements",
        "Author" : "Braeden Lewis",
        "Language" : "Python v3.10.1",

        "dependencies" : [
            "nltk",
            "numpy",
            "pandas",
            "scipy",
            "scikit-learn",
            "xgboost"
        ]
    },
    "directories" : {
        "ROOT" : ROOT,
        "V0_DATA_IMPORT_DIR" : Path(ROOT + "/data/input/"),
        "EXTR_PICKLE_OUTPUT_DIR" : Path(ROOT + "/data/output/pickle-files/extraction/"),
        "NLP_PICKLE_OUTPUT_DIR" : Path(ROOT + "/data/output/pickle-files/nlp/"),
        "MLMODEL_PICKLE_OUTPUT_DIR" : Path(ROOT + "/data/output/pickle-files/mach-learning/"),
        "DESC_STATS_OUTPUT_DIR" : Path(ROOT + "/data/output/csv-files/"),
        "RMD_OUTPUT_DIR" : Path(ROOT + "/data/output/rmd-files/"),
        "TIDY_OUTPUT_DIR" : Path(ROOT + "/data/output/tidy-files/")
    },
    "model-hyperparameters" : {
        "DECISION_TREE_PARAMETERS": {
            "criterion" : "gini",
            "splitter" : "best",
            "max_depth" : 10,
            "max_features" : "sqrt",
            "min_samples_split" : 3,
            "min_samples_leaf" : 4,
            "class_weight" : "balanced"   
        },
        "LOG_REG_PARAMETERS" : {
            "penalty" : "l2",
            "C" : 11.288378916846883,
            "solver" : "liblinear",
            "max_iter": 500
        },
        "SVM_PARAMETERS" : {
            "kernel" : "linear",
            "C" : 0.0018329807108324356,
            "degree" : 0,
            "gamma" : "scale",
            "cache_size" : 100,
            "class_weight" : "balanced"
        },
        "KNN_PARAMETERS" : {
            "n_neighbors": 20,
            "weights" : "distance",
            "algorithm" : "auto",
            "leaf_size" : 10,
            "p" : 2,
            "n_jobs" : -1
        },
         "XGB_PARAMETERS" : {
            "learning_rate" : 0.1,
            "max_depth" : 2,
            "min_child_weight" : 2,
            "n_estimators" : 200,
            "nthread" : 1,
            "objective" : "binary:logistic", #"multi:softmax", # use "binary:logistic" for binary tasks, "multi:softmax" for multiclass
            "gamma" : 0.2,
            "subsample" : 0.3,
            "colsample_bytree" : 0.2,
            "eval_metric" : "mlogloss",
            "use_label_encoder" : False
        }   
    },
    "parameter-tuning" : {
        "DECISION_TREE_TUNING" : {
            "criterion" : ["gini", "entropy"],
            "splitter" : ["best", "random"],
            "max_depth" : [5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70],
            "min_samples_split" : [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16],
            "min_samples_leaf" : [2, 3, 4, 5, 6, 7, 8, 9, 10],
            "max_features" : ["auto", "sqrt"],
            "class_weight" : ["balanced"]
        },
        "LOG_REG_TUNING" : [
            {"penalty": ["l1", "l2"],
             "C" : np.logspace(-4, 4, 20),
             "solver": ["liblinear", "saga"],
             "max_iter" : [500, 1000, 2000, 5000]},
            {"penalty" : ["l2"],
            "C" : np.logspace(-4, 4, 20),
            "solver" : ["lbfgs", "newton-cg", "sag"],
            "max_iter" : [500, 1000, 2000, 5000]}
        ],
        "SVM_TUNING" : {
            "kernel" : ["linear", "poly", "rbf", "sigmoid"],
            "C" : np.logspace(-4, 4, 20),
            "degree" : [0, 1, 2, 3, 4, 5],
            "gamma" : ["scale", "auto"],
            "cache_size" : [100, 200, 300, 400, 500],
            "class_weight": ["balanced"]    
        },
        "KNN_TUNING" : {
            "n_neighbors" : [5, 10, 15, 20, 25, 30, 35, 40],
            "weights" : ["uniform", "distance"],
            "algorithm" : ["auto", "ball_tree", "kd_tree", "brute"],
            "leaf_size" : [10, 20, 30, 40, 50, 60],
            "p" : [1, 2],
            "n_jobs" : [-1]
        },
        "XGB_TUNING" : {
            "max_depth" : [2, 3, 4, 5],
            "min_child_weight" : [2, 3, 4, 5],
            "n_estimators" : [50, 100, 150, 200],
            "nthread" : [1, 2, 3],
            "objective" : ["binary:logistic"], # binary:logistic for binary tasks, multi:softmax for multiclass
            "gamma" : [0.1, 0.2, 0.3],
            "subsample" : [0.1, 0.2, 0.3],
            "colsample_bytree" : [0.1, 0.2, 0.3],
            "eval_metric" : ["mlogloss"],
            "use_label_encoder" : [False] 
        }
    },
    "RUN_DATETIME" : datetime.now().strftime("%Y%m%d-%H%M%S"),
    "NGRAM" : (1, 3),
    "DETECTABLE_CLASSES": ["BREAST", "BOTTLE", "EXPRESS/PUMP", "NA"],
    "CONCAT_CLASS" : {"FEEDING": ["BREAST", "BOTTLE", "EXPRESS/PUMP"]},
    "MATRIX_TYPE" : "tf-idf", # Can be "tf-idf" or "count"
    "REFINEMENT" : "none", # Can be "shared", "unique", or "none" (default)
    "MIN_DOC_FREQ": 30,
    "TEST_SIZE" : 0.20,
    "VALIDATION_SIZE" : 0.25,
    "CROSS_VALIDATIONS": 5,
    "RANDOM_STATE" : 22
    
}

In [None]:

model_hyperparameters = {
    
    "trial-1-uni": {
        "DECISION_TREE_PARAMETERS": {
            "criterion" : "entropy",
            "splitter" : "best",
            "max_depth" : 50,
            "max_features" : "auto",
            "min_samples_split" : 14,
            "min_samples_leaf" : 8,
            "class_weight" : "balanced"   
        },
        "LOG_REG_PARAMETERS" : {
            "penalty" : "l2",
            "C" : 11.288378916846883,
            "solver" : "liblinear",
            "max_iter": 500
        },
        "SVM_PARAMETERS" : {
            "kernel" : "poly",
            "C" : 4.281332398719396,
            "degree" : 1,
            "gamma" : "scale",
            "cache_size" : 100,
            "class_weight" : "balanced"
        },
        "KNN_PARAMETERS" : {
            "n_neighbors": 10,
            "weights" : "distance",
            "algorithm" : "ball_tree",
            "leaf_size" : 10,
            "p" : 2,
            "n_jobs" : -1
        },
         "XGB_PARAMETERS" : {
            "learning_rate" : 0.1,
            "max_depth" : 3,
            "min_child_weight" : 2,
            "n_estimators" : 100,
            "nthread" : 1,
            "objective" : "binary:logistic", #"multi:softmax", # use "binary:logistic" for binary tasks, "multi:softmax" for multiclass
            "gamma" : 0.2,
            "subsample" : 0.3,
            "colsample_bytree" : 0.3,
            "eval_metric" : "mlogloss",
            "use_label_encoder" : False
        }
    },
    
    "trial-1-hybrid": {
        "DECISION_TREE_PARAMETERS": {
            "criterion" : "entropy",
            "splitter" : "random",
            "max_depth" : 50,
            "max_features" : "auto",
            "min_samples_split" : 12,
            "min_samples_leaf" : 3,
            "class_weight" : "balanced"   
        },
        "LOG_REG_PARAMETERS" : {
            "penalty" : "l2",
            "C" : 11.288378916846883,
            "solver" : "liblinear",
            "max_iter": 500
        },
        "SVM_PARAMETERS" : {
            "kernel" : "sigmoid",
            "C" : 1.623776739188721,
            "degree" : 0,
            "gamma" : "scale",
            "cache_size" : 100,
            "class_weight" : "balanced"
        },
        "KNN_PARAMETERS" : {
            "n_neighbors": 10,
            "weights" : "distance",
            "algorithm" : "auto",
            "leaf_size" : 10,
            "p" : 2,
            "n_jobs" : -1
        },
         "XGB_PARAMETERS" : {
            "learning_rate" : 0.1,
            "max_depth" : 4,
            "min_child_weight" : 2,
            "n_estimators" : 100,
            "nthread" : 1,
            "objective" : "binary:logistic", #"multi:softmax", # use "binary:logistic" for binary tasks, "multi:softmax" for multiclass
            "gamma" : 0.3,
            "subsample" : 0.3,
            "colsample_bytree" : 0.2,
            "eval_metric" : "mlogloss",
            "use_label_encoder" : False
        } 
    },
    
    "trial-2-uni": {
        "DECISION_TREE_PARAMETERS": {
            "criterion" : "entropy",
            "splitter" : "best",
            "max_depth" : 5,
            "max_features" : "auto",
            "min_samples_split" : 7,
            "min_samples_leaf" : 2,
            "class_weight" : "balanced"   
        },
        "LOG_REG_PARAMETERS" : {
            "penalty" : "l1",
            "C" : 4.281332398719396,
            "solver" : "liblinear",
            "max_iter": 500
        },
        "SVM_PARAMETERS" : {
            "kernel" : "poly",
            "C" : 4.281332398719396,
            "degree" : 1,
            "gamma" : "scale",
            "cache_size" : 100,
            "class_weight" : "balanced"
        },
        "KNN_PARAMETERS" : {
            "n_neighbors": 20,
            "weights" : "distance",
            "algorithm" : "auto",
            "leaf_size" : 10,
            "p" : 2,
            "n_jobs" : -1
        },
         "XGB_PARAMETERS" : {
            "learning_rate" : 0.1,
            "max_depth" : 3,
            "min_child_weight" : 2,
            "n_estimators" : 150,
            "nthread" : 1,
            "objective" : "multi:softmax", #"multi:softmax", # use "binary:logistic" for binary tasks, "multi:softmax" for multiclass
            "gamma" : 0.1,
            "subsample" : 0.3,
            "colsample_bytree" : 0.2,
            "eval_metric" : "mlogloss",
            "use_label_encoder" : False
        } 
    },
    
    "trial-2-hybrid": {
        "DECISION_TREE_PARAMETERS": {
            "criterion" : "entropy",
            "splitter" : "best",
            "max_depth" : 5,
            "max_features" : "auto",
            "min_samples_split" : 2,
            "min_samples_leaf" : 8,
            "class_weight" : "balanced"   
        },
        "LOG_REG_PARAMETERS" : {
            "penalty" : "l2",
            "C" : 4.281332398719396,
            "solver" : "liblinear",
            "max_iter": 500
        },
        "SVM_PARAMETERS" : {
            "kernel" : "sigmoid",
            "C" : 1.623776739188721,
            "degree" : 0,
            "gamma" : "scale",
            "cache_size" : 100,
            "class_weight" : "balanced"
        },
        "KNN_PARAMETERS" : {
            "n_neighbors": 15,
            "weights" : "distance",
            "algorithm" : "ball_tree",
            "leaf_size" : 10,
            "p" : 2,
            "n_jobs" : -1
        },
         "XGB_PARAMETERS" : {
            "learning_rate" : 0.1,
            "max_depth" : 2,
            "min_child_weight" : 2,
            "n_estimators" : 200,
            "nthread" : 1,
            "objective" : "multi:softmax", #"multi:softmax", # use "binary:logistic" for binary tasks, "multi:softmax" for multiclass
            "gamma" : 0.2,
            "subsample" : 0.3,
            "colsample_bytree" : 0.2,
            "eval_metric" : "mlogloss",
            "use_label_encoder" : False
        }
    },
    
    "trial-3-uni": {
        "DECISION_TREE_PARAMETERS": {
            "criterion" : "gini",
            "splitter" : "best",
            "max_depth" : 10,
            "max_features" : "auto",
            "min_samples_split" : 9,
            "min_samples_leaf" : 4,
            "class_weight" : "balanced"   
        },
        "LOG_REG_PARAMETERS" : {
            "penalty" : "l2",
            "C" : 4.281332398719396,
            "solver" : "liblinear",
            "max_iter": 500
        },
        "SVM_PARAMETERS" : {
            "kernel" : "rbf",
            "C" : 4.281332398719396,
            "degree" : 0,
            "gamma" : "scale",
            "cache_size" : 100,
            "class_weight" : "balanced"
        },
        "KNN_PARAMETERS" : {
            "n_neighbors": 15,
            "weights" : "distance",
            "algorithm" : "ball_tree",
            "leaf_size" : 10,
            "p" : 2,
            "n_jobs" : -1
        },
         "XGB_PARAMETERS" : {
            "learning_rate" : 0.1,
            "max_depth" : 5,
            "min_child_weight" : 2,
            "n_estimators" : 150,
            "nthread" : 1,
            "objective" : "multi:softmax", #"multi:softmax", # use "binary:logistic" for binary tasks, "multi:softmax" for multiclass
            "gamma" : 0.2,
            "subsample" : 0.3,
            "colsample_bytree" : 0.3,
            "eval_metric" : "mlogloss",
            "use_label_encoder" : False
        }
    },
    
    "trial-3-hybrid": {
        "DECISION_TREE_PARAMETERS": {
            "criterion" : "gini",
            "splitter" : "best",
            "max_depth" : 20,
            "max_features" : "sqrt",
            "min_samples_split" : 9,
            "min_samples_leaf" : 2,
            "class_weight" : "balanced"   
        },
        "LOG_REG_PARAMETERS" : {
            "penalty" : "l2",
            "C" : 11.288378916846883,
            "solver" : "liblinear",
            "max_iter": 500
        },
        "SVM_PARAMETERS" : {
            "kernel" : "rbf",
            "C" : 1438.44988828766,
            "degree" : 0,
            "gamma" : "auto",
            "cache_size" : 100,
            "class_weight" : "balanced"
        },
        "KNN_PARAMETERS" : {
            "n_neighbors": 15,
            "weights" : "distance",
            "algorithm" : "auto",
            "leaf_size" : 10,
            "p" : 2,
            "n_jobs" : -1
        },
         "XGB_PARAMETERS" : {
            "learning_rate" : 0.1,
            "max_depth" : 5,
            "min_child_weight" : 2,
            "n_estimators" : 150,
            "nthread" : 1,
            "objective" : "multi:softmax", #"multi:softmax", # use "binary:logistic" for binary tasks, "multi:softmax" for multiclass
            "gamma" : 0.2,
            "subsample" : 0.3,
            "colsample_bytree" : 0.3,
            "eval_metric" : "mlogloss",
            "use_label_encoder" : False
        }
    }      
}

In [None]:
# ---------------------------------------------------------------------------- #
# -------------------------- GENERAL USE FUNCTIONS --------------------------- #
# ---------------------------------------------------------------------------- #

def timer(method):
    def wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        x = method(*args, **kwargs)
        end_time = time.perf_counter()
        time_elapsed = end_time - start_time
        print("Run time (seconds): ", time_elapsed)
        return x
    return wrapper


In [None]:
# ---------------------------------------------------------------------------- #
# -------------------------- EXTRACTION FUNCTIONS ---------------------------- #
# ---------------------------------------------------------------------------- #

def xml_extraction(data_directory:str, concat_class: dict):
    """
    Loops through all annotated xml documents to collect values for baby
    and note numbers, annotated classification, and text content. Creates two
    dictionaries, both using the same keys.

    Parameters:
    data_directory (str): A string denoting the path where xml files are
    located.

    concat_class (dict): Used to concatenate the data of similar classifiers
    into a single classifier.

    Returns:
    class_dict (dict): A dictionary with keys that are tuples containing
    patient ID and note number, and values that are the manually annotated
    classification of the respective clinical note in string format.

    text_dict (dict): A dictionary with keys that are tuples containing
    patient ID and note number, and values that are the contents of the
    clinical notes themselves in string format.
    """

    class_dict = {}
    text_dict = {}

    for root, dirs, files in os.walk(data_directory):
        for file in sorted(files):
            if file.endswith('.xml'):
                etree = et.parse(os.path.join(root, file))
                baby_note = etree.findall('document//passage')[-1].find("text").text
                baby_note = tuple(re.findall('[\d]+', baby_note))
                text_content = etree.findall('document//passage')[0].find("text").text

                if baby_note not in class_dict.keys():
                    class_dict[baby_note] = set()
                if baby_note not in text_dict.keys():
                    text_dict[baby_note] = text_content

                annotations = etree.findall('document//annotation')

                for annotation in annotations:
                    annotation_type = annotation.find("infon[@key='type']").text
                    annotation_text = annotation.find("text").text
                    if annotation_type == 'FEED_CLASS':
                        class_dict[baby_note].add(annotation_text)

    class_dict = {key:element for key, value in class_dict.items() if len(value) == 1 for element in value}
    
    # Probably should return here and create new function with lower code
    # can be put in "if concat_class:" block
    
    for k, v in concat_class.items():
        for elem in v:
            class_dict = {key:( k if value == elem else value) for key, value in class_dict.items()}
    
    class_dict = {key: value for key, value in class_dict.items() if value in config["DETECTABLE_CLASSES"] or value in config["CONCAT_CLASS"]}

    return class_dict, text_dict


def structure_dataframe(class_dict:dict, text_dict:dict):
    """
    Takes two dictionaries that share keys, but with different values, and
    creates a pandas dataframe. Keys are set as the dataframe's index.

    Parameters:
    class_dict (dict): dictionary that containes key:value with a tuple of
    patient ID and note number as the key and the manually annotated
    classification, as a string, for the respective note as the value.

    text_dict (dict): dictionary that containes key:value with a tuple of
    patient ID and note number as the key and the contents of the note in string
    format as the value.

    Returns:
    dataframe (pandas.DataFrame): A dataframe that combines the two dictionaries.
    """
    
    data = [class_dict, text_dict]
    data = {key:[d[key] for d in data] for key in data[0]}
    dataframe = pd.DataFrame.from_dict(data,
                                orient='index',
                                columns=['--classification--', 'content'])
    dataframe.index.name='id_note'
    return dataframe

In [None]:
# ---------------------------------------------------------------------------- #
# ----------------------------- NLP FUNCTIONS -------------------------------- #
# ---------------------------------------------------------------------------- #

def text_preprocessing(corpus):
    """
    Alters a list of document-length strings, modifying each string to remove
    redacted tokens, irregular whitespace, punctuation, English stop words, and
    tokens that are composed solely of numbers. All tokens are adjusted to be
    lowercased.

    Parameters:
    corpus (list): A list of document-length strings

    Returns:
    corpus (list): A list of nested lists containing the strings of individual
    tokens.
    """
    
    stop_words = set(stopwords.words('english'))

    for i, note in enumerate(corpus):
        note = [word for word in note.split(' ') if not re.match("(\[[\*]+[\w]+[\*]+\])", word)]
        note = ' '.join(note)
        note = re.sub(r'[^\w\s]', '', note)
        note = [word for word in note.split(' ') if not re.match("([\*]+)", word)]
        note = [word for word in note if word != '']
        note = [word.lower() for word in note]
        note = [word for word in note if word not in stop_words]
        corpus[i] = [word for word in note if not re.match("([0-9]+)", word)]

    return corpus


def split_dataframe(dataframe, detectable_classes: list):
    """
    Divides a single pandas dataframe into a list of dataframes, segregated by
    manually annotated response variables in string format.

    Parameters:
    dataframe (pandas.DataFrame): A dataframe containing a column of
    string-represented classifiers for the data.

    concat_class (bool): Used to concatenate the data of two similar classifiers
    into a single classifier.

    Returns:
    dataframe_list (list): A list containing dataframes separated by
    classification
    """
    
    dataframe_list = []
    for elem in dataframe["--classification--"].unique():
        df = pd.DataFrame([row for row in dataframe.itertuples() if row[1] == elem], 
                          columns=["Index", "--classification--", "content"])
        dataframe_list.append(df) 
        
    return dataframe_list


In [None]:
# ---------------------------------------------------------------------------- #
# ----------------------- FEATURE CREATION FUNCTIONS ------------------------- #
# ---------------------------------------------------------------------------- #

def term_matrix(dataframe, corpus, ngram_range, matrix_type):
    """
    """
    for i, note in enumerate(corpus):
        corpus[i] = ' '.join(note)

    dataframe.drop(dataframe.columns[1], axis=1, inplace=True)
    dataframe['content'] = corpus
    
    if matrix_type == "tf-idf":
        vectorizer = TfidfVectorizer(analyzer='word', ngram_range=ngram_range, min_df=config["MIN_DOC_FREQ"])
    elif matrix_type == "count":
        vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, min_df=config["MIN_DOC_FREQ"])
    else:
        pass
        # need to put a proper RaiseError here
        # Something like 'config["MATRIX_TYPE"] is not a valid argument' then end run

    X = vectorizer.fit_transform(dataframe['content'])
    
    bow_dataframe = pd.DataFrame(X.toarray(), 
                                 columns=vectorizer.get_feature_names_out(),
                                 index=dataframe.index)
    
    bow_dataframe['--classification--'] = dataframe['--classification--']
    
    return bow_dataframe
                                     

def shared_term_matrix(dataframe, corpus, ngram_range, matrix_type):
    """
    """
                                     
    for i, note in enumerate(corpus):
        corpus[i] = ' '.join(note)

    dataframe.drop(dataframe.columns[1], axis=1, inplace=True)
    dataframe['content'] = corpus
    
    separated_dataframe = split_dataframe(dataframe, 
                                          detectable_classes=config["DETECTABLE_CLASSES"])

    if matrix_type == "tf-idf":
        vectorizer = TfidfVectorizer(analyzer='word', ngram_range=ngram_range, min_df=config["MIN_DOC_FREQ"])
    elif matrix_type == "count":
        vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, min_df=config["MIN_DOC_FREQ"])
    else:
        pass
        # need to put a proper RaiseError here
        # Something like 'config["MATRIX_TYPE"] is not a valid argument' then end run
    
    bow_dataframe_list = []
    for df in separated_dataframe:
        X = vectorizer.fit_transform(df['content'])
        bow_dataframe = pd.DataFrame(X.toarray(), 
                                     columns=vectorizer.get_feature_names_out(),
                                     index=df.index)
        
        bow_dataframe['--classification--'] = df['--classification--']
        bow_dataframe_list.append(bow_dataframe)
    
    features = bow_dataframe_list[0].columns
    
    for i in range(len(bow_dataframe_list)):
        features = np.intersect1d(features, bow_dataframe_list[i].columns)
    features = features.tolist()
    
    output_dataframe = bow_dataframe_list[0][features]
    
    for i in range(len(bow_dataframe_list)-1):
        output_dataframe = pd.merge(output_dataframe[features], bow_dataframe_list[i+1][features], how='outer')
        
    return output_dataframe
                
def unique_term_matrix(dataframe, corpus, ngram_range, concat_class):
    """
    """
    ngram_features = ngram_creation(corpus, ngram_range=ngram_range)

    dataframe.drop(dataframe.columns[1], axis=1, inplace=True)

    for i in range(len(ngram_features)):
        dataframe[str(i+1)+'-gram'] = ngram_features[i]


    separated_dataframe = split_dataframe(dataframe, concat_class=concat_class)

    vocab_list = token_refinement(separated_dataframe, ngram_range=ngram_range)
    vocab_list = unique_elements(vocab_list)
    vocab_list = merge_vocab(vocab_list, ngram_range=ngram_range)

    bow_dataframe = bag_of_words(dataframe, vocab_list, tf_idf=config["TF_IDF"])
    
    return bow_dataframe

In [None]:
# ---------------------------------------------------------------------------- #
# ----------------------- MACHINE LEARNING FUNCTIONS ------------------------- #
# ---------------------------------------------------------------------------- #
# Need to set this tuning function up to have an if statement for each possible model
# For XGB, need to map the classes to the note_dataframe["--classifications--"].values()
# or some variation of that.



def hyperparameter_tuning(X, y, model, param_grid):
#     f1 = make_scorer(f1_score , average='weighted')
    int_encoder = {value: i for i, value in enumerate(y.unique().tolist())}
    
    y = y.replace(y.unique().tolist(), list(range(len(y.unique()))))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config["TEST_SIZE"], random_state=config["RANDOM_STATE"])
    
    gscv = GridSearchCV(model, 
                        param_grid=param_grid,
                        scoring="accuracy",
                        cv=config["CROSS_VALIDATIONS"],
                        verbose=True,
                        n_jobs=-1,
                        error_score="raise")
    
    gscv.fit(X_train, y_train)

    print(gscv.best_params_)
    print(gscv.best_score_)
    print(gscv.best_estimator_)
    

def feature_selection(X, y, model):
    
    int_encoder = {value: i for i, value in enumerate(y.unique().tolist())}
    
    y = y.replace(y.unique().tolist(), list(range(len(y.unique()))))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config["TEST_SIZE"], random_state=config["RANDOM_STATE"])
    
    model.fit(X_train, y_train)
    
    columns = []
    feature_importance = []
    
    for i, column in enumerate(X_train):
        columns.append(column)
        feature_importance.append(model.feature_importances_[i])
    
    feature_dataframe = zip(columns, feature_importance)
    feature_dataframe = pd.DataFrame(feature_dataframe, columns=["features", "feature_importance"])
    
    feature_dataframe = feature_dataframe.sort_values("feature_importance", ascending=False)

    for row in feature_dataframe.itertuples():
        
#         print(type(row[1])) #string
#         print(type(row[2])) #float
        if row[2] > 0:
            print("Feature: {} \t Importance: {}".format(row[1], row[2]))


def model_decision_tree(X, y):
    
    model = DecisionTreeClassifier(criterion=model_hyperparameters["trial-1-uni"]["DECISION_TREE_PARAMETERS"]["criterion"],
                                   splitter=model_hyperparameters["trial-1-uni"]["DECISION_TREE_PARAMETERS"]["splitter"],
                                   max_depth=model_hyperparameters["trial-1-uni"]["DECISION_TREE_PARAMETERS"]["max_depth"],
                                   max_features=model_hyperparameters["trial-1-uni"]["DECISION_TREE_PARAMETERS"]["max_features"],
                                   min_samples_split=model_hyperparameters["trial-1-uni"]["DECISION_TREE_PARAMETERS"]["min_samples_split"],
                                   min_samples_leaf=model_hyperparameters["trial-1-uni"]["DECISION_TREE_PARAMETERS"]["min_samples_leaf"],
                                   class_weight=model_hyperparameters["trial-1-uni"]["DECISION_TREE_PARAMETERS"]["class_weight"],
                                   random_state=config["RANDOM_STATE"])
    
#     model = DecisionTreeClassifier(criterion=model_hyperparameters["trial-1-hybrid"]["DECISION_TREE_PARAMETERS"]["criterion"],
#                                    splitter=model_hyperparameters["trial-1-hybrid"]["DECISION_TREE_PARAMETERS"]["splitter"],
#                                    max_depth=model_hyperparameters["trial-1-hybrid"]["DECISION_TREE_PARAMETERS"]["max_depth"],
#                                    max_features=model_hyperparameters["trial-1-hybrid"]["DECISION_TREE_PARAMETERS"]["max_features"],
#                                    min_samples_split=model_hyperparameters["trial-1-hybrid"]["DECISION_TREE_PARAMETERS"]["min_samples_split"],
#                                    min_samples_leaf=model_hyperparameters["trial-1-hybrid"]["DECISION_TREE_PARAMETERS"]["min_samples_leaf"],
#                                    class_weight=model_hyperparameters["trial-1-hybrid"]["DECISION_TREE_PARAMETERS"]["class_weight"],
#                                    random_state=config["RANDOM_STATE"])
    
#     model = DecisionTreeClassifier(criterion=config["model_hyperparameters"]["DECISION_TREE_PARAMETERS"]["criterion"],
#                                    splitter=config["model_hyperparameters"]["DECISION_TREE_PARAMETERS"]["splitter"],
#                                    max_depth=config["model_hyperparameters"]["DECISION_TREE_PARAMETERS"]["max_depth"],
#                                    max_features=config["model_hyperparameters"]["DECISION_TREE_PARAMETERS"]["max_features"],
#                                    min_samples_split=config["model_hyperparameters"]["DECISION_TREE_PARAMETERS"]["min_samples_split"],
#                                    min_samples_leaf=config["model_hyperparameters"]["DECISION_TREE_PARAMETERS"]["min_samples_leaf"],
#                                    class_weight=config["model_hyperparameters"]["DECISION_TREE_PARAMETERS"]["class_weight"],
#                                    random_state=config["RANDOM_STATE"])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config["TEST_SIZE"], random_state=config["RANDOM_STATE"])

    model.fit(X_train, y_train)
    
    report = classification_report(y_test.tolist(), model.predict(X_test).tolist(), output_dict=True)
    
    report_dataframe = pd.DataFrame(report).transpose()
    
    model_save_file = os.path.join(config["directories"]["MLMODEL_PICKLE_OUTPUT_DIR"], 
                                   config["RUN_DATETIME"] + "-" + str(type(model).__name__) + ".pkl")
    
    csv_save_file = os.path.join(config["directories"]["DESC_STATS_OUTPUT_DIR"],
                                 config['RUN_DATETIME'] + "-" + str(type(model).__name__) + "-stats.csv")
    
    pickle.dump(model, open(model_save_file, "wb"))
    report_dataframe.to_csv(csv_save_file)
    
    
    
    
    
def model_logistic_regression(X, y):
    
    model = LogisticRegression(penalty=model_hyperparameters["trial-1-uni"]["LOG_REG_PARAMETERS"]["penalty"],
                               C=model_hyperparameters["trial-1-uni"]["LOG_REG_PARAMETERS"]["C"],
                               solver=model_hyperparameters["trial-1-uni"]["LOG_REG_PARAMETERS"]["solver"],
                               max_iter=model_hyperparameters["trial-1-uni"]["LOG_REG_PARAMETERS"]["max_iter"],
                               random_state=config["RANDOM_STATE"])
    
#     model = LogisticRegression(penalty=model_hyperparameters["trial-1-hybrid"]["LOG_REG_PARAMETERS"]["penalty"],
#                                C=model_hyperparameters["trial-1-hybrid"]["LOG_REG_PARAMETERS"]["C"],
#                                solver=model_hyperparameters["trial-1-hybrid"]["LOG_REG_PARAMETERS"]["solver"],
#                                max_iter=model_hyperparameters["trial-1-hybrid"]["LOG_REG_PARAMETERS"]["max_iter"],
#                                random_state=config["RANDOM_STATE"])
    
#     model = LogisticRegression(penalty=config["model_hyperparameters"]["LOG_REG_PARAMETERS"]["penalty"],
#                                C=config["model_hyperparameters"]["LOG_REG_PARAMETERS"]["C"],
#                                solver=config["model_hyperparameters"]["LOG_REG_PARAMETERS"]["solver"],
#                                max_iter=config["model_hyperparameters"]["LOG_REG_PARAMETERS"]["max_iter"],
#                                random_state=config["RANDOM_STATE"])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config["TEST_SIZE"], random_state=config["RANDOM_STATE"])

    model.fit(X_train, y_train)
    
    report = classification_report(y_test.tolist(), model.predict(X_test).tolist(), output_dict=True)
    
    report_dataframe = pd.DataFrame(report).transpose()
    
    model_save_file = os.path.join(config["directories"]["MLMODEL_PICKLE_OUTPUT_DIR"], 
                                   config["RUN_DATETIME"] + "-" + str(type(model).__name__) + ".pkl")
    
    csv_save_file = os.path.join(config["directories"]["DESC_STATS_OUTPUT_DIR"],
                                 config['RUN_DATETIME'] + "-" + str(type(model).__name__) + "-stats.csv")
    
    pickle.dump(model, open(model_save_file, "wb"))
    report_dataframe.to_csv(csv_save_file)
    
    
def model_svm (X, y):
    """
    """
    model = SVC(kernel=model_hyperparameters["trial-1-uni"]["SVM_PARAMETERS"]["kernel"],
                C=model_hyperparameters["trial-1-uni"]["SVM_PARAMETERS"]["C"],
                degree=model_hyperparameters["trial-1-uni"]["SVM_PARAMETERS"]["degree"],
                gamma=model_hyperparameters["trial-1-uni"]["SVM_PARAMETERS"]["gamma"],
                cache_size=model_hyperparameters["trial-1-uni"]["SVM_PARAMETERS"]["cache_size"],
                class_weight=model_hyperparameters["trial-1-uni"]["SVM_PARAMETERS"]["class_weight"],
                random_state=config["RANDOM_STATE"])
    
#     model = SVC(kernel=model_hyperparameters["trial-1-hybrid"]["SVM_PARAMETERS"]["kernel"],
#                 C=model_hyperparameters["trial-1-hybrid"]["SVM_PARAMETERS"]["C"],
#                 degree=model_hyperparameters["trial-1-hybrid"]["SVM_PARAMETERS"]["degree"],
#                 gamma=model_hyperparameters["trial-1-hybrid"]["SVM_PARAMETERS"]["gamma"],
#                 cache_size=model_hyperparameters["trial-1-hybrid"]["SVM_PARAMETERS"]["cache_size"],
#                 class_weight=model_hyperparameters["trial-1-hybrid"]["SVM_PARAMETERS"]["class_weight"],
#                 random_state=config["RANDOM_STATE"])
    
#     model = SVC(kernel=config["model_hyperparameters"]["SVM_PARAMETERS"]["kernel"],
#                 C=config["model_hyperparameters"]["SVM_PARAMETERS"]["C"],
#                 degree=config["model_hyperparameters"]["SVM_PARAMETERS"]["degree"],
#                 gamma=config["model_hyperparameters"]["SVM_PARAMETERS"]["gamma"],
#                 cache_size=config["model_hyperparameters"]["SVM_PARAMETERS"]["cache_size"],
#                 class_weight=config["model_hyperparameters"]["SVM_PARAMETERS"]["class_weight"],
#                 random_state=config["RANDOM_STATE"])
    
    X = preprocessing.scale(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config["TEST_SIZE"], random_state=config["RANDOM_STATE"])
    
    model.fit(X_train, y_train)
    
    report = classification_report(y_test.tolist(), model.predict(X_test).tolist(), output_dict=True)
    
    report_dataframe = pd.DataFrame(report).transpose()
    
    model_save_file = os.path.join(config["directories"]["MLMODEL_PICKLE_OUTPUT_DIR"], 
                                   config["RUN_DATETIME"] + "-" + str(type(model).__name__) + ".pkl")
    
    csv_save_file = os.path.join(config["directories"]["DESC_STATS_OUTPUT_DIR"],
                                 config['RUN_DATETIME'] + "-" + str(type(model).__name__) + "-stats.csv")
    
    pickle.dump(model, open(model_save_file, "wb"))
    report_dataframe.to_csv(csv_save_file)

def model_knn(X, y):
    """
    """
    model = KNeighborsClassifier(n_neighbors=model_hyperparameters["trial-1-uni"]["KNN_PARAMETERS"]["n_neighbors"],
                                 weights=model_hyperparameters["trial-1-uni"]["KNN_PARAMETERS"]["weights"],
                                 algorithm=model_hyperparameters["trial-1-uni"]["KNN_PARAMETERS"]["algorithm"],
                                 leaf_size=model_hyperparameters["trial-1-uni"]["KNN_PARAMETERS"]["leaf_size"],
                                 p=model_hyperparameters["trial-1-uni"]["KNN_PARAMETERS"]["p"],
                                 n_jobs=model_hyperparameters["trial-1-uni"]["KNN_PARAMETERS"]["n_jobs"])
    
#     model = KNeighborsClassifier(n_neighbors=model_hyperparameters["trial-1-hybrid"]["KNN_PARAMETERS"]["n_neighbors"],
#                                  weights=model_hyperparameters["trial-1-hybrid"]["KNN_PARAMETERS"]["weights"],
#                                  algorithm=model_hyperparameters["trial-1-hybrid"]["KNN_PARAMETERS"]["algorithm"],
#                                  leaf_size=model_hyperparameters["trial-1-hybrid"]["KNN_PARAMETERS"]["leaf_size"],
#                                  p=model_hyperparameters["trial-1-hybrid"]["KNN_PARAMETERS"]["p"],
#                                  n_jobs=model_hyperparameters["trial-1-hybrid"]["KNN_PARAMETERS"]["n_jobs"])
    
#     model = KNeighborsClassifier(n_neighbors=config["model-hyperparameters"]["KNN_PARAMETERS"]["n_neighbors"],
#                                  weights=config["model-hyperparameters"]["KNN_PARAMETERS"]["weights"],
#                                  algorithm=config["model-hyperparameters"]["KNN_PARAMETERS"]["algorithm"],
#                                  leaf_size=config["model-hyperparameters"]["KNN_PARAMETERS"]["leaf_size"],
#                                  p=config["model-hyperparameters"]["KNN_PARAMETERS"]["p"],
#                                  n_jobs=config["model-hyperparameters"]["KNN_PARAMETERS"]["n_jobs"])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config["TEST_SIZE"], random_state=config["RANDOM_STATE"])

    model.fit(X_train, y_train)
    
    report = classification_report(y_test.tolist(), model.predict(X_test).tolist(), output_dict=True)
    
    report_dataframe = pd.DataFrame(report).transpose()
    
    model_save_file = os.path.join(config["directories"]["MLMODEL_PICKLE_OUTPUT_DIR"], 
                                   config["RUN_DATETIME"] + "-" + str(type(model).__name__) + ".pkl")
    
    csv_save_file = os.path.join(config["directories"]["DESC_STATS_OUTPUT_DIR"],
                                 config['RUN_DATETIME'] + "-" + str(type(model).__name__) + "-stats.csv")
    
    pickle.dump(model, open(model_save_file, "wb"))
    report_dataframe.to_csv(csv_save_file)
    
def model_xgboost(X, y):
    """
    """
    model = XGBClassifier(learning_rate=model_hyperparameters["trial-1-uni"]["XGB_PARAMETERS"]["learning_rate"],
                          max_depth=model_hyperparameters["trial-1-uni"]["XGB_PARAMETERS"]["max_depth"],
                          min_child_weight=model_hyperparameters["trial-1-uni"]["XGB_PARAMETERS"]["min_child_weight"],
                          n_estimators=model_hyperparameters["trial-1-uni"]["XGB_PARAMETERS"]["n_estimators"],
                          nthread=model_hyperparameters["trial-1-uni"]["XGB_PARAMETERS"]["nthread"],
                          objective=model_hyperparameters["trial-1-uni"]["XGB_PARAMETERS"]["objective"],
                          gamma=model_hyperparameters["trial-1-uni"]["XGB_PARAMETERS"]["gamma"],
                          subsample=model_hyperparameters["trial-1-uni"]["XGB_PARAMETERS"]["subsample"],
                          colsample_bytree=model_hyperparameters["trial-1-uni"]["XGB_PARAMETERS"]["colsample_bytree"],
                          eval_metric=model_hyperparameters["trial-1-uni"]["XGB_PARAMETERS"]["eval_metric"],
                          use_label_encoder=model_hyperparameters["trial-1-uni"]["XGB_PARAMETERS"]["use_label_encoder"],
                          random_state=model_hyperparameters["RANDOM_STATE"])
    
#     model = XGBClassifier(learning_rate=model_hyperparameters["trial-1-hybrid"]["XGB_PARAMETERS"]["learning_rate"],
#                           max_depth=model_hyperparameters["trial-1-hybrid"]["XGB_PARAMETERS"]["max_depth"],
#                           min_child_weight=model_hyperparameters["trial-1-hybrid"]["XGB_PARAMETERS"]["min_child_weight"],
#                           n_estimators=model_hyperparameters["trial-1-hybrid"]["XGB_PARAMETERS"]["n_estimators"],
#                           nthread=model_hyperparameters["trial-1-hybrid"]["XGB_PARAMETERS"]["nthread"],
#                           objective=model_hyperparameters["trial-1-hybrid"]["XGB_PARAMETERS"]["objective"],
#                           gamma=model_hyperparameters["trial-1-hybrid"]["XGB_PARAMETERS"]["gamma"],
#                           subsample=model_hyperparameters["trial-1-hybrid"]["XGB_PARAMETERS"]["subsample"],
#                           colsample_bytree=model_hyperparameters["trial-1-hybrid"]["XGB_PARAMETERS"]["colsample_bytree"],
#                           eval_metric=model_hyperparameters["trial-1-hybrid"]["XGB_PARAMETERS"]["eval_metric"],
#                           use_label_encoder=model_hyperparameters["trial-1-hybrid"]["XGB_PARAMETERS"]["use_label_encoder"],
#                           random_state=model_hyperparameters["RANDOM_STATE"])   
    
#     model = XGBClassifier(learning_rate=config["model-hyperparameters"]["XGB_PARAMETERS"]["learning_rate"],
#                           max_depth=config["model-hyperparameters"]["XGB_PARAMETERS"]["max_depth"],
#                           min_child_weight=config["model-hyperparameters"]["XGB_PARAMETERS"]["min_child_weight"],
#                           n_estimators=config["model-hyperparameters"]["XGB_PARAMETERS"]["n_estimators"],
#                           nthread=config["model-hyperparameters"]["XGB_PARAMETERS"]["nthread"],
#                           objective=config["model-hyperparameters"]["XGB_PARAMETERS"]["objective"],
#                           gamma=config["model-hyperparameters"]["XGB_PARAMETERS"]["gamma"],
#                           subsample=config["model-hyperparameters"]["XGB_PARAMETERS"]["subsample"],
#                           colsample_bytree=config["model-hyperparameters"]["XGB_PARAMETERS"]["colsample_bytree"],
#                           eval_metric=config["model-hyperparameters"]["XGB_PARAMETERS"]["eval_metric"],
#                           use_label_encoder=config["model-hyperparameters"]["XGB_PARAMETERS"]["use_label_encoder"],
#                           random_state=config["RANDOM_STATE"])
    
    int_decoder = {str(i): value for i, value in enumerate(y.unique().tolist())}
    
    y = y.replace(y.unique().tolist(), list(range(len(y.unique()))))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config["TEST_SIZE"], random_state=config["RANDOM_STATE"])
    
    model.fit(X_train, y_train)
    
    
    report = classification_report(y_test.tolist(), model.predict(X_test).tolist(), output_dict=True)
    report = {(int_decoder[key] if key in int_decoder else key):value for key, value in report.items()}
    
    report_dataframe = pd.DataFrame(report).transpose()
    
    model_save_file = os.path.join(config["directories"]["MLMODEL_PICKLE_OUTPUT_DIR"], 
                                   config["RUN_DATETIME"] + "-" + str(type(model).__name__) + ".pkl")
    
    csv_save_file = os.path.join(config["directories"]["DESC_STATS_OUTPUT_DIR"],
                                 config['RUN_DATETIME'] + "-" + str(type(model).__name__) + "-stats.csv")
    
    pickle.dump(model, open(model_save_file, "wb"))
    report_dataframe.to_csv(csv_save_file)
    
                                             

In [None]:
# ---------------------------------------------------------------------------- #
# ------------------------- EXTRACTION EXECUTABLES --------------------------- #
# ---------------------------------------------------------------------------- #
@timer
def extr_execute():
    """Executes the extraction process of the TaCLE package."""
#     with open("./../../config.json", 'r') as jsonfile:
#         config = json.load(jsonfile)

    class_dict, text_dict = xml_extraction(config["directories"]["V0_DATA_IMPORT_DIR"], concat_class=config['CONCAT_CLASS'])

    note_dataframe = structure_dataframe(class_dict, text_dict)
    
    save_file = Path(str(config['directories']['EXTR_PICKLE_OUTPUT_DIR']) + ('/' + config['RUN_DATETIME'] + '-extr.pkl'))
    note_dataframe.to_pickle(save_file)

In [None]:
if __name__ == "__main__":
    extr_execute()

In [None]:
# ---------------------------------------------------------------------------- #
# ----------------------------- NLP EXECUTABLES ------------------------------ #
# ---------------------------------------------------------------------------- #

@timer
def nlp_execute(refinement: str="none"):
    """Executes the natural language processing tasks of the TaCLE package."""

#     with open("./../config.json", 'r') as jsonfile:
#         config = json.load(jsonfile)

#     nltk.download('stopwords')

    load_file = Path(str(config['directories']['EXTR_PICKLE_OUTPUT_DIR']) + ('/' + config['RUN_DATETIME'] + '-extr.pkl'))

    note_dataframe = pd.read_pickle(load_file)

    note_corpus = note_dataframe['content'].tolist()
    note_corpus = text_preprocessing(note_corpus)

    print(note_dataframe['--classification--'].value_counts())
   
    
    if refinement == "none":
        bow_dataframe = term_matrix(note_dataframe, 
                                    note_corpus, 
                                    ngram_range=config["NGRAM"],
                                    matrix_type=config["MATRIX_TYPE"])

    elif refinement == "shared":
        bow_dataframe = shared_term_matrix(note_dataframe,
                                           note_corpus,
                                           ngram_range=config["NGRAM"],
                                           matrix_type=config["MATRIX_TYPE"])
    
    elif refinement == "unique":
        bow_dataframe = unique_term_matrix(note_dataframe, 
                                           note_corpus, 
                                           ngram_range=config["NGRAM"],
                                           matrix_type=config["MATRIX_TYPE"])
        
    save_file = Path(str(config["directories"]["NLP_PICKLE_OUTPUT_DIR"]) + ('/' + config["RUN_DATETIME"] + '-nlp.pkl'))
    bow_dataframe.to_pickle(save_file)
    

In [None]:
if __name__ == "__main__":
    nlp_execute(refinement=config["REFINEMENT"])

In [None]:
# ---------------------------------------------------------------------------- #
# ------------------------- MODEL TUNING EXECUTABLES ------------------------- #
# ---------------------------------------------------------------------------- #
@timer
def ml_tuning(model, matrix_type: str):
    
    load_file = Path(str(config['directories']['NLP_PICKLE_OUTPUT_DIR']) + ('/' + config['RUN_DATETIME'] + '-nlp.pkl'))

    bow_dataframe = pd.read_pickle(load_file)
    
    X = bow_dataframe.iloc[:, 0:bow_dataframe.shape[1]-1]
    y = bow_dataframe.iloc[:, bow_dataframe.shape[1]-1]
    
    hyperparameter_tuning(X, y, model, matrix_type)


In [None]:
# ---------------------------------------------------------------------------- #
# ----------------------- MACHINE LEARNING EXECUTABLES ----------------------- #
# ---------------------------------------------------------------------------- #
@timer
def ml_execute(matrix_type: str):
    """Executes the machine learning tasks of the TaCLE package."""
    
#             with open("./../config.json", 'r') as jsonfile:
#                 config = json.load(jsonfile)

    load_file = Path(str(config['directories']['NLP_PICKLE_OUTPUT_DIR']) + ('/' + config['RUN_DATETIME'] + '-nlp.pkl'))

    bow_dataframe = pd.read_pickle(load_file)
    
    X = bow_dataframe.iloc[:, 0:bow_dataframe.shape[1]-1]
    y = bow_dataframe.iloc[:, bow_dataframe.shape[1]-1]
    
    model_decision_tree(X, y)
    model_knn(X, y)
    model_logistic_regression(X, y)
    model_svm(X, y)
    model_xgboost(X, y)
    
    
    
    
#     y = pd.get_dummies(y)
    
#     for i in range(1, 6):
#         model_decision_tree(X, y)
#         print(' ---------- ')
    
#     feature_selection(X, y, model)

#     X_train, X_val, X_test, y_train, y_val, y_test = split_data(bow_dataframe, 
#                                                                 test_size=config["TEST_SIZE"], 
#                                                                 val_size=config["VALIDATION_SIZE"],
#                                                                 random_state=config["RANDOM_STATE"])

    
    
#     y_train_pred = model.predict_proba(X_train)[:, 1]
#     y_val_pred = model.predict_proba(X_test)[:, 1]
    
#     print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
#                                                         roc_auc_score(y_test, y_val_pred)))
   
    

In [None]:
if __name__ == "__main__":
    
        
    load_file = Path(str(config['directories']['NLP_PICKLE_OUTPUT_DIR']) + ('/' + config['RUN_DATETIME'] + '-nlp.pkl'))

    bow_dataframe = pd.read_pickle(load_file)
    
    X = bow_dataframe.iloc[:, 0:bow_dataframe.shape[1]-1]
    y = bow_dataframe.iloc[:, bow_dataframe.shape[1]-1]
    
    print("DecisionTree")
    hyperparameter_tuning(X, y, model=DecisionTreeClassifier(), param_grid=config["parameter-tuning"]["DECISION_TREE_TUNING"])
    print("----------\n")
        
    print("KNeighbors")
    hyperparameter_tuning(X, y, model=KNeighborsClassifier(), param_grid=config["parameter-tuning"]["KNN_TUNING"])
    print("----------\n")
        
    print("LogisticRegression")
    hyperparameter_tuning(X, y, model=LogisticRegression(), param_grid=config["parameter-tuning"]["LOG_REG_TUNING"])
    print("----------\n")
        
    print("SVC")
    hyperparameter_tuning(X, y, model=SVC(), param_grid=config["parameter-tuning"]["SVM_TUNING"])
    print("----------\n")
    
    print("XGBoost")
    hyperparameter_tuning(X, y, model=XGBClassifier(use_label_encoder=False), param_grid=config["parameter-tuning"]["XGB_TUNING"])
    print("----------\n")
    

In [None]:
if __name__ == "__main__":
    ml_execute(matrix_type=config["MATRIX_TYPE"])

In [None]:
# Binary (feeding, na), unigram

# DecisionTree
# Fitting 5 folds for each of 9504 candidates, totalling 47520 fits
# {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 50, 'max_features': 'auto', 'min_samples_leaf': 8, 'min_samples_split': 14, 'splitter': 'best'}
# 0.8501298701298701

# KNeighbors
# Fitting 5 folds for each of 768 candidates, totalling 3840 fits
# {'algorithm': 'ball_tree', 'leaf_size': 10, 'n_jobs': -1, 'n_neighbors': 10, 'p': 2, 'weights': 'distance'}
# 0.8397653958944282

# LogisticRegression
# Fitting 5 folds for each of 560 candidates, totalling 2800 fits
# {'C': 11.288378916846883, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'}
# 0.8733724340175953

# SVC
# Fitting 5 folds for each of 4800 candidates, totalling 24000 fits
# {'C': 4.281332398719396, 'cache_size': 100, 'class_weight': 'balanced', 'degree': 1, 'gamma': 'scale', 'kernel': 'poly'}
# 0.8720737327188939

# XGBoost
# Fitting 5 folds for each of 5184 candidates, totalling 25920 fits
# {'colsample_bytree': 0.3, 'eval_metric': 'mlogloss', 'gamma': 0.2, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 1, 'objective': 'binary:logistic', 'subsample': 0.3, 'use_label_encoder': False}
# 0.8669208211143695

# ------------------------------
# Binary (feeding, na), hybrid-gram

# DecisionTree
# Fitting 5 folds for each of 9504 candidates, totalling 47520 fits
# {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 50, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 12, 'splitter': 'random'}
# 0.8462421449518225

# KNeighbors
# Fitting 5 folds for each of 768 candidates, totalling 3840 fits
# {'algorithm': 'auto', 'leaf_size': 10, 'n_jobs': -1, 'n_neighbors': 10, 'p': 2, 'weights': 'distance'}
# 0.8462337662337662

# LogisticRegression
# Fitting 5 folds for each of 560 candidates, totalling 2800 fits
# {'C': 11.288378916846883, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}
# 0.8682111436950146

# SVC
# Fitting 5 folds for each of 4800 candidates, totalling 24000 fits
# {'C': 1.623776739188721, 'cache_size': 100, 'class_weight': 'balanced', 'degree': 0, 'gamma': 'scale', 'kernel': 'sigmoid'}
# 0.8746627565982404

# XGBoost
# Fitting 5 folds for each of 5184 candidates, totalling 25920 fits
# {'colsample_bytree': 0.2, 'eval_metric': 'mlogloss', 'gamma': 0.3, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 1, 'objective': 'binary:logistic', 'subsample': 0.3, 'use_label_encoder': False}
# 0.8656388772517805

# ------------------------------
# Multiclass (breast, bottle, na), unigram

# DecisionTree
# Fitting 5 folds for each of 9504 candidates, totalling 47520 fits
# {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 7, 'splitter': 'best'}
# 0.8023460410557185

# KNearestNeighbors
# Fitting 5 folds for each of 768 candidates, totalling 3840 fits
# {'algorithm': 'auto', 'leaf_size': 10, 'n_jobs': -1, 'n_neighbors': 20, 'p': 2, 'weights': 'distance'}
# 0.7764893171344784

# LogisticRegression
# Fitting 5 folds for each of 560 candidates, totalling 2800 fits
# {'C': 4.281332398719396, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
# 0.8500879765395893

# SVC
# Fitting 5 folds for each of 4800 candidates, totalling 24000 fits
# {'C': 4.281332398719396, 'cache_size': 100, 'class_weight': 'balanced', 'degree': 1, 'gamma': 'scale', 'kernel': 'poly'}
# 0.8333305404273146

# XGBoost
# Fitting 5 folds for each of 5184 candidates, totalling 25920 fits
# {'colsample_bytree': 0.2, 'eval_metric': 'mlogloss', 'gamma': 0.1, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 150, 'nthread': 1, 'objective': 'multi:softmax', 'subsample': 0.3, 'use_label_encoder': False}
# 0.8669208211143694

# ------------------------------
#Multiclass (breast, bottle, na), hybrid-gram

# DecisionTree
# Fitting 5 folds for each of 9504 candidates, totalling 47520 fits
# {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 8, 'min_samples_split': 2, 'splitter': 'best'}
# 0.7997235023041475

# KNeighbors
# Fitting 5 folds for each of 768 candidates, totalling 3840 fits
# {'algorithm': 'ball_tree', 'leaf_size': 10, 'n_jobs': -1, 'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
# 0.7958609132802681

# LogisticRegression
# Fitting 5 folds for each of 560 candidates, totalling 2800 fits
# {'C': 4.281332398719396, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'}
# 0.854017595307918

# SVC
# Fitting 5 folds for each of 4800 candidates, totalling 24000 fits
# {'C': 1.623776739188721, 'cache_size': 100, 'class_weight': 'balanced', 'degree': 0, 'gamma': 'scale', 'kernel': 'sigmoid'}
# 0.8552911604524507

# XGBoost
# Fitting 5 folds for each of 5184 candidates, totalling 25920 fits
# {'colsample_bytree': 0.2, 'eval_metric': 'mlogloss', 'gamma': 0.2, 'max_depth': 2, 'min_child_weight': 2, 'n_estimators': 200, 'nthread': 1, 'objective': 'multi:softmax', 'subsample': 0.3, 'use_label_encoder': False}
# 0.8720988688730623

# ------------------------------
#Multiclass (breast, bottle, express, na), unigram

# DecisionTree
# Fitting 5 folds for each of 9504 candidates, totalling 47520 fits
# {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 9, 'splitter': 'best'}
# 0.745429409300377

# KNearestNeighbors
# Fitting 5 folds for each of 768 candidates, totalling 3840 fits
# {'algorithm': 'auto', 'leaf_size': 10, 'n_jobs': -1, 'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
# 0.760980310012568

#LogisticRegression
# Fitting 5 folds for each of 560 candidates, totalling 2800 fits
# {'C': 1.623776739188721, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
# 0.8346041055718475

#SVC
# Fitting 5 folds for each of 4800 candidates, totalling 24000 fits
# {'C': 4.281332398719396, 'cache_size': 100, 'class_weight': 'balanced', 'degree': 0, 'gamma': 'scale', 'kernel': 'rbf'}
# 0.8087892752408882

#XGBoost
# Fitting 5 folds for each of 5184 candidates, totalling 25920 fits
# {'colsample_bytree': 0.2, 'eval_metric': 'mlogloss', 'gamma': 0.2, 'max_depth': 2, 'min_child_weight': 2, 'n_estimators': 150, 'nthread': 1, 'objective': 'multi:softmax', 'subsample': 0.3, 'use_label_encoder': False}
# 0.854025974025974

# ------------------------------
#Multiclass (breast, bottle, express, na), hybrid-gram

#DecisionTree
# Fitting 5 folds for each of 9504 candidates, totalling 47520 fits
# {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 9, 'splitter': 'best'}
# 0.7428906577293674

#KNearestNeighbors
# Fitting 5 folds for each of 768 candidates, totalling 3840 fits
# {'algorithm': 'auto', 'leaf_size': 10, 'n_jobs': -1, 'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
# 0.7842312526183495

#LogisticRegression
# Fitting 5 folds for each of 560 candidates, totalling 2800 fits
# {'C': 11.288378916846883, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'}
# 0.8385085881860075

#SVC
# Fitting 5 folds for each of 4800 candidates, totalling 24000 fits
# {'C': 1438.44988828766, 'cache_size': 100, 'class_weight': 'balanced', 'degree': 0, 'gamma': 'auto', 'kernel': 'rbf'}
# 0.8243150397989109

#XGBoost
# Fitting 5 folds for each of 5184 candidates, totalling 25920 fits
# {'colsample_bytree': 0.3, 'eval_metric': 'mlogloss', 'gamma': 0.2, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 150, 'nthread': 1, 'objective': 'multi:softmax', 'subsample': 0.3, 'use_label_encoder': False}
# 0.8514453288646837

# ------------------------------


In [None]:
#     model = LogisticRegression(penalty=config["machine-learning"]["LOG_REG_PARAMETERS"]["penalty"],
#                        C=config["machine-learning"]["LOG_REG_PARAMETERS"]["C"],
#                        solver=config["machine-learning"]["LOG_REG_PARAMETERS"]["solver"],
#                        max_iter=config["machine-learning"]["LOG_REG_PARAMETERS"]["max_iter"])
    
#     model = KNeighborsClassifier(n_neightbors=config["machine-learning"]["KNN_PARAMETERS"]["n_neighbors"],
#                                  weights=config["machine-learning"]["KNN_PARAMETERS"]["weights"],
#                                  algorithm=config["machine-learning"]["KNN_PARAMETERS"]["algorithm"],
#                                  leaf_size=config["machine-learning"]["KNN_PARAMETERS"]["leaf_size"],
#                                  p=config["machine-learning"]["KNN_PARAMETERS"]["p"],
#                                  n_jobs=config["machine-learning"]["KNN_PARAMETERS"]["n_jobs"])

#     model = DecisionTreeClassifier(criterion=config["machine-learning"]["DECISION_TREE_PARAMETERS"]["criterion"],
#                                    splitter=config["machine-learning"]["DECISION_TREE_PARAMETERS"]["splitter"],
#                                    max_depth=config["machine-learning"]["DECISION_TREE_PARAMETERS"]["max_depth"],
#                                    max_features=config["machine-learning"]["DECISION_TREE_PARAMETERS"]["max_features"],
#                                    min_samples_split=config["machine-learning"]["DECISION_TREE_PARAMETERS"]["min_samples_split"],
#                                    min_samples_leaf=config["machine-learning"]["DECISION_TREE_PARAMETERS"]["min_samples_leaf"],
#                                    class_weight=config["machine-learning"]["DECISION_TREE_PARAMETERS"]["class_weight"])
    
#     model = SVC(kernel=config["machine-learning"]["SVM_PARAMETERS"]["kernel"],
#             C=config["machine-learning"]["SVM_PARAMETERS"]["C"],
#             degree=config["machine-learning"]["SVM_PARAMETERS"]["degree"],
#             gamma=config["machine-learning"]["SVM_PARAMETERS"]["gamma"],
#             cache_size=config["machine-learning"]["SVM_PARAMETERS"]["cache_size"],
#             class_weight=config["machine-learning"]["SVM_PARAMETERS"]["class_weight"])

In [None]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

In [None]:
# (968, 886)
# Fitting 10 folds for each of 9504 candidates, totalling 95040 fits
# {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5, 'splitter': 'best'}
# 0.7895386251076795
# DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
#                        max_depth=15, max_features='auto', min_samples_leaf=2,
#                        min_samples_split=5)
#  ---------- 
# Fitting 10 folds for each of 9504 candidates, totalling 95040 fits
# {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 2, 'splitter': 'best'}
# 0.792258076511153
# DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
#                        max_depth=15, max_features='auto', min_samples_leaf=5)
#  ---------- 
# Fitting 10 folds for each of 9504 candidates, totalling 95040 fits
# {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 3, 'splitter': 'best'}
# 0.7936551796694611
# DecisionTreeClassifier(class_weight='balanced', max_depth=10,
#                        max_features='sqrt', min_samples_leaf=4,
#                        min_samples_split=3)
#  ---------- 
# Fitting 10 folds for each of 9504 candidates, totalling 95040 fits
# {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 9, 'splitter': 'best'}
# 0.7931437420045309
# DecisionTreeClassifier(class_weight='balanced', max_depth=15,
#                        max_features='sqrt', min_samples_leaf=2,
#                        min_samples_split=9)
#  ---------- 
# Fitting 10 folds for each of 9504 candidates, totalling 95040 fits
# {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 40, 'max_features': 'auto', 'min_samples_leaf': 6, 'min_samples_split': 6, 'splitter': 'best'}
# 0.792533826878633
# DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
#                        max_depth=40, max_features='auto', min_samples_leaf=6,
#                        min_samples_split=6)
#  ---------- 
# Run time (seconds):  5859.5219454000035