## Load in data


In [6]:
import numpy as np
import pandas as pd
import sys
import json
from sklearn.preprocessing import LabelEncoder
from types import SimpleNamespace

# Collects the evaluation results of every model that is run, keyed by model name.
logged_results = dict()

# Read the dataset selected in the input parameters.
df = pd.read_csv("data/{}.csv".format(args['name_of_data_file']))

# Replace missing values with a single space and switch to a NumPy array.
np_data = df.fillna(' ').values

# Keep only the first 'data_limit' rows. Column 0 holds the company descriptions;
# the label column is 1 when full SNI numbers are used, otherwise 3.
company_descriptions = np_data[:args['data_limit'], 0]
if args['use_full_SNI_numbers']:
    sni_numbers_column = np_data[:args['data_limit'], 1]
else:
    sni_numbers_column = np_data[:args['data_limit'], 3]

# The company descriptions are the (still unprocessed) model input.
X_raw = company_descriptions

###

# The following nltk packages need to be installed. To install them, uncomment the
# lines below and run them after the 'import nltk' statement:
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
import nltk
import time

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re

# Initiate lemmatizer and stemmer used in NLP operations
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Applies NLP operations to the provided input data and returns a new input data object.
# Only needs to be done once if the processed dataset is saved to a new CSV file and
# then loaded at the start of the script.
def ApplyNLPtoInputTextData(X, print_index_interval):
    """Clean every text in X and return the cleaned texts as a new array.

    Each entry is converted to a string, lowercased, stripped of '<...>' spans and
    of digits, tokenized on word characters, and filtered so that only tokens
    longer than two characters that are not in NLTK's Swedish stopword list remain.

    Args:
        X: NumPy array of input texts. It is not modified.
        print_index_interval: A progress line is printed every this many rows.

    Returns:
        An array created with np.empty_like(X) that holds, for every entry of X,
        the kept tokens joined by single spaces.
    """
    # Loop-invariant objects are built once instead of once per row.
    tag_pattern = re.compile('<.*?>')
    number_pattern = re.compile('[0-9]+')
    tokenizer = RegexpTokenizer(r'\w+')

    print("Applying NLP to input data...")
    time_before_operation = time.perf_counter()

    # stopwords.words() re-reads the corpus file on every call, so it is loaded a
    # single time and stored in a set for constant-time membership tests.
    swedish_stopwords = frozenset(stopwords.words('swedish'))

    # Takes a single input data string and applies the NLP operations.
    def Preprocess(sentence):
        text = str(sentence).lower()
        text = tag_pattern.sub('', text)      # Drops '<...>' spans
        text = number_pattern.sub('', text)   # Drops digits
        tokens = tokenizer.tokenize(text)
        kept_words = [w for w in tokens if len(w) > 2 and w not in swedish_stopwords]
        # NOTE(review): the module-level `stemmer` and `lemmatizer` are not applied;
        # the output is the stopword-filtered tokens. Confirm whether stemming or
        # lemmatization was meant to be part of the output before enabling it.
        return " ".join(kept_words)

    X_NLP_applied = np.empty_like(X)
    for index, company_description in enumerate(X):
        X_NLP_applied[index] = Preprocess(company_description)
        if index % print_index_interval == print_index_interval - 1:
            # Elapsed time is computed inline because this function is invoked at
            # module level before the GetTimeOfOperation helper has been defined.
            elapsed = time.perf_counter() - time_before_operation
            print("Reached index {} ({} seconds)".format(index, elapsed))
    elapsed = time.perf_counter() - time_before_operation
    print("Finished applying NLP to input data ({} seconds)".format(elapsed))
    return X_NLP_applied

# Saves a new CSV file where the input data column has been replaced (all other columns are
# copied from the original data).
def OutputCSVWithReplacedXColumn(np_data, X_raw, new_csv_name="data/new_data.csv"):
    """Write np_data to a CSV file with column 0 replaced by X_raw.

    Args:
        np_data: 2-D NumPy array holding the original data. It is not modified.
        X_raw: Replacement values for column 0. Only the first len(X_raw) rows of
            np_data are written.
        new_csv_name: Path of the CSV file to write.
    """
    time_before_operation = time.perf_counter()
    # Slicing returns a view, so the slice is copied before column 0 is replaced;
    # otherwise the assignment would overwrite the caller's np_data.
    cropped_np_data = np_data[0:len(X_raw)].copy()
    cropped_np_data[:, 0] = X_raw
    # Convert array into dataframe
    new_DF = pd.DataFrame(cropped_np_data)
    # Save the dataframe as a CSV file (without the row index)
    new_DF.to_csv(new_csv_name, index=False)
    # Elapsed time is computed inline because this function is invoked at module
    # level before the GetTimeOfOperation helper has been defined.
    elapsed = time.perf_counter() - time_before_operation
    print("Finished output of CSV file '{}' ({} seconds)".format(new_csv_name, elapsed))

# Runs the NLP preprocessing and writes the processed data to a new CSV file, but only
# when name_of_data_file is the original dataset "activitytext_sni". Any other dataset
# is assumed to have been processed already.
if args['name_of_data_file'] == "activitytext_sni":
    # Progress is printed every 5000 rows.
    X_raw = ApplyNLPtoInputTextData(X_raw, 5000)
    OutputCSVWithReplacedXColumn(np_data, X_raw)

# Function used to output a list of all the most common words in the input data.
# Only used once to facilitate creating a custom stopwords list.
def OutputListOfMostCommonWords(X, top_range, print_index_interval):
    """Print the most frequent words in X and write them to 'data/top_words_list.txt'.

    Every output line has the form '<word>: <count>', most frequent word first.

    Args:
        X: Iterable of input text strings.
        top_range: Maximum number of words to output. If the data contains fewer
            distinct words, all of them are output.
        print_index_interval: A progress line is printed every this many rows.
    """
    # Imported locally so the function does not depend on a later cell's imports.
    from collections import Counter

    time_before_operation = time.perf_counter()
    word_counts = Counter()
    tokenizer = RegexpTokenizer(r'\w+')
    for index, row in enumerate(X):
        # Counter.update counts every occurrence, including the first one.
        word_counts.update(tokenizer.tokenize(row))
        if index % print_index_interval == print_index_interval - 1:
            print("Reached index {} ({} seconds)".format(index, GetTimeOfOperation(time_before_operation)))
    print("Ordering list of top words...)")
    # most_common sorts once and copes with top_range exceeding the vocabulary size.
    top_list_for_prints = [
        word + ': ' + str(count) for word, count in word_counts.most_common(top_range)
    ]
    print(top_list_for_prints)
    with open('data/top_words_list.txt', 'w') as f:
        for line in top_list_for_prints:
            f.write(f"{line}\n")
    print("Finished output of custom stopwords TXT file '{}' ({} seconds)".format('top_words_list', GetTimeOfOperation(time_before_operation)))

# Takes a custom stopwords list file and removes the listed words from the input data.
# Lines in the file that are prefixed with a hashtag are ignored, so the words on those
# lines are kept in the input data.
def RemoveCustomStopwords(X, name_of_custom_stopwords_file, print_index_interval):
    """Return a copy of X in which the custom stopwords have been removed.

    The stopwords are read from 'data/<name_of_custom_stopwords_file>.txt', one per
    line. Empty lines and lines starting with '#' are skipped, and only the text
    before the first ':' of a line is used as the stopword. Tokens of two characters
    or fewer are removed as well.

    Args:
        X: NumPy array of input text strings. It is not modified.
        name_of_custom_stopwords_file: File name without directory and extension.
        print_index_interval: A progress line is printed every this many rows.

    Returns:
        An array created with np.empty_like(X) that holds the filtered texts, with
        the kept tokens joined by single spaces.
    """
    # The context manager closes the file even if reading fails.
    with open('data/' + name_of_custom_stopwords_file + '.txt', "r") as stopwords_file:
        data = stopwords_file.read()
    # A set gives constant-time membership tests in the per-token filter below.
    custom_stopwords = {
        line.split(':')[0]
        for line in data.split('\n')
        if len(line) > 0 and not line.startswith('#')
    }

    print("Removing custom stopwords from input data...")
    time_before_operation = time.perf_counter()
    X_out = np.empty_like(X)
    tokenizer = RegexpTokenizer(r'\w+')
    for index, company_description in enumerate(X):
        tokens = tokenizer.tokenize(company_description)
        X_out[index] = " ".join(
            w for w in tokens if len(w) > 2 and w not in custom_stopwords
        )
        if index % print_index_interval == print_index_interval - 1:
            print("Reached index {} ({} seconds)".format(index, GetTimeOfOperation(time_before_operation)))
    print("Finished removing custom stopwords ({} seconds)".format(GetTimeOfOperation(time_before_operation)))
    return X_out

# Utility function that reports how many seconds have passed since an operation started.
def GetTimeOfOperation(time_before_operation):
    """Return the seconds elapsed since the time.perf_counter() reading passed in."""
    current_time = time.perf_counter()
    elapsed_seconds = current_time - time_before_operation
    return elapsed_seconds

# Uncomment to output a list of the most common words in the input data.
#OutputListOfMostCommonWords(X_raw, 10000, 1000)

# When a custom stopwords list has been assigned, strip those words from the input data
# (progress is printed every 100000 rows).
if args['name_of_custom_stopwords_file']:
    X_raw = RemoveCustomStopwords(
        X_raw,
        args['name_of_custom_stopwords_file'],
        100000,
    )

# Uncomment to output a new CSV file with a replaced input data column.
#OutputCSVWithReplacedXColumn(np_data, X_raw)

# Encode the class label strings as integers; the fitted encoder is kept in 'encoder'.
y_raw = sni_numbers_column
encoder = LabelEncoder()
y = encoder.fit_transform(y_raw)

# Make sure the input data is a flat vector.
X_raw = np.ravel(X_raw)
#print("Examples: {}".format(X_raw.shape[0]))
#print("Possible categories:",np.unique(y_raw),"encoded to",np.unique(y))

IndexError: list index out of range

## Filter low occurring labels

In [2]:
# If you get the error 'ModuleNotFoundError: No module named 'numpy_indexed',
# open the Anaconda Prompt and run the following command:
# 'conda install numpy-indexed -c conda-forge'
import numpy_indexed as npi
if not args['filter_low_occuring_labels']['on']:
    print('Skipped filtering of low occuring labels.')
else:
    print('Initiating filtering of low occuring labels...')
    train_y, train_X = y, X_raw
    # Keep only the samples whose label occurs at least 10 times in the data.
    samples_mask = npi.multiplicity(train_y) >= 10
    X_raw = train_X[samples_mask]
    y = train_y[samples_mask]

## Convert to bag of words

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary of X_raw and build the document-term count matrix from it.
count_vect = CountVectorizer()
X = count_vect.fit_transform(X_raw)
# Shape is (number of documents, number of distinct terms).
print(X.shape)

(615158, 159515)


## Convert from occurrences to frequencies

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the count matrix and apply it in a single step; the
# fitted transformer stays available as 'tf_transformer'.
tf_transformer = TfidfTransformer()
X = tf_transformer.fit_transform(X)
print(X.shape)

(615158, 159515)


## Split into training and test data


In [5]:
from sklearn.model_selection import train_test_split
from collections import Counter
# Show the label distribution of the complete dataset.
print(y)
print(Counter(y))
print('full y')

# Split into training and test data. The split is stratified on the labels only when
# this is switched on in the input parameters (stratify=None means no stratification).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=args["split_training_and_testing_data"]["test_size"],
    stratify=(y if args["split_training_and_testing_data"]["stratify"] else None),
    random_state=args["random_state"],
)

# Show the label distribution of the training split and the shapes of all four parts.
print(Counter(y_train))
print('trained y ')
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

[60 60 58 ... 56 55 56]
Counter({60: 62710, 58: 61298, 38: 50310, 41: 35313, 53: 32525, 40: 31886, 61: 31019, 55: 24761, 59: 21544, 74: 21395, 48: 20450, 36: 18236, 39: 15116, 42: 14857, 64: 13585, 83: 12438, 73: 11450, 70: 9186, 77: 8230, 63: 8021, 0: 7276, 67: 6764, 50: 6638, 80: 6506, 22: 6207, 49: 5334, 57: 5054, 66: 4991, 47: 4364, 1: 3878, 30: 3755, 62: 3127, 45: 3058, 54: 2804, 7: 2658, 71: 2646, 37: 2439, 25: 2301, 13: 2268, 76: 2180, 31: 1966, 68: 1893, 29: 1856, 75: 1615, 15: 1503, 28: 1266, 23: 1146, 82: 1116, 19: 1067, 65: 865, 43: 810, 20: 791, 52: 753, 69: 748, 24: 745, 26: 739, 34: 732, 8: 670, 17: 658, 10: 622, 27: 604, 2: 423, 11: 395, 5: 384, 46: 357, 21: 303, 14: 276, 81: 247, 44: 245, 79: 230, 33: 229, 78: 208, 56: 182, 12: 167, 6: 136, 51: 129, 18: 127, 35: 107, 32: 92, 72: 73, 9: 43, 4: 33, 16: 29})
full y
Counter({60: 50037, 58: 49057, 38: 40338, 41: 28255, 53: 26002, 40: 25451, 61: 24771, 55: 19920, 59: 17252, 74: 17160, 48: 16359, 36: 14537, 39: 12094, 42: 1189

## UnderSampling

In [None]:
# If you get the error 'ModuleNotFoundError: No module named 'imblearn'',
# open the Anaconda Prompt and run the following command:
# 'conda install -c conda-forge imbalanced-learn'
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

block_name = 'under_sampling'
if args[block_name]['on']:
    print('Initiating {}...'.format(block_name))
    print(X_train.shape)
    print(y_train.shape)

    undersample = RandomUnderSampler(
        sampling_strategy=args[block_name]["sampling_strategy"],
        random_state=args["random_state"])
    # Only the training split is resampled, like in the other sampling cells.
    # Resampling the full X and y at this point (after the split) would leave the
    # training data untouched and would make y differ in length from X_raw.
    X_train, y_train = undersample.fit_resample(X_train, y_train.ravel())
    print(Counter(y_train))
    print(X_train.shape)
    print(y_train.shape)
    print('Finished {}.'.format(block_name))
else:
    print('Skipped {}.'.format(block_name))

## RandomOverSampling

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

block_name = 'random_oversampling'
if not args[block_name]['on']:
    print('Skipped {}.'.format(block_name))
else:
    print('Initiating {}...'.format(block_name))

    ros = RandomOverSampler(
        sampling_strategy=args['random_oversampling']["sampling_strategy"],
        random_state=args["random_state"],
    )
    # The training split is resampled twice with the same sampler; the resulting
    # shape and label distribution are printed after each round.
    for sampling_round in range(2):
        if sampling_round == 1:
            print('second round')
        X_train, y_train = ros.fit_resample(X_train, y_train.ravel())
        print(y_train.shape)
        print(Counter(y_train))
    print('Finished {}.'.format(block_name))

## Combine Under and Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

block_name = 'combine_under_and_oversampling'
if args[block_name]['on']:
    print('Initiating {}...'.format(block_name))
    # Both samplers get the shared random_state, like the samplers in the other
    # sampling cells, so that repeated runs give the same resampled training data.
    over = RandomOverSampler(
        sampling_strategy=args[block_name]["over_sampling_strategy"],
        random_state=args["random_state"])
    # First oversample the training split...
    X_train, y_train = over.fit_resample(X_train, y_train.ravel())

    under = RandomUnderSampler(
        sampling_strategy=args[block_name]["under_sampling_strategy"],
        random_state=args["random_state"])
    # ...then undersample the oversampled result.
    X_train, y_train = under.fit_resample(X_train, y_train.ravel())
    print('Finished {}.'.format(block_name))
else:
    print('Skipped {}.'.format(block_name))

## SMOTE

In [6]:
# If you get the error 'ModuleNotFoundError: No module named 'imblearn'',
# open the Anaconda Prompt and run the following command:
# 'conda install -c conda-forge imbalanced-learn'
# (SMOTE is part of the imbalanced-learn package; there is no separate package to install.)
from imblearn.over_sampling import SMOTE
from collections import Counter

block_name = 'SMOTE'
if not args[block_name]['on']:
    print('Skipped {}.'.format(block_name))
else:
    print('Initiating {}...'.format(block_name))
    # Shapes of the complete dataset, printed for reference.
    print(X.shape)
    print(y.shape)
    sm = SMOTE(
        sampling_strategy=args[block_name]["sampling_strategy"],
        k_neighbors=args[block_name]["k_neighbors"],
        random_state=args["random_state"],
    )
    # Synthetic samples are generated for the training split only.
    X_train, y_train = sm.fit_resample(X_train, y_train.ravel())
    # Label distribution and shapes of the training split after resampling.
    print(Counter(y_train))
    print(X_train.shape)
    print(y_train.shape)
    print('Finished {}.'.format(block_name))

(615158, 159515)
(615158,)
Counter({50: 50037, 41: 50037, 38: 50037, 58: 50037, 67: 50037, 60: 50037, 55: 50037, 61: 50037, 74: 50037, 53: 50037, 1: 50037, 48: 50037, 42: 50037, 59: 50037, 64: 50037, 73: 50037, 30: 50037, 40: 50037, 66: 50037, 0: 50037, 25: 50037, 45: 50037, 76: 50037, 39: 50037, 83: 50037, 70: 50037, 22: 50037, 49: 50037, 47: 50037, 36: 50037, 62: 50037, 77: 50037, 24: 50037, 31: 50037, 63: 50037, 13: 50037, 34: 50037, 23: 50037, 37: 50037, 29: 50037, 8: 50037, 52: 50037, 54: 50037, 19: 50037, 80: 50037, 14: 50037, 71: 50037, 43: 50037, 17: 50037, 28: 50037, 68: 50037, 7: 50037, 10: 50037, 65: 50037, 26: 50037, 75: 50037, 79: 50037, 27: 50037, 15: 50037, 57: 50037, 82: 50037, 35: 50037, 69: 50037, 44: 50037, 20: 50037, 32: 50037, 46: 50037, 5: 50037, 2: 50037, 4: 50037, 56: 50037, 81: 50037, 21: 50037, 12: 50037, 33: 50037, 18: 50037, 11: 50037, 6: 50037, 78: 50037, 9: 50037, 51: 50037, 16: 50037, 72: 50037})
(4153071, 159515)
(4153071,)


## Function for evaluating model accuracy

In [7]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

def evaluateTesting(model, name):
    """Evaluate a model on the train/test split and with 5-fold CV, and log the results.

    The model is fitted on the module-level X_train/y_train and scored on
    X_test/y_test. After that, cross_val_predict is run with cv=5 on the
    module-level X and y. For both evaluations the accuracy, the confusion matrix
    and the classification report are printed and stored in
    logged_results[name] under the keys 'split' and '5_fold_cv'.

    Args:
        model: Estimator with fit/predict that cross_val_predict accepts.
        name: Key under which the results are stored in logged_results.
    """

    # Scores one set of predictions, prints the metrics and returns the log entry.
    # The true labels are passed in explicitly so that the entry never depends on
    # which evaluation happened to run last.
    def ScoreAndReport(y_true, y_pred, accuracy_label):
        accuracy = accuracy_score(y_true, y_pred)
        print("%s: %.2f%%" % (accuracy_label, accuracy * 100.0))
        print("Confusion Matrix:")
        conf_mx = confusion_matrix(y_true, y_pred)
        print(conf_mx)
        # The report is computed once and used for both printing and logging.
        report = classification_report(y_true, y_pred)
        print(report)

        entry = {'accuracy': accuracy}
        # The confusion matrix is only logged when log_confusion_matrix is set.
        if log_confusion_matrix:
            entry['confusion_matrix'] = conf_mx.tolist()
        entry['classification_report'] = report
        return entry

    logged_results[name] = {}

    print("-- Training data --")
    # Train the model on the training split and predict the test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Y Pred: ",  (y_pred.shape))
    logged_results[name]['split'] = ScoreAndReport(y_test, y_pred, "Accuracy")

    print("")
    print("-- 5-fold CV --")
    # Out-of-fold predictions for the complete dataset.
    y_pred = cross_val_predict(model, X, y, cv=5)
    logged_results[name]['5_fold_cv'] = ScoreAndReport(y, y_pred, "Average accuracy")

## Naive Bayes

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

block_name = 'naive_bayes'
if not args[block_name]['on']:
    print('Skipped {}.'.format(block_name))
else:
    print('Initiating {}...'.format(block_name))
    # Multinomial Naive Bayes with the smoothing parameter from the input parameters.
    model = MultinomialNB(alpha=args[block_name]['alpha'])
    evaluateTesting(model, block_name)
    print('Finished {}.'.format(block_name))

-- Training data --
Y Pred:  (123032,)
Accuracy: 50.66%
Confusion Matrix:
[[ 941   54    2 ...    0    1   17]
 [  50  551    4 ...    1    0    0]
 [   2    1   58 ...    0    0    0]
 ...
 [   0    0    0 ...   10    0    2]
 [   0    1    0 ...    0   88    0]
 [  11    3    0 ...    1    7 1713]]
              precision    recall  f1-score   support

           0       0.54      0.65      0.59      1445
           1       0.57      0.68      0.62       812
           2       0.40      0.72      0.52        80
           4       0.04      0.43      0.07         7
           5       0.16      0.55      0.25        65
           6       0.09      0.29      0.14        21
           7       0.28      0.55      0.37       530
           8       0.30      0.72      0.43       128
           9       0.11      0.67      0.18         6
          10       0.16      0.42      0.23       115
          11       0.04      0.30      0.08        84
          12       0.05      0.24      0.09      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.58      0.62      0.60      7276
           1       0.71      0.53      0.61      3878
           2       0.80      0.34      0.48       423
           4       0.00      0.00      0.00        33
           5       0.61      0.18      0.28       384
           6       0.68      0.17      0.27       136
           7       0.46      0.30      0.36      2658
           8       0.65      0.34      0.45       670
           9       0.00      0.00      0.00        43
          10       0.34      0.09      0.14       622
          11       0.16      0.01      0.02       395
          12       0.50      0.02      0.03       167
          13       0.49      0.26      0.34      2268
          14       0.42      0.03      0.05       276
          15       0.56      0.33      0.41      1503
          16       1.00      0.03      0.07        29
          17       0.24      0.04      0.07       658
          18       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


## SVM

In [10]:
from sklearn import svm

block_name = 'svm'
if not args[block_name]['on']:
    print('Skipped {}.'.format(block_name))
else:
    print('Initiating {}...'.format(block_name))
    # Linear SVM configured from the input parameters.
    model = svm.LinearSVC(
        C=args[block_name]['C'],
        class_weight=args[block_name]['class_weight'],
        random_state=args["random_state"],
    )
    evaluateTesting(model, block_name)
    print('Finished {}.'.format(block_name))

-- Training data --
Y Pred:  (123032,)
Accuracy: 49.87%
Confusion Matrix:
[[ 996   20    1 ...    0    3   12]
 [  51  575    2 ...    0    1    2]
 [   3    0   57 ...    0    0    0]
 ...
 [   2    1    1 ...    5    0    1]
 [   2    0    0 ...    1   89    2]
 [  19    3    0 ...    0    8 1745]]
              precision    recall  f1-score   support

           0       0.47      0.69      0.56      1445
           1       0.57      0.71      0.63       812
           2       0.43      0.71      0.54        80
           4       0.00      0.00      0.00         7
           5       0.13      0.49      0.20        65
           6       0.02      0.19      0.03        21
           7       0.31      0.59      0.41       530
           8       0.40      0.66      0.50       128
           9       0.33      0.67      0.44         6
          10       0.10      0.37      0.16       115
          11       0.06      0.30      0.10        84
          12       0.16      0.29      0.21      

## Pipeline example

In [11]:
from sklearn.pipeline import Pipeline
#from imblearn.pipeline import Pipeline

block_name = 'pipeline'
if args[block_name]['on']:
    print(X.shape)
    # The pipeline vectorizes the text itself, so it has to be fed the raw strings
    # and not the TF-IDF matrix that X holds up to this point.
    X = X_raw.ravel()

    # evaluateTesting reads the module-level X_train/X_test, so the raw text has to be
    # split again; fitting CountVectorizer on the TF-IDF rows of the earlier split
    # fails. The same split parameters are used as for the other models.
    # NOTE: this replaces X, X_train, X_test, y_train and y_test, so the earlier cells
    # have to be re-run before evaluating another model on the TF-IDF data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=args["split_training_and_testing_data"]["test_size"],
        stratify=(y if args["split_training_and_testing_data"]["stratify"] else None),
        random_state=args["random_state"])

    # NOTE(review): stop_words='english' is used although the preprocessing above
    # filters Swedish stopwords - confirm that this is intended.
    model = Pipeline([('vect', CountVectorizer(stop_words='english')),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB(alpha=.01)),])

    evaluateTesting(model, 'pipeline')
    print('Finished {}.'.format(block_name))
else:
    print('Skipped {}.'.format(block_name))

(615158, 159515)
-- Training data --
Y Pred:  (123032,)
Accuracy: 55.34%
Confusion Matrix:
[[ 884   16    0 ...    0    0   17]
 [  58  440    0 ...    1    0    0]
 [   3    1   33 ...    0    0    0]
 ...
 [   0    0    0 ...    6    0    1]
 [   0    0    0 ...    0   35    2]
 [  12    2    0 ...    0    2 1669]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.59      0.61      0.60      1445
           1       0.74      0.54      0.62       812
           2       0.89      0.41      0.56        80
           4       0.00      0.00      0.00         7
           5       0.70      0.22      0.33        65
           6       0.60      0.14      0.23        21
           7       0.47      0.29      0.36       530
           8       0.74      0.34      0.46       128
           9       0.00      0.00      0.00         6
          10       0.52      0.11      0.19       115
          11       0.25      0.01      0.02        84
          12       1.00      0.03      0.06        34
          13       0.55      0.27      0.36       448
          14       0.50      0.02      0.03        66
          15       0.58      0.36      0.44       274
          16       0.00      0.00      0.00         6
          17       0.12      0.01      0.01       131
          18       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.59      0.62      0.60      7276
           1       0.73      0.53      0.61      3878
           2       0.80      0.34      0.47       423
           4       0.00      0.00      0.00        33
           5       0.62      0.20      0.30       384
           6       0.71      0.18      0.28       136
           7       0.50      0.29      0.37      2658
           8       0.70      0.34      0.46       670
           9       0.00      0.00      0.00        43
          10       0.38      0.08      0.13       622
          11       0.17      0.01      0.01       395
          12       0.75      0.02      0.04       167
          13       0.53      0.25      0.34      2268
          14       0.46      0.02      0.04       276
          15       0.59      0.32      0.42      1503
          16       0.00      0.00      0.00        29
          17       0.31      0.03      0.05       658
          18       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


## Test Logging

In [2]:
import pandas as pd
import locale
from datetime import datetime as dt

#log = False
#log_name = ''

# Switch to a French locale for the number formatting done by the locale module.
# 'FR' is only a valid locale name on Windows, so the usual POSIX names are tried
# as well instead of failing with locale.Error on other platforms.
for locale_name in ('FR', 'fr_FR.UTF-8', 'fr_FR'):
    try:
        locale.setlocale(locale.LC_ALL, locale_name)
        break
    except locale.Error:
        continue
else:
    print("Warning: no French locale is available, the default locale is used for number formatting.")

# The log file name is the current timestamp, followed by the log name if one is set.
filename = dt.now().strftime("%Y%m%d-%H%M%S")
if log_name:
    filename = filename + '-' + log_name

# Function for easily adding Accuracy scores to the new DataFrame row.
def AddResultToDF(algorithm, testing_type):
    """Return the logged accuracy of a model as a locale-formatted percentage.

    Args:
        algorithm: Model name used as key in logged_results.
        testing_type: Evaluation key of that model, e.g. 'split' or '5_fold_cv'.

    Returns:
        The accuracy multiplied by 100 as a string with two decimals, formatted
        according to the current locale, or the integer -1 when no accuracy has been
        logged for the given model and evaluation.
    """
    try:
        accuracy = logged_results[algorithm][testing_type]['accuracy']
    except KeyError:
        return -1
    # locale.format was deprecated in Python 3.7 and removed in Python 3.12;
    # locale.format_string produces the same result for a single format specifier.
    return locale.format_string('%.2f', (accuracy * 100.0))

if log:
    # Store the input parameters together with all model results as a JSON file in
    # the test-logs folder.
    output_json = {
        "input_parameters": args,
        "model_test_results": logged_results,
    }
    with open("test-logs/{}.json".format(filename), "w") as outfile:
        json.dump(output_json, outfile, indent=2)

    # Append one row to the generated-test-reports Excel file: the log name, the
    # current time, the accuracy of every model/evaluation pair and an empty last cell.
    log_df = pd.read_excel("test-logs/generated-test-reports.xlsx")
    new_row = [filename, dt.now().strftime("%Y-%m-%d %H:%M:%S")]
    new_row.extend(
        AddResultToDF(algorithm, testing_type)
        for algorithm in ('naive_bayes', 'svm', 'pipeline')
        for testing_type in ('split', '5_fold_cv')
    )
    new_row.append('')
    log_df.loc[len(log_df.index)] = new_row
    log_df.to_excel("test-logs/generated-test-reports.xlsx", index=False, float_format="%.2f")

NameError: name 'log_name' is not defined