In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Level | Level for Humans | Level Description                  
# -------|------------------|------------------------------------ 
#  0     | DEBUG            | [Default] Print all messages       
#  1     | INFO             | Filter out INFO messages           
#  2     | WARNING          | Filter out INFO & WARNING messages 
#  3     | ERROR            | Filter out all messages     

import pandas as pd
import numpy as np
import tensorflow as tf
# NOTE: the tf verbosity setting below ALSO GOVERNS ALL OTHER LOGGERS!
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)  # or any {DEBUG, INFO, WARN, ERROR, FATAL}
import random
import pickle
import spacy
import sys
sys.path.append('..')
from pathlib import Path
import logging
import configparser

import ui_utils
import nlp_tools

In [2]:
# Read the run configuration shared with the training module.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the config is missing.
CONFIG_FILE = './config_core_train.cfg'
config = configparser.ConfigParser()
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(f"Could not read configuration file: {CONFIG_FILE}")

OUTPUT_DIR = config.get('data', 'OUTPUT_DIR')
FILE_NOTE = config.get('data', 'FILE_NOTE')
APPLY_FILE = config.get('applying', 'APPLY_FILE')

# Paths are built by plain string concatenation, so OUTPUT_DIR is expected to
# end with a path separator; OUTPUT_PATH itself always ends with "/".
OUTPUT_PATH = f"{OUTPUT_DIR}{FILE_NOTE}/"
SCORE_PATH = f"{OUTPUT_PATH}scores"
TAG_PATH = f"{OUTPUT_PATH}tags"

In [3]:
# initialize logger
root_logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s: %(levelname)s:: %(message)s')

# Re-running this cell would otherwise stack another pair of handlers on the
# root logger and emit every record several times. Drop only the handlers a
# previous run of this cell installed (matched by name); handlers added by
# other libraries are left alone.
_APPLY_HANDLER_NAMES = ('apply_file_handler', 'apply_console_handler')
for stale_handler in list(root_logger.handlers):
    if stale_handler.get_name() in _APPLY_HANDLER_NAMES:
        root_logger.removeHandler(stale_handler)
        stale_handler.close()

# prints to file
logfile = f"{OUTPUT_PATH}apply_logs.log"
file_handler = logging.FileHandler(logfile, mode='a')
file_handler.set_name('apply_file_handler')
file_handler.setFormatter(formatter)
root_logger.addHandler(file_handler)

# prints to console (StreamHandler writes to stderr by default)
console_handler = logging.StreamHandler()
console_handler.set_name('apply_console_handler')
console_handler.setFormatter(formatter)
root_logger.addHandler(console_handler)

root_logger.setLevel(logging.INFO)

# print = root_logger.info

In [4]:
# Report the TensorFlow version and the CPU devices it can see.
print(">>> Using Tensorflow 2 with CPU for this scoring script:")
print(tf.__version__)
cpus = tf.config.experimental.list_physical_devices('CPU')
print(cpus)

# Read the texts to be scored from the CSV file referenced by APPLY_FILE.
print(">>> Loading apply dataframes")
apply_df = pd.read_csv(APPLY_FILE)

>>> Using Tensorflow 2 with CPU for this scoring script:
2.2.0
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
>>> Loading apply dataframes


In [5]:
# Restore the tokenizer that was pickled into the training output folder.
print(">>> Loading pretrained tokenizer...")
tokenizer_file = f'{OUTPUT_PATH}tokenizer.pickle'
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open(tokenizer_file, 'rb') as handle:
    tokenizer = pickle.load(handle)
vocab_size = len(tokenizer.word_index.items())
print ('>>> Vocabulary size: {:,}'.format(vocab_size))

>>> Loading pretrained tokenizer...
>>> Vocabulary size: 418


In [6]:
def obtain_tag_clfs(train_output_dir, tag_prefix='Tag_'):
    """
    function to load all trained models from training module output
    train_output_dir: str. direct path of the training output
        (works with or without a trailing path separator)
    tag_prefix: str. the prefix to the folders saving each trained model
    =======output=======
    taggers_dict: dict. Dictionary consisting of {tag_name: model} pairs

    Each sub-folder whose name starts with `tag_prefix` is treated as one
    trained tagger. The tag name is the folder name without the leading
    prefix and without a trailing '_training'; the model is read from
    `<folder>/<tag_prefix><tag_name>_model.h5`.
    """
    folder_suffix = '_training'
    # scandir as a context manager releases the directory handle promptly
    with os.scandir(train_output_dir) as entries:
        dirs = [entry.name for entry in entries
                if entry.is_dir() and entry.name.startswith(tag_prefix)]
    taggers_dict = {}
    for dirname in dirs:
        # Strip the prefix only at the start and the suffix only at the end.
        # str.replace would also delete occurrences inside the tag name itself
        # (e.g. a tag called "pre_training").
        tag_name = dirname[len(tag_prefix):]
        if tag_name.endswith(folder_suffix):
            tag_name = tag_name[:-len(folder_suffix)]
        # os.path.join tolerates a missing trailing separator on the directory
        model_file = os.path.join(
            train_output_dir, dirname, f'{tag_prefix}{tag_name}_model.h5')
        model = tf.keras.models.load_model(model_file)
        taggers_dict[tag_name] = model
        print(f"Loaded trained model for {tag_name}!")
    return taggers_dict

In [7]:
# Load every trained tagger found in the training output folder.
taggers_dict = obtain_tag_clfs(train_output_dir=OUTPUT_PATH)

Loaded trained model for irrelevant!
Loaded trained model for lock!
Loaded trained model for sign!
Loaded trained model for light!
Loaded trained model for doorbell!
Loaded trained model for gate!
Loaded trained model for key!
Loaded trained model for intercom!
Loaded trained model for camera!
Loaded trained model for dog!
Loaded trained model for police!
Loaded trained model for none!


In [8]:
# Inspect the layer stack of one loaded tagger as a sanity check.
gate_model = taggers_dict['gate']
gate_model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 300)         125700    
_________________________________________________________________
bidirectional_10 (Bidirectio (None, None, 128)         186880    
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dense_10 (Dense)             (None, 128)               16512     
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 129       
Total params: 428,037
Trainable params: 428,037
Non-trainable params: 0
________________________________________________

In [9]:
def generate_label_with_single_model(
    text_ls, trained_tokenizer, trained_model,
    batch_size=100, clf_threshold=0.8):
    """
    function to score a list of texts with one trained model
    text_ls: list (or other sized iterable) of texts to be scored by the
        trained model; every item is converted with str()
    trained_tokenizer: loaded trained tokenizer used by training module
    trained_model: loaded trained model
    batch_size: batch size to be fed for scoring
    clf_threshold: float. between 0 and 1. threshold to assign 1 to a text
    =======output=======
    pred_proba: 1-D array. flattened model scores
    pred_label: 1-D int array. 1 where score > clf_threshold, otherwise 0
    """
    print(f"Input text has {len(text_ls):,} text pieces")
    if len(text_ls) == 0:
        # Nothing to score: return empty results instead of handing an empty
        # dataset to model.predict.
        # float32 is assumed to match the usual Keras output dtype - TODO confirm
        print("No input text to score; returning empty results")
        return np.empty(0, dtype=np.float32), np.empty(0, dtype=int)
    print("Creating an input text Dataset for binary classfication")
    text_ls = [str(text) for text in text_ls]
    # Token sequences have different lengths, so they are streamed from a
    # generator and zero-padded per batch to the longest sequence in it.
    text_ds = tf.data.Dataset.from_generator(
        lambda: trained_tokenizer.texts_to_sequences_generator(text_ls),
        output_shapes=[None,], output_types=tf.int32)\
        .padded_batch(batch_size=batch_size, padded_shapes=[None,], padding_values=0)
    print("Loading trained classification model")
    print("Generating model scores with trained model")
    print("WARNING: Scoring is processed using CPUs only!")
    with tf.device('/CPU:0'):
        pred_proba = trained_model.predict(text_ds).flatten()
    print(f"Label score thredhold is set to {clf_threshold: .2f}")
    # pred_proba is already 1-D, so the comparison result needs no reshaping
    pred_label = (pred_proba > clf_threshold).astype(int)
    print(f"Generated labels has shape {pred_label.shape}")
    return pred_proba, pred_label

def generate_labels_with_model_dict(
    text_ls, trained_tokenizer, model_dict,
    batch_size=100, clf_threshold=0.8):
    """
    function to score a list of texts with every trained model in a dict
    text_ls: list of texts to be scored
    trained_tokenizer: loaded trained tokenizer used by training module
    model_dict: dict. Dictionary consisting of {tag_name: model} pairs
    batch_size: batch size passed on to the single-model scoring function
    clf_threshold: threshold passed on to the single-model scoring function
    =======output=======
    dict with two entries per tag, added in model_dict order:
    "label_<tag_name>" (labels) and "proba_<tag_name>" (scores)
    """
    scored = {}
    for tag, tagger in model_dict.items():
        banner = tag.upper()
        print("======================================")
        print(f">>> Scoring texts with trained model for {banner}")
        probabilities, labels = generate_label_with_single_model(
            text_ls, trained_tokenizer, tagger,
            batch_size=batch_size, clf_threshold=clf_threshold)
        # label entry first, then proba, so the columns pair up per tag
        scored[f"label_{tag}"] = labels
        scored[f"proba_{tag}"] = probabilities
        print(f">>> Scoring for {banner} completed!")
    return scored

In [10]:
# Smoke-test the single-model scoring function with the "gate" tagger.
all_gate_proba, all_gate_label = generate_label_with_single_model(
    text_ls=apply_df.Text,
    trained_tokenizer=tokenizer,
    trained_model=taggers_dict['gate'],
    batch_size=100,
    clf_threshold=0.5,
)

Input text has 2,502 text pieces
Creating an input text Dataset for binary classfication
Loading trained classification model
Generating model scores with trained model
Label score thredhold is set to  0.50
Generated labels has shape (2502,)


In [11]:
# Peek at the first ten texts that the gate tagger labelled as positive.
gate_hits = apply_df.loc[all_gate_label == 1, 'Text']
gate_hits.head(10).tolist()

['I trailed behind to get thru gate',
 'Locked gate with a pad lock',
 'This apartment complex is gated. Not able to gain access. Unable to visit selected unit',
 'Gated front porch but accessible.',
 'Gate was unlocked.',
 'Gated community, but gate open at this time',
 'Townhouses/ apartments called The Lodge; gated area; Listed as 53-14 on the door; very nice area; well maintained and clean',
 'Gated community with a telephone gatekeeper system.',
 'Gated fence that can be owned by visitors',
 'Gated community. Unable to access without code']

In [12]:
%%time
# Score the apply texts with every loaded tagger (labels + scores per tag).
labels_dict = generate_labels_with_model_dict(
    text_ls=apply_df.Text,
    trained_tokenizer=tokenizer,
    model_dict=taggers_dict,
    batch_size=100,
    clf_threshold=0.7,
)

>>> Scoring texts with trained model for IRRELEVANT
Input text has 2,502 text pieces
Creating an input text Dataset for binary classfication
Loading trained classification model
Generating model scores with trained model
Label score thredhold is set to  0.70
Generated labels has shape (2502,)
>>> Scoring for IRRELEVANT completed!
>>> Scoring texts with trained model for LOCK
Input text has 2,502 text pieces
Creating an input text Dataset for binary classfication
Loading trained classification model
Generating model scores with trained model
Label score thredhold is set to  0.70
Generated labels has shape (2502,)
>>> Scoring for LOCK completed!
>>> Scoring texts with trained model for SIGN
Input text has 2,502 text pieces
Creating an input text Dataset for binary classfication
Loading trained classification model
Generating model scores with trained model
Label score thredhold is set to  0.70
Generated labels has shape (2502,)
>>> Scoring for SIGN completed!
>>> Scoring texts with train

In [13]:
# Render floats with four decimals and thousands separators, then show the scores.
pd.set_option('display.float_format', '{:,.4f}'.format)
pd.DataFrame(labels_dict)

Unnamed: 0,label_irrelevant,proba_irrelevant,label_lock,proba_lock,label_sign,proba_sign,label_light,proba_light,label_doorbell,proba_doorbell,...,label_intercom,proba_intercom,label_camera,proba_camera,label_dog,proba_dog,label_police,proba_police,label_none,proba_none
0,0,0.0000,0,0.3567,0,0.4621,0,0.3245,0,0.5073,...,0,0.0000,0,0.4996,0,0.5051,0,0.0000,0,0.0000
1,0,0.0000,0,0.1394,0,0.3983,0,0.0705,0,0.4910,...,0,0.0000,0,0.5027,0,0.5052,0,0.0000,0,0.0000
2,1,1.0000,0,0.1102,0,0.3564,0,0.5831,0,0.4868,...,0,0.0000,0,0.4927,0,0.5087,0,0.0000,0,0.0000
3,0,0.0000,0,0.6939,0,0.3805,0,0.2141,0,0.4926,...,0,0.0003,0,0.5024,0,0.5086,0,0.0000,0,0.0000
4,0,0.0000,0,0.3847,0,0.4921,0,0.2169,0,0.4956,...,0,0.0048,0,0.5034,0,0.5053,0,0.0000,1,1.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2497,0,0.0000,0,0.2760,0,0.3402,0,0.3058,0,0.4861,...,0,0.0000,0,0.4948,0,0.5083,0,0.0000,0,0.0000
2498,0,0.0000,0,0.3007,0,0.3861,0,0.1956,0,0.4917,...,0,0.0000,0,0.4910,0,0.5135,0,0.0000,0,0.0000
2499,0,0.0000,0,0.2412,1,0.7175,0,0.2629,0,0.4917,...,1,1.0000,0,0.4994,0,0.5167,0,0.0000,0,0.0000
2500,0,0.0000,0,0.1244,0,0.5021,0,0.4198,0,0.4990,...,0,0.0000,0,0.5055,0,0.5109,0,0.0000,0,0.0000


In [14]:
# The score arrays are positional (one entry per row of apply_df, in order).
# Giving the label frame apply_df's own index keeps rows paired correctly even
# if apply_df does not carry a default 0..n-1 index, and raises a ValueError on
# a length mismatch instead of silently padding with NaN.
labels_df = pd.DataFrame(labels_dict, index=apply_df.index)
apply_scored_df = pd.concat([apply_df, labels_df], axis=1)

In [15]:
# Write the scored frame to disk. The target folder is created first so that a
# fresh checkout (without ./output/) does not fail after all the scoring work.
scored_file = Path('./output/test.csv')
scored_file.parent.mkdir(parents=True, exist_ok=True)
apply_scored_df.to_csv(scored_file, index=False)