This notebook focuses on developing standardized analytics on standard `project output` from `wrapper_al` folder.

In [1]:
import configparser
from dateutil.parser import parse

import pandas as pd
import numpy as np
import ui_utils
import os
from sklearn.metrics import roc_auc_score, f1_score, precision_score,\
    recall_score, classification_report, accuracy_score

## Create Parser Class for Meta Project

In [2]:
project_path = './projects/security_5x100/'
meta_cfg_path = f'{project_path}orchestration_record.cfg'
meta_log_path = f'{project_path}orchestration_log.log'

In [3]:
meta_config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
meta_config.read(meta_cfg_path)

['./projects/security_5x100/orchestration_record.cfg']

In [4]:
TOTAL_ROUNDS = int(meta_config.get('active_learning', 'total_rounds'))
ANSWER_FILE = meta_config.get('coder_sim', 'answer_file')
MAX_TAGS = int(meta_config.get('training', 'max_tags'))

In [5]:
# test log file parsing for execution time info
with open(meta_log_path, 'r') as logfile:
    first_line = logfile.readline()
    for last_line in logfile:
        pass
    try:
        start_time = parse(first_line[:23])
        end_time = parse(last_line[:23])
        run_time = (end_time - start_time).seconds
        print(f"Execution of the program took {run_time / 60:,.2f} minutes")
    except:
        print("Program log file indicates that program run was not successfully executed...")

Execution of the program took 141.78 minutes


In [6]:
# test a basic description function
def describe_project_meta(meta_config):
    stmts = []
    stmts.append('INTRO\n-------')
    stmts.append(f"\nThis Active Learning Run has a round count of {int(meta_config.get('active_learning', 'total_rounds')):,}.")
    if meta_config.get('active_learning', 'run_sim') == '1':
        stmts.append("This run is a simulation with known tags already available.")
    else:
        stmts.append("This run is an actual application with manual coder input for tags on the fly.")
    stmts.append(f"For each round, {int(meta_config.get('sampling', 'sample_size')):,} samples are to be selected as additional training data.")
    stmts.append(f"While the first round always runs random sampling to gather the samples,")
    stmts.append(f"the second and beyond rounds use {meta_config.get('sampling', 'sampling_method')} method.")
    stmts.append("\nThe training config for each round's Bi-Directional LSTM modeling is as below:")
    for key, value in dict(meta_config['training']).items():
        stmts.append(f"\n\t{key}: {value}")
    if meta_config.get('training', 'random_embed') == 'True':
        stmts.append('\nThe text embeddings are randomly initiated 300-length via Tensorflow 2.')
    else:
        stmts.append('\nThe text embeddings are GloVe 300-length text embeddings loaded via Spacy.')
    print(" ".join(stmts))
    

In [7]:
describe_project_meta(meta_config)

INTRO
------- 
This Active Learning Run has a round count of 5. This run is a simulation with known tags already available. For each round, 100 samples are to be selected as additional training data. While the first round always runs random sampling to gather the samples, the second and beyond rounds use clustering method. 
The training config for each round's Bi-Directional LSTM modeling is as below: 
	max_tags: 100 
	batch_size: 50 
	buffer_size: 1000 
	early_stopping_rounds: 30 
	dropout_level: 0.2 
	learning_rate: 0.00001 
	n_lstm_units: 128 
	n_fc_neurons: 256 
	embed_trainable: True 
	random_embed: False 
The text embeddings are GloVe 300-length text embeddings loaded via Spacy.


In [8]:
train_df = pd.read_json("./input/security_text_tags.json", orient='records')
train_df = ui_utils.create_tag_columns(train_df)
tag_sum_df = train_df.loc[:,train_df.columns.str.startswith('Tag_')]\
    .sum().sort_values(ascending=False).reset_index().rename({'index':'Tag_Name', 0: 'Pos_Count'}, axis=1)
tag_sum_df['Pos_Rate'] = tag_sum_df.Pos_Count / train_df.shape[0]
tag_sum_df

Unnamed: 0,Tag_Name,Pos_Count,Pos_Rate
0,Tag_gate,411,0.164269
1,Tag_lock,348,0.139089
2,Tag_video_doorbell,326,0.130296
3,Tag_warning_sign,239,0.095524
4,Tag_lights,148,0.059153
5,Tag_dog,129,0.051559
6,Tag_intercom,123,0.049161
7,Tag_security_screen_door,118,0.047162
8,Tag_guard_ID_resident,108,0.043165
9,Tag_none,104,0.041567


In [9]:
def gen_tag_sum_df(json_path, tag_col='Tag_'):
    """
    Function to generate tag positive ratios of a given DF (stored in JSON format)
    """
    df = pd.read_json(json_path, orient='records')
    df = ui_utils.create_tag_columns(df)
    tag_sum_df = df.loc[:, df.columns.str.startswith(tag_col)]\
        .sum().sort_values(ascending=False).reset_index().rename({'index':'Tag_Name', 0: 'Pos_Count'}, axis=1)
    tag_sum_df['Pos_Rate'] = tag_sum_df.Pos_Count / df.shape[0]
    return tag_sum_df

In [10]:
gen_tag_sum_df("./input/security_text_tags.json")

Unnamed: 0,Tag_Name,Pos_Count,Pos_Rate
0,Tag_gate,411,0.164269
1,Tag_lock,348,0.139089
2,Tag_video_doorbell,326,0.130296
3,Tag_warning_sign,239,0.095524
4,Tag_lights,148,0.059153
5,Tag_dog,129,0.051559
6,Tag_intercom,123,0.049161
7,Tag_security_screen_door,118,0.047162
8,Tag_guard_ID_resident,108,0.043165
9,Tag_none,104,0.041567


In [80]:
class MetaProject(object):
    def __init__(self, project_path, rundir='./wrapper_al/'):
        """
        project_path: path to the project folder of the active learning run
        rundir: the path where the active learning ran, default './wrapper_al/'
        """
        print(">>> Instantiate MetaProject class...")
        self.project_path = project_path
        self.rundir = rundir
        self.cfg_path = os.path.abspath(f'{self.project_path}orchestration_record.cfg')
        self.log_path = os.path.abspath(f'{self.project_path}orchestration_log.log')
        self._load_config()
        self.total_rounds = int(self.config.get('active_learning', 'total_rounds'))
        self.round_sample = int(self.config.get('sampling', 'sample_size'))
        self.total_sample = self.total_rounds * self.round_sample
        # get abspath of the answer file since the exec path of project is different from analytics path
        self.answer_file = os.path.abspath(os.path.join(
            self.rundir, self.config.get('coder_sim', 'answer_file')))
        print(self.answer_file)
        self.max_tags = int(self.config.get('training', 'max_tags'))
        self.run_sim = int(self.config.get('active_learning', 'run_sim'))
        self.run_time = self._parse_log(self.log_path)
        self._gen_tag_sum_df(self.answer_file)
        
    def _load_config(self):
        print(">>> Loading project orchestration config")
        self.config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
        self.config.read(self.cfg_path)

    def _parse_log(self, log_path):
        """
        Method to parse orchestration log file to obtain run duration in seconds
        """
        print(">>> Parsing project execution run time")
        with open(log_path, 'r') as logfile:
            first_line = logfile.readline()
            for last_line in logfile:
                pass
            try:
                start_time = parse(first_line[:23])
                end_time = parse(last_line[:23])
                run_time = (end_time - start_time).seconds
            except:
                print(">>> Project did not run successfully based on log records!")
                run_time = -1
            return run_time
    
    def _gen_tag_sum_df(self, tag_col='Tag_'):
        """
        Method to generate tag positive ratios of a given DF (stored in JSON format)
        """
        print(">>> Reading full dataset...")
        df = pd.read_json(self.answer_file, orient='records')
        df = ui_utils.create_tag_columns(df)
        self.df = df
        self.total_records = df.shape[0]
        if self.run_sim == 1:
            print(">>> Project ran as simulation...")
            self.answer_tag_sum_df = df.loc[:, df.columns.str.startswith(tag_col)].sum().sort_values(
                ascending=False).reset_index().rename(
                {'index':'Tag_Name', 0: 'Pos_Count'}, axis=1)
            self.answer_tag_sum_df['Pos_Rate'] = self.answer_tag_sum_df.Pos_Count / df.shape[0]
        else:
            print(">>> Project ran in real time with manual coders...")
            self.answer_tag_sum_df = None
    
    def describe(self):
        """
        Method to describe the project with Meta Cfg and Logs
        method only loads attributes of the object
        """
        print(">>> Composing project high level description...")
        self.stmts = []
        self.stmts.append('INTRO\n-------')
        self.stmts.append(f"\nThis Active Learning Run has a round count of {self.total_rounds:,},")
        self.stmts.append(f"and a total of {self.total_sample:,} samples are included for model training.")
        if self.run_sim == 1:
            self.stmts.append("This run is a simulation with known tags already available.")
        else:
            self.stmts.append("This run is an actual application with manual coder input for tags on the fly.")
        self.stmts.append(f"In each round, {int(self.config.get('sampling', 'sample_size')):,} samples are selected as additional training data.")
        self.stmts.append(f"While the first round always runs random sampling to gather the samples,")
        self.stmts.append(f"the second and beyond rounds use {self.config.get('sampling', 'sampling_method')} method.")
        self.stmts.append('\n\nDATA\n-------')
        self.stmts.append(f'\nThe input dataframe has a total of {self.total_records:,} records.')
        if self.answer_tag_sum_df is not None:
            self.stmts.append('The positive rates of each tag in the full answer dataset:')
            self.stmts.append("\n" + self.answer_tag_sum_df.to_string())
        self.stmts.append('\n\nMODELING\n-------')
        self.stmts.append("\nThe training config for each round's Bi-Directional LSTM modeling is as below:")
        for key, value in dict(self.config['training']).items():
            self.stmts.append(f"\n\t{key}: {value}")
        if self.config.get('training', 'random_embed') == 'True':
            self.stmts.append('\nThe text embeddings are randomly initiated 300-length via Tensorflow 2.')
        else:
            self.stmts.append('\nThe text embeddings are GloVe 300-length text embeddings loaded via Spacy.')
        self.stmts.append('\n\nRUNTIME\n-------')
        if self.run_time > 0:
            self.stmts.append(f"\nExecution of the run took {self.run_time / 60:,.2f} minutes to complete")
        else:
            self.stmts.append("Program log file indicates that this run was not successfully executed...")
        self.description = " ".join(self.stmts)
        print(">>> Displaying the description:")
        print(self.description)
        

In [81]:
test_project = MetaProject('./projects/security_5x100/')

>>> Instantiate MetaProject class...
>>> Loading project orchestration config
/home/riversome/Documents/NLP_other_specify_classification/input/security_text_tags.json
>>> Parsing project execution run time
>>> Reading full dataset...
>>> Project ran as simulation...


In [82]:
test_project.describe()

>>> Composing project high level description...
>>> Displaying the description:
INTRO
------- 
This Active Learning Run has a round count of 5, and a total of 500 samples are included for model training. This run is a simulation with known tags already available. In each round, 100 samples are selected as additional training data. While the first round always runs random sampling to gather the samples, the second and beyond rounds use clustering method. 

DATA
------- 
The input dataframe has a total of 2,502 records. The positive rates of each tag in the full answer dataset: 
Empty DataFrame
Columns: [Tag_Name, Pos_Count, Pos_Rate]
Index: [] 

MODELING
------- 
The training config for each round's Bi-Directional LSTM modeling is as below: 
	max_tags: 100 
	batch_size: 50 
	buffer_size: 1000 
	early_stopping_rounds: 30 
	dropout_level: 0.2 
	learning_rate: 0.00001 
	n_lstm_units: 128 
	n_fc_neurons: 256 
	embed_trainable: True 
	random_embed: False 
The text embeddings are GloVe 300-le

## Create Round Output parser class

In [19]:
train_df = pd.read_csv('./projects/security_5x100/round_3/output/train_df.csv')
scored_df = pd.read_json('./projects/security_5x100/round_3/output/scored/scored_output.json', orient='records')
# create answer_df with binary tag cols
answer_df = pd.read_json('./input/security_text_tags.json')
answer_df = ui_utils.create_tag_columns(answer_df)

In [40]:
# prepare col selectors
proba_cols = scored_df.columns[scored_df.columns.str.startswith('proba_')].tolist()
tag_names = [proba_col.replace('proba_', '').strip() for proba_col in proba_cols]
true_tag_cols = [f"Tag_{tag}" for tag in tag_names]

In [41]:
# prepare row selectors
all_ids = answer_df['UID'].unique()
train_ids = train_df['UID'].unique()
test_ids = [uid for uid in all_ids if uid not in train_ids]

In [79]:
round_outputs = {}
round_results = {}
proba_cutoff = 0.65

for tag_name, proba_col, true_tag_col in zip(tag_names, proba_cols, true_tag_cols):
    round_outputs[tag_name] = {}
    round_results[tag_name] = {}
    # save the y_true, y_pred, and y_proba for train and test runs
    round_outputs[tag_name]['train_y_proba'] = scored_df.loc[scored_df['UID'].isin(train_ids), proba_col].values
    round_outputs[tag_name]['train_y_pred'] = (round_outputs[tag_name]['train_y_proba'] >= proba_cutoff).astype(int)
    round_outputs[tag_name]['train_y_true'] = answer_df.loc[answer_df['UID'].isin(train_ids), true_tag_col].values
    round_outputs[tag_name]['test_y_proba'] = scored_df.loc[scored_df['UID'].isin(test_ids), proba_col].values
    round_outputs[tag_name]['test_y_pred'] = (round_outputs[tag_name]['test_y_proba'] >= proba_cutoff).astype(int)
    round_outputs[tag_name]['test_y_true'] = answer_df.loc[answer_df['UID'].isin(test_ids), true_tag_col].values
    
    # calculate train side metrics
    round_results[tag_name]['train_roc_auc'] = roc_auc_score(
        round_outputs[tag_name]['train_y_true'], round_outputs[tag_name]['train_y_proba'])
    round_results[tag_name]['train_f1'] = f1_score(
        round_outputs[tag_name]['train_y_true'], round_outputs[tag_name]['train_y_pred'], zero_division=0)
    round_results[tag_name]['train_precision'] = precision_score(
        round_outputs[tag_name]['train_y_true'], round_outputs[tag_name]['train_y_pred'], zero_division=0)
    round_results[tag_name]['train_recall'] = recall_score(
        round_outputs[tag_name]['train_y_true'], round_outputs[tag_name]['train_y_pred'], zero_division=0)
    round_results[tag_name]['train_cr'] = classification_report(
        round_outputs[tag_name]['train_y_true'], round_outputs[tag_name]['train_y_pred'], zero_division=0)
    round_results[tag_name]['train_f1'] = roc_auc_score(
        round_outputs[tag_name]['train_y_true'], round_outputs[tag_name]['train_y_pred'])
    round_results[tag_name]['train_pos_rate'] = round_outputs[tag_name]['train_y_true'].sum() \
        / round_outputs[tag_name]['train_y_true'].shape[0]
    # calculate test side metrics
    round_results[tag_name]['test_roc_auc'] = roc_auc_score(
        round_outputs[tag_name]['test_y_true'], round_outputs[tag_name]['test_y_proba'])
    round_results[tag_name]['test_f1'] = f1_score(
        round_outputs[tag_name]['test_y_true'], round_outputs[tag_name]['test_y_pred'], zero_division=0)
    round_results[tag_name]['test_precision'] = precision_score(
        round_outputs[tag_name]['test_y_true'], round_outputs[tag_name]['test_y_pred'], zero_division=0)
    round_results[tag_name]['test_recall'] = recall_score(
        round_outputs[tag_name]['test_y_true'], round_outputs[tag_name]['test_y_pred'], zero_division=0)
    round_results[tag_name]['test_cr'] = classification_report(
        round_outputs[tag_name]['test_y_true'], round_outputs[tag_name]['test_y_pred'], zero_division=0)
    round_results[tag_name]['test_f1'] = roc_auc_score(
        round_outputs[tag_name]['test_y_true'], round_outputs[tag_name]['test_y_pred'])
    round_results[tag_name]['test_pos_rate'] = round_outputs[tag_name]['test_y_true'].sum() \
        / round_outputs[tag_name]['test_y_true'].shape[0]

In [88]:
for tag in round_results.keys():
    print(f"Tag - {tag}:")
    print(f">>> Pos Rate: Train - {round_results[tag]['train_pos_rate'] * 100:.2f}%; Test - {round_results[tag]['test_pos_rate'] * 100:.2f}%")
    print(f">>> ROC AUC: Train - {round_results[tag]['train_roc_auc']:.3f}; Test - {round_results[tag]['test_roc_auc']:.3f}\n")
    print(">>> Classification Reports:")
    print(f">>> Train:\n {round_results[tag]['train_cr']}")
    print(f">>> Test:\n {round_results[tag]['test_cr']}")
    print('======================================\n')

Tag - lock:
>>> Pos Rate: Train - 13.00%; Test - 14.03%
>>> ROC AUC: Train - 1.000; Test - 0.855

>>> Classification Reports:
>>> Train:
               precision    recall  f1-score   support

           0       1.00      0.95      0.98       261
           1       0.76      1.00      0.87        39

    accuracy                           0.96       300
   macro avg       0.88      0.98      0.92       300
weighted avg       0.97      0.96      0.96       300

>>> Test:
               precision    recall  f1-score   support

           0       0.96      0.90      0.93      1893
           1       0.56      0.78      0.65       309

    accuracy                           0.88      2202
   macro avg       0.76      0.84      0.79      2202
weighted avg       0.91      0.88      0.89      2202


Tag - video_doorbell:
>>> Pos Rate: Train - 14.33%; Test - 12.85%
>>> ROC AUC: Train - 0.993; Test - 0.955

>>> Classification Reports:
>>> Train:
               precision    recall  f1-score   su

In [60]:
print(train_cr)

              precision    recall  f1-score   support

           0       1.00      0.95      0.98       261
           1       0.76      1.00      0.87        39

    accuracy                           0.96       300
   macro avg       0.88      0.98      0.92       300
weighted avg       0.97      0.96      0.96       300



In [102]:
class RoundResult(object):
    def __init__(self, round_path, answer_file, proba_cutoff, rundir='./wrapper_al/'):
        self.round_path = os.path.abspath(os.path.join(rundir, round_path))
        print(self.round_path)
        self.config_dir = f"{self.round_path.rstrip('/')}/config/"
        self.sample_dir = f"{self.round_path.rstrip('/')}/sample/"
        self.label_dir = f"{self.round_path.rstrip('/')}/label/"
        self.input_dir = f"{self.round_path.rstrip('/')}/input/"
        self.output_dir = f"{self.round_path.rstrip('/')}/output/"
        self.train_file = f"{self.output_dir.rstrip('/')}/train_df.csv"
        self.scored_file = f"{self.output_dir.rstrip('/')}/scored/scored_output.json"
        self.answer_file = os.path.abspath(os.path.join(rundir, answer_file))
        self.proba_cutoff = proba_cutoff
        self.load_outputs()
        
    def load_outputs(self, proba_prefix='proba_', tag_prefix='Tag_', row_key='UID'):
        # read the round related datasets
        train_df = pd.read_csv(self.train_file)
        scored_df = pd.read_json(self.scored_file, orient='records')
        answer_df = pd.read_json(self.answer_file, orient='records')
        answer_df = ui_utils.create_tag_columns(answer_df)
        # prepare col selectors
        proba_cols = scored_df.columns[scored_df.columns.str.startswith(proba_prefix)].tolist()
        tag_names = [proba_col.replace(proba_prefix, '').strip() for proba_col in proba_cols]
        true_tag_cols = [f"{tag_prefix}{tag}" for tag in tag_names]
        # prepare row selectors
        all_ids = answer_df[row_key].unique()
        train_ids = train_df[row_key].unique()
        test_ids = [uid for uid in all_ids if uid not in train_ids]
        # create 2 dicts for round outputs and results
        round_outputs = {}
        round_results = {}
        for tag_name, proba_col, true_tag_col in zip(tag_names, proba_cols, true_tag_cols):
            round_outputs[tag_name] = {}
            round_results[tag_name] = {}
            # save the y_true, y_pred, and y_proba for train and test runs
            round_outputs[tag_name]['train_y_proba'] = scored_df.loc[scored_df[row_key].isin(train_ids), proba_col].values
            round_outputs[tag_name]['train_y_pred'] = (round_outputs[tag_name]['train_y_proba'] >= self.proba_cutoff).astype(int)
            round_outputs[tag_name]['train_y_true'] = answer_df.loc[answer_df[row_key].isin(train_ids), true_tag_col].values
            round_outputs[tag_name]['test_y_proba'] = scored_df.loc[scored_df[row_key].isin(test_ids), proba_col].values
            round_outputs[tag_name]['test_y_pred'] = (round_outputs[tag_name]['test_y_proba'] >= self.proba_cutoff).astype(int)
            round_outputs[tag_name]['test_y_true'] = answer_df.loc[answer_df[row_key].isin(test_ids), true_tag_col].values

            # calculate train side metrics
            round_results[tag_name]['train_roc_auc'] = roc_auc_score(
                round_outputs[tag_name]['train_y_true'], round_outputs[tag_name]['train_y_proba'])
            round_results[tag_name]['train_f1'] = f1_score(
                round_outputs[tag_name]['train_y_true'], round_outputs[tag_name]['train_y_pred'], zero_division=0)
            round_results[tag_name]['train_precision'] = precision_score(
                round_outputs[tag_name]['train_y_true'], round_outputs[tag_name]['train_y_pred'], zero_division=0)
            round_results[tag_name]['train_recall'] = recall_score(
                round_outputs[tag_name]['train_y_true'], round_outputs[tag_name]['train_y_pred'], zero_division=0)
            round_results[tag_name]['train_cr'] = classification_report(
                round_outputs[tag_name]['train_y_true'], round_outputs[tag_name]['train_y_pred'], zero_division=0)
            round_results[tag_name]['train_f1'] = roc_auc_score(
                round_outputs[tag_name]['train_y_true'], round_outputs[tag_name]['train_y_pred'])
            round_results[tag_name]['train_pos_rate'] = round_outputs[tag_name]['train_y_true'].sum() \
                / round_outputs[tag_name]['train_y_true'].shape[0]
            # calculate test side metrics
            round_results[tag_name]['test_roc_auc'] = roc_auc_score(
                round_outputs[tag_name]['test_y_true'], round_outputs[tag_name]['test_y_proba'])
            round_results[tag_name]['test_f1'] = f1_score(
                round_outputs[tag_name]['test_y_true'], round_outputs[tag_name]['test_y_pred'], zero_division=0)
            round_results[tag_name]['test_precision'] = precision_score(
                round_outputs[tag_name]['test_y_true'], round_outputs[tag_name]['test_y_pred'], zero_division=0)
            round_results[tag_name]['test_recall'] = recall_score(
                round_outputs[tag_name]['test_y_true'], round_outputs[tag_name]['test_y_pred'], zero_division=0)
            round_results[tag_name]['test_cr'] = classification_report(
                round_outputs[tag_name]['test_y_true'], round_outputs[tag_name]['test_y_pred'], zero_division=0)
            round_results[tag_name]['test_f1'] = roc_auc_score(
                round_outputs[tag_name]['test_y_true'], round_outputs[tag_name]['test_y_pred'])
            round_results[tag_name]['test_pos_rate'] = round_outputs[tag_name]['test_y_true'].sum() \
                / round_outputs[tag_name]['test_y_true'].shape[0]
        self.round_outputs = round_outputs
        self.round_results = round_results
    
    def describe_round_metrics(self):
        self.stmts = []
        for tag in self.round_results.keys():
            self.stmts.append(f"==========Tag - {tag.upper()}==========")
            self.stmts.append(f"\n>>> Pos Rate: Train - {self.round_results[tag]['train_pos_rate'] * 100:.2f}%; Test - {self.round_results[tag]['test_pos_rate'] * 100:.2f}%")
            self.stmts.append(f"\n>>> ROC AUC: Train - {self.round_results[tag]['train_roc_auc']:.3f}; Test - {self.round_results[tag]['test_roc_auc']:.3f}\n")
            self.stmts.append("\n>>> Classification Reports:")
            self.stmts.append(f"\n>>> Train:\n {self.round_results[tag]['train_cr']}")
            self.stmts.append(f"\n>>> Test:\n {self.round_results[tag]['test_cr']}")
            self.stmts.append('\n======================================\n')
        self.description = " ".join(self.stmts)
        print(">>> Displaying the description:")
        print(self.description)

In [107]:
round1_test = RoundResult(round_path='../projects/security_5x100/round_1',
                          answer_file='../input/security_text_tags.json', 
                          proba_cutoff=0.65, rundir='./wrapper_al/')
round1_test.describe_round_metrics()

/home/riversome/Documents/NLP_other_specify_classification/projects/security_5x100/round_1
>>> Displaying the description:
>>> Pos Rate: Train - 13.00%; Test - 13.95% 
>>> ROC AUC: Train - 0.618; Test - 0.572
 
>>> Classification Reports: 
>>> Train:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93        87
           1       0.00      0.00      0.00        13

    accuracy                           0.87       100
   macro avg       0.43      0.50      0.47       100
weighted avg       0.76      0.87      0.81       100
 
>>> Test:
               precision    recall  f1-score   support

           0       0.86      1.00      0.93      2067
           1       0.00      0.00      0.00       335

    accuracy                           0.86      2402
   macro avg       0.43      0.50      0.46      2402
weighted avg       0.74      0.86      0.80      2402
 
>>> Pos Rate: Train - 14.00%; Test - 12.99% 
>>> ROC AUC: Train - 1.000; Test - 0

In [125]:
round1_test.round_outputs

{'lock': {'train_y_proba': array([0.4991495 , 0.50369853, 0.52487779, 0.5007059 , 0.48917294,
         0.49927384, 0.48456141, 0.4842332 , 0.51274866, 0.51813251,
         0.49074101, 0.49940288, 0.50267279, 0.52083457, 0.49628112,
         0.50225103, 0.49308953, 0.49618882, 0.50451696, 0.48854801,
         0.49580419, 0.49410215, 0.50203431, 0.50410372, 0.50827903,
         0.50936961, 0.50451696, 0.5081901 , 0.49442697, 0.48039591,
         0.49049428, 0.49392414, 0.49208376, 0.51281846, 0.48910943,
         0.50956136, 0.5196991 , 0.50795978, 0.51793396, 0.50503522,
         0.52597767, 0.50689691, 0.50649416, 0.49723977, 0.5084784 ,
         0.51029712, 0.49844646, 0.488123  , 0.50189185, 0.49579361,
         0.50085944, 0.48363185, 0.49357435, 0.50657445, 0.49074101,
         0.51398093, 0.49202392, 0.50421911, 0.48866042, 0.51976144,
         0.49717069, 0.48911953, 0.50001901, 0.51539564, 0.4936749 ,
         0.51466644, 0.49430498, 0.51455015, 0.48655662, 0.49957329,
         

## Update MetaProject Class with the RoundResult Class

In [150]:
class MetaProjectWithRounds(object):
    def __init__(self, project_path, rundir='./wrapper_al/', alternative_cutoff=None):
        """
        project_path: path to the project folder of the active learning run
        rundir: the path where the active learning ran, default './wrapper_al/'
        """
        print(">>> Instantiate MetaProject class...")
        self.project_path = project_path
        self.rundir = rundir
        self.cfg_path = os.path.abspath(f'{self.project_path}orchestration_record.cfg')
        self.log_path = os.path.abspath(f'{self.project_path}orchestration_log.log')
        self._load_config()
        self.total_rounds = int(self.config.get('active_learning', 'total_rounds'))
        self.round_sample = int(self.config.get('sampling', 'sample_size'))
        self.total_sample = self.total_rounds * self.round_sample
        # get abspath of the answer file since the exec path of project is different from analytics path
        self.answer_file = os.path.abspath(os.path.join(
            self.rundir, self.config.get('coder_sim', 'answer_file')))
        if alternative_cutoff is not None:
            self.proba_cutoff = alternative_cutoff
        else:
            self.proba_cutoff = float(self.config.get('scoring', 'clf_threshold'))
        self.max_tags = int(self.config.get('training', 'max_tags'))
        self.run_sim = int(self.config.get('active_learning', 'run_sim'))
        self.run_time = self._parse_log(self.log_path)
        self._gen_tag_sum_df(self.answer_file)
        self._load_rounds()
        
    def _load_config(self):
        print(">>> Loading project orchestration config")
        self.config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
        self.config.read(self.cfg_path)

    def _parse_log(self, log_path):
        """
        Method to parse orchestration log file to obtain run duration in seconds
        """
        print(">>> Parsing project execution run time")
        with open(log_path, 'r') as logfile:
            first_line = logfile.readline()
            for last_line in logfile:
                pass
            try:
                start_time = parse(first_line[:23])
                end_time = parse(last_line[:23])
                run_time = (end_time - start_time).seconds
            except:
                print(">>> Project did not run successfully based on log records!")
                run_time = -1
            return run_time
    
    def _gen_tag_sum_df(self, tag_col='Tag_'):
        """
        Method to generate tag positive ratios of a given DF (stored in JSON format)
        """
        print(">>> Reading full dataset...")
        df = pd.read_json(self.answer_file, orient='records')
        df = ui_utils.create_tag_columns(df)
        self.df = df
        self.total_records = df.shape[0]
        if self.run_sim == 1:
            print(">>> Project ran as simulation...")
            self.answer_tag_sum_df = df.loc[:, df.columns.str.startswith(tag_col)].sum().sort_values(
                ascending=False).reset_index().rename(
                {'index':'Tag_Name', 0: 'Pos_Count'}, axis=1)
            self.answer_tag_sum_df['Pos_Rate'] = self.answer_tag_sum_df.Pos_Count / df.shape[0]
        else:
            print(">>> Project ran in real time with manual coders...")
            self.answer_tag_sum_df = None
    
    def _load_rounds(self):
        print(">>> Loading results for each round...")
        self.rounds = {}
        self.round_results = {}
        self.round_outputs = {}
        self.round_descriptions = {}
        for round_id in range(self.total_rounds):
            config_project_path = self.config.get('active_learning', 'project_path')
            round_path = f"{config_project_path.rstrip('/')}/round_{round_id + 1}/"
            self.rounds[round_id + 1] = RoundResult(
                round_path=round_path, answer_file=self.answer_file, 
                proba_cutoff=self.proba_cutoff)
            self.round_results[round_id + 1] = self.rounds[round_id + 1].round_results
            self.round_outputs[round_id + 1] = self.rounds[round_id + 1].round_outputs
        self._flatten_results()
    
    def _flatten_results(self):
        self.flatten_result_dict = {}
        self.flatten_result_dict['round_id'] = []
        self.flatten_result_dict['tag_name'] = []

        for round_id, round_result in self.round_results.items():
            for tag_name, model_scores in round_result.items():
                self.flatten_result_dict['round_id'].append(round_id)
                self.flatten_result_dict['tag_name'].append(tag_name)
                for metric_name, metric_value in model_scores.items():
                    if metric_name not in self.flatten_result_dict.keys():
                        self.flatten_result_dict[metric_name] = [metric_value]
                    else:
                        self.flatten_result_dict[metric_name].append(metric_value)
        self.flatten_result_df = pd.DataFrame(self.flatten_result_dict)
    
    def describe(self):
        """
        Method to describe the project with Meta Cfg and Logs
        method only loads attributes of the object
        """
        print(">>> Composing project high level description...")
        self.stmts = []
        self.stmts.append('INTRO\n-------')
        self.stmts.append(f"\nThis Active Learning Run has a round count of {self.total_rounds:,},")
        self.stmts.append(f"and a total of {self.total_sample:,} samples are included for model training.")
        if self.run_sim == 1:
            self.stmts.append("This run is a simulation with known tags already available.")
        else:
            self.stmts.append("This run is an actual application with manual coder input for tags on the fly.")
        self.stmts.append(f"In each round, {int(self.config.get('sampling', 'sample_size')):,} samples are selected as additional training data.")
        self.stmts.append(f"While the first round always runs random sampling to gather the samples,")
        self.stmts.append(f"the second and beyond rounds use {self.config.get('sampling', 'sampling_method')} method.")
        self.stmts.append('\n\nDATA\n-------')
        self.stmts.append(f'\nThe input dataframe has a total of {self.total_records:,} records.')
        if self.answer_tag_sum_df is not None:
            self.stmts.append('The positive rates of each tag in the full answer dataset:')
            self.stmts.append("\n" + self.answer_tag_sum_df.to_string())
        self.stmts.append('\n\nMODELING\n-------')
        self.stmts.append("\nThe training config for each round's Bi-Directional LSTM modeling is as below:")
        for key, value in dict(self.config['training']).items():
            self.stmts.append(f"\n\t{key}: {value}")
        if self.config.get('training', 'random_embed') == 'True':
            self.stmts.append('\nThe text embeddings are randomly initiated 300-length via Tensorflow 2.')
        else:
            self.stmts.append('\nThe text embeddings are GloVe 300-length text embeddings loaded via Spacy.')
        self.stmts.append('\n\nRUNTIME\n-------')
        if self.run_time > 0:
            self.stmts.append(f"\nExecution of the run took {self.run_time / 60:,.2f} minutes to complete")
        else:
            self.stmts.append("Program log file indicates that this run was not successfully executed...")
        self.description = " ".join(self.stmts)
        print(">>> Displaying the description:")
        print(self.description)
        

In [151]:
full_test_project = MetaProjectWithRounds('./projects/security_5x100/')

>>> Instantiate MetaProject class...
>>> Loading project orchestration config
>>> Parsing project execution run time
>>> Reading full dataset...
>>> Project ran as simulation...
>>> Loading results for each round...
/home/riversome/Documents/NLP_other_specify_classification/projects/security_5x100/round_1
/home/riversome/Documents/NLP_other_specify_classification/projects/security_5x100/round_2
/home/riversome/Documents/NLP_other_specify_classification/projects/security_5x100/round_3
/home/riversome/Documents/NLP_other_specify_classification/projects/security_5x100/round_4
/home/riversome/Documents/NLP_other_specify_classification/projects/security_5x100/round_5


In [152]:
full_test_project.flatten_result_df

Unnamed: 0,round_id,tag_name,train_roc_auc,train_f1,train_precision,train_recall,train_cr,train_pos_rate,test_roc_auc,test_f1,test_precision,test_recall,test_cr,test_pos_rate
0,1,lock,0.618037,0.5,0.0,0.0,precision recall f1-score ...,0.13,0.572245,0.5,0.0,0.0,precision recall f1-score ...,0.139467
1,1,video_doorbell,1.0,0.994186,0.933333,1.0,precision recall f1-score ...,0.14,0.945747,0.924933,0.57529,0.955128,precision recall f1-score ...,0.129892
2,1,gate,0.991815,0.988095,0.888889,1.0,precision recall f1-score ...,0.16,0.8623,0.835746,0.788732,0.708861,precision recall f1-score ...,0.164446
3,1,lights,0.412632,0.5,0.0,0.0,precision recall f1-score ...,0.05,0.389951,0.5,0.0,0.0,precision recall f1-score ...,0.059534
4,1,intercom,0.5625,0.5,0.0,0.0,precision recall f1-score ...,0.04,0.651603,0.5,0.0,0.0,precision recall f1-score ...,0.049542
5,1,dog,1.0,0.833333,1.0,0.666667,precision recall f1-score ...,0.03,0.537721,0.550928,0.8125,0.103175,precision recall f1-score ...,0.052456
6,1,guard_ID_resident,0.59447,0.5,0.0,0.0,precision recall f1-score ...,0.07,0.546304,0.5,0.0,0.0,precision recall f1-score ...,0.042048
7,1,security_screen_door,0.956522,0.932065,0.875,0.875,precision recall f1-score ...,0.08,0.764874,0.792365,0.653465,0.6,precision recall f1-score ...,0.045795
8,1,warning_sign,0.917774,0.875415,0.785714,0.785714,precision recall f1-score ...,0.14,0.740556,0.67725,0.672,0.373333,precision recall f1-score ...,0.093672
9,1,security_camera_system,0.744898,0.5,0.0,0.0,precision recall f1-score ...,0.02,0.566984,0.5,0.0,0.0,precision recall f1-score ...,0.029142


In [134]:
full_test_project.round_results[1]['lock']

{'train_roc_auc': 0.6180371352785146,
 'train_f1': 0.5,
 'train_precision': 0.0,
 'train_recall': 0.0,
 'train_cr': '              precision    recall  f1-score   support\n\n           0       0.87      1.00      0.93        87\n           1       0.00      0.00      0.00        13\n\n    accuracy                           0.87       100\n   macro avg       0.43      0.50      0.47       100\nweighted avg       0.76      0.87      0.81       100\n',
 'train_pos_rate': 0.13,
 'test_roc_auc': 0.5722454490970401,
 'test_f1': 0.5,
 'test_precision': 0.0,
 'test_recall': 0.0,
 'test_cr': '              precision    recall  f1-score   support\n\n           0       0.86      1.00      0.93      2067\n           1       0.00      0.00      0.00       335\n\n    accuracy                           0.86      2402\n   macro avg       0.43      0.50      0.46      2402\nweighted avg       0.74      0.86      0.80      2402\n',
 'test_pos_rate': 0.13946711074104912}

In [148]:
flatten_result_dict = {}
flatten_result_dict['round_id'] = []
flatten_result_dict['tag_name'] = []

for round_id, round_result in full_test_project.round_results.items():
    for tag_name, model_scores in round_result.items():
        flatten_result_dict['round_id'].append(round_id)
        flatten_result_dict['tag_name'].append(tag_name)
        for metric_name, metric_value in model_scores.items():
            if metric_name not in flatten_result_dict.keys():
                flatten_result_dict[metric_name] = [metric_value]
            else:
                flatten_result_dict[metric_name].append(metric_value)
flatten_result_df = pd.DataFrame(flatten_result_dict)

In [149]:
flatten_result_df

Unnamed: 0,round_id,tag_name,train_roc_auc,train_f1,train_precision,train_recall,train_cr,train_pos_rate,test_roc_auc,test_f1,test_precision,test_recall,test_cr,test_pos_rate
0,1,lock,0.618037,0.5,0.0,0.0,precision recall f1-score ...,0.13,0.572245,0.5,0.0,0.0,precision recall f1-score ...,0.139467
1,1,video_doorbell,1.0,0.994186,0.933333,1.0,precision recall f1-score ...,0.14,0.945747,0.924933,0.57529,0.955128,precision recall f1-score ...,0.129892
2,1,gate,0.991815,0.988095,0.888889,1.0,precision recall f1-score ...,0.16,0.8623,0.835746,0.788732,0.708861,precision recall f1-score ...,0.164446
3,1,lights,0.412632,0.5,0.0,0.0,precision recall f1-score ...,0.05,0.389951,0.5,0.0,0.0,precision recall f1-score ...,0.059534
4,1,intercom,0.5625,0.5,0.0,0.0,precision recall f1-score ...,0.04,0.651603,0.5,0.0,0.0,precision recall f1-score ...,0.049542
5,1,dog,1.0,0.833333,1.0,0.666667,precision recall f1-score ...,0.03,0.537721,0.550928,0.8125,0.103175,precision recall f1-score ...,0.052456
6,1,guard_ID_resident,0.59447,0.5,0.0,0.0,precision recall f1-score ...,0.07,0.546304,0.5,0.0,0.0,precision recall f1-score ...,0.042048
7,1,security_screen_door,0.956522,0.932065,0.875,0.875,precision recall f1-score ...,0.08,0.764874,0.792365,0.653465,0.6,precision recall f1-score ...,0.045795
8,1,warning_sign,0.917774,0.875415,0.785714,0.785714,precision recall f1-score ...,0.14,0.740556,0.67725,0.672,0.373333,precision recall f1-score ...,0.093672
9,1,security_camera_system,0.744898,0.5,0.0,0.0,precision recall f1-score ...,0.02,0.566984,0.5,0.0,0.0,precision recall f1-score ...,0.029142


## Run the final function from `ui_utils` and save results

In [155]:
import ui_utils
from pathlib import Path
import pickle

In [156]:
project_path = './projects/security_5x100/'

In [159]:
print(">>> Loading project outputs...")
parsed_project = MetaProjectWithRounds(
    project_path=project_path)

print(">>> Output processed. Saving flatten model metrics to project folder")
analytics_dir = f"{project_path.rstrip('/')}/analytics/"
Path(analytics_dir).mkdir(parents=True, exist_ok=True)
parsed_project.flatten_result_df.to_csv(f"{analytics_dir.rstrip('/')}/modeling_metrics.csv", index=False)

print(">>> Saving loaded output object to project folder")
with open(f"{analytics_dir.rstrip('/')}/metaproject.pickle", 'wb') as file:
    pickle.dump(parsed_project, file)

>>> Loading project outputs...
>>> Instantiate MetaProject class...
>>> Loading project orchestration config
>>> Parsing project execution run time
>>> Reading full dataset...
>>> Project ran as simulation...
>>> Loading results for each round...
/home/riversome/Documents/NLP_other_specify_classification/projects/security_5x100/round_1
/home/riversome/Documents/NLP_other_specify_classification/projects/security_5x100/round_2
/home/riversome/Documents/NLP_other_specify_classification/projects/security_5x100/round_3
/home/riversome/Documents/NLP_other_specify_classification/projects/security_5x100/round_4
/home/riversome/Documents/NLP_other_specify_classification/projects/security_5x100/round_5
>>> Output processed. Saving flatten model metrics to project folder
>>> Saving loaded output object to project folder
