In [13]:
import os
import json
import subprocess
import pickle

from sklearn.utils.estimator_checks import check_estimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

import ReadData
import cString2json as cString2json
import json2graphNoLeafEdgesWithSplitValues as json2graphNoLeafEdgesWithSplitValues
from fitModels import fitModels
import DecisionSnippetFeatures as DecisionSnippetFeatures
import pruning
import Forest
import datetime
from util import writeToReport
import numpy as np
# %% Parameters.

# Input/output locations (relative paths).
dataPath = "../data/"
forestsPath = "../tmp/forests/"
snippetsPath = "../tmp/snippets/"
resultsPath = "../tmp/results/"
reportsPath = "../tmp/reports/"

# Data set to process. Currently valid options:
#   'sensorless', 'satlog', 'mnist', 'magic', 'spambase',
#   'letter', 'bank', 'adult', 'drinking'
dataSet = 'satlog'
# dataSet = 'adult'
# dataSet = 'drinking'

# Forest configuration. Possible forest types: 'RF', 'DT', 'ET'.
forest_types = ['RF']
forest_depths = [5, 10, 15, 20]
sigma_values = [0.0, 0.1, 0.2, 0.3]
forest_size = 25

# NOTE(review): presumably bounds for the snippet mining stage, which is not
# part of the code visible here -- confirm against the mining cell.
maxPatternSize = 6
minThreshold = 2
maxThreshold = 25

# Scoring name handed to cross-validation during training.
scoring_function = 'accuracy'

# Learner classes trained on top of the Decision Snippet Features, keyed by
# the label that appears in the printed output and in the report.
learners = {
    'DSF_NB': MultinomialNB,
    'DSF_SVM': LinearSVC,
    'DSF_LR': LogisticRegression,
}

# Keyword arguments passed to each learner's constructor (same keys as `learners`).
learners_parameters = {
    'DSF_NB': {},
    'DSF_SVM': {'max_iter': 10000},
    'DSF_LR': {'max_iter': 1000},
}

# Stage switches. Only `run_training` is consulted by the code visible here.
run_fit_models = True
run_mining = True
run_training = True
run_eval = True

# Load the train and test splits of the selected data set.
X_train, Y_train = ReadData.readData(dataSet, 'train', dataPath)
X_test, Y_test = ReadData.readData(dataSet, 'test', dataPath)
X = X_train  # second name for the training features (same object, no copy)

# Per-data-set report location. os.path.join copes with the trailing
# separator already present in reportsPath, so no doubled '/' ends up in the
# path, and it matches how the results directory is built further down.
report_model_dir = os.path.join(reportsPath, dataSet)
report_file = os.path.join(report_model_dir, 'report.txt')

# exist_ok=True makes this idempotent and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs(report_model_dir, exist_ok=True)
    
    
def dsf_transform(snippets_file, X):
    """Encode samples as one-hot Decision Snippet Features.

    Loads the frequent patterns stored as JSON in ``snippets_file``, builds a
    ``FrequentSubtreeFeatures`` transformer from their ``'pattern'`` entries,
    applies it to ``X`` and one-hot encodes the result so it can be consumed
    by linear methods.

    Parameters
    ----------
    snippets_file : path of the JSON file to read; every loaded entry must
        provide a ``'pattern'`` key.
    X : samples handed to ``fit_transform`` of the snippet feature transformer.

    Returns
    -------
    The dense array produced by ``toarray()`` on the one-hot encoded features.
    """
    # Only the JSON parsing needs the open file handle.
    with open(snippets_file, 'r') as snippets_handle:
        patterns = json.load(snippets_handle)

    snippet_features = DecisionSnippetFeatures.FrequentSubtreeFeatures(
        map(lambda entry: entry['pattern'], patterns))
    raw_features = snippet_features.fit_transform(X)

    # The categories are taken from the fitted transformer rather than
    # inferred by the encoder from the data.
    encoder = OneHotEncoder(categories=snippet_features.get_categories())
    return encoder.fit_transform(raw_features).toarray()

def train_model_on_top(model, fts_onehot, Y_train, scoring_function, model_name, descriptor, scaling=False):
    """Cross-validate and then fit one learner on Decision Snippet Features.

    Parameters
    ----------
    model : unfitted estimator instance.
    fts_onehot : feature matrix used for cross-validation and the final fit.
    Y_train : labels belonging to ``fts_onehot``.
    scoring_function : value passed as ``scoring`` to ``cross_val_score``.
    model_name : label used in the printed/report line; also the name of the
        estimator step when ``scaling`` is enabled.
    descriptor : free text identifying the feature set (the snippet file name
        in the loop below).
    scaling : when True, the model is wrapped in a ``Pipeline`` that applies
        ``StandardScaler`` first; the returned model is then that pipeline.

    Returns
    -------
    Tuple ``(mean_score, fitted_model, fold_scores)`` where ``fold_scores`` is
    the result of the 5-fold ``cross_val_score`` call.
    """
    if scaling:
        model = Pipeline([('scaler', StandardScaler()), (model_name, model)])

    fold_scores = cross_val_score(model, fts_onehot, Y_train, cv=5, scoring=scoring_function)

    dsf_score = fold_scores.mean()
    dsf_std = fold_scores.std()
    print(f'{model_name} {descriptor} {dsf_score} +- {dsf_std}')
    writeToReport(report_file, str(model_name) + '\t' + str(descriptor) + '\t' + str(dsf_score) + ' +- ' + str(dsf_std))

    # cross_val_score works on clones, so the model itself still has to be
    # fitted on the full training data before it is returned.
    model.fit(fts_onehot, Y_train)
    return dsf_score, model, fold_scores


start_training_total = datetime.datetime.now()

if run_training:
    print('\n\nFEEL FREE TO IGNORE THIS OUTPUT\n')

    results_list = list()

    # Directory and file that receive the cross-validation results.
    results_dir = os.path.join(resultsPath, dataSet)
    os.makedirs(results_dir, exist_ok=True)
    results_file = os.path.join(results_dir, "training_xval.pkl")

    # os.listdir returns entries in arbitrary order; sorting keeps the report
    # and the pickled results list reproducible between runs.
    snippets_dir = os.path.join(snippetsPath, dataSet)
    graph_files = sorted(name for name in os.listdir(snippets_dir) if name.endswith('.json'))

    # Train several models on the various decision snippet features and
    # collect all cross-validation results on the training data in a list.
    for graph_file in graph_files:

        start_training = datetime.datetime.now()

        # Decision Snippet Features for this snippet file.
        fts_onehot = dsf_transform(os.path.join(snippets_dir, graph_file), X_train)

        for model_type, model_class in learners.items():
            xval_score, learner_model, xval_results = train_model_on_top(
                model_class(**learners_parameters[model_type]),
                fts_onehot, Y_train, scoring_function, model_type, graph_file)
            results_list.append((xval_score, model_type, graph_file, learner_model, xval_results))

        # Rewrite the complete results list after each snippet file so that
        # partial results survive an interrupted run.
        with open(results_file, 'wb') as f_pickle:
            pickle.dump(results_list, f_pickle)

        end_training = datetime.datetime.now()
        training_time = (end_training - start_training)
        training_message = 'Training Time for ' + graph_file + ' : ' + str(training_time)
        print(training_message)
        writeToReport(report_file, training_message)

# The total is reported whether or not the training stage ran.
end_training_total = datetime.datetime.now()
training_total_time = (end_training_total - start_training_total)
print('Total Training Time: ' + str(training_total_time))
writeToReport(report_file, 'Total Training Time: ' + str(training_total_time) + '\n')




FEEL FREE TO IGNORE THIS OUTPUT

DSF_NB RF_5_t2_1.json 0.42998872604284105 +- 0.010199037544891675
DSF_SVM RF_5_t2_1.json 0.42998872604284105 +- 0.010199037544891675
DSF_LR RF_5_t2_1.json 0.42998872604284105 +- 0.010199037544891675
Training Time for RF_5_t2_1.json : 0:00:00.595568
DSF_NB RF_5_t2_10_3_6.json 0.7767756482525366 +- 0.056211728804139396
DSF_SVM RF_5_t2_10_3_6.json 0.8060879368658398 +- 0.044150175076655855
DSF_LR RF_5_t2_10_3_6.json 0.8013528748590757 +- 0.04194396936488359
Training Time for RF_5_t2_10_3_6.json : 0:00:06.271623
Total Training Time: 0:00:06.869131


In [15]:
# Scratch computation: x**179 reduced modulo 56160.
x = 5
y = pow(x, 179)  # take the base from x instead of repeating the literal 5
print(y % 56160)

43805
