In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import math
from evaluation_metrics import entropy

import sys
import logging
import time

import seaborn
import matplotlib.pyplot as plt

from copy import deepcopy
from typing import Any, List, Optional, Union
from multiprocessing import Manager, Process, Queue, cpu_count

from util import downsize
from evaluation_metrics import metrics, cleanup
from description import Description
from subgroup import Subgroup
from beam import Beam
from workers import evaluate_subgroups, beam_adder

In [2]:
## GENERATE DATA ##
datasize = 2000
randomvariables = 9
## GENERATE DATA ##

predictor = list(np.random.normal(10,1,datasize))
errorsd = 0.5
result = []

# we doctor our variables in a non random way
v1 = [0,1,0,1,0]*int(datasize/5)
v2 = [0]*int(datasize/5*3) + [1]*int(datasize/5*2)


noisevars = [list(np.random.binomial(1,0.4,datasize)) for _ in range(randomvariables-2)]

variables = [v1, v2] +noisevars
# generate result;
# result data where first two variables are both 1 is different
for i in range(datasize):
    if variables[0][i] == 1 and variables[1][i] == 1:
        result.append(16* predictor[i]  + np.random.normal(0,errorsd) )
    # elif variables[0][i] == 1 and variables[2][i] == 1:
    #     result.append(4 * predictor[i]   + np.random.normal(0,errorsd) )
    else:
        result.append(10* predictor[i]  + np.random.normal(0,errorsd) )


# create a dataframe with number i as column title with the before generated columns
df = pd.DataFrame({i:ls for i,ls in enumerate(variables)})

df['result'] = result
df['predictor'] = predictor

In [3]:
# # JACCARD EMM INIT
# width=20
# depth=2
# evaluation_metric='regression'
# n_jobs=-1
# log_level=1

# #defaults:
# strategy = 'maximize'
# n_bins = 10
# bin_strategy = 'equidepth'
# candidate_size = None
# log_level=50

# depth = depth
# evaluation_metric = evaluation_metric
# if n_jobs == -1:
#     n_jobs = cpu_count()
# else:
#     n_jobs = min(n_jobs, cpu_count())
# if hasattr(evaluation_metric, '__call__'):
#     evaluation_function = evaluation_metric
# else:
#     try:
#         evaluation_function = metrics[evaluation_metric]
#     except KeyError:
#         raise ValueError(f"Nu such metric: {evaluation_metric}")
# settings = dict(
#     strategy=strategy,
#     width=width,
#     n_bins=n_bins,
#     bin_strategy=bin_strategy,
#     candidate_size=candidate_size
# )
# beam = None
# target_columns = None
# dataset_target = None
# dataset = None

In [4]:
# EMM INIT
class EMM():
    def __init__(self, width, depth, evaluation_metric='regression', strategy = 'maximize',
                 n_bins = 10, bin_strategy = 'equidepth', candidate_size = None, log_level=1 ) -> None:
        """Initialization for the beam search exceptional model mining procedure"""
        logging.basicConfig(filename=None, level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')
        # removed the n_jobs from code, because we removed the multicore processing
        if hasattr(evaluation_metric, '__call__'):
            self.evaluation_function = evaluation_metric
        else:
            try:
                self.evaluation_function = metrics[evaluation_metric]
            except KeyError:
                raise ValueError(f"Nu such metric: {evaluation_metric}")
        self.settings = dict(
            strategy=strategy,
            width=width,
            n_bins=n_bins,
            bin_strategy=bin_strategy,
            candidate_size=candidate_size
        )
        self.beam = None
        self.target_columns = None
        self.dataset_target = None
        self.dataset = None
        self.depth = depth
    
    def set_data(self, data:pd.Dataframe, target_cols):
        """This method takes a dataset and prepares it for the beam search"""
        logging.info("Start")
        self.data, translations = downsize(deepcopy(data))
        self.settings['object_cols'] = translations
        dataset = Subgroup(data, Description('all'))
        _, dataset.target = self.evaluation_function(data[target_cols], data[target_cols])
        self.regressioncache = dataset.target
        self.beam = Beam(dataset, self.settings)
        target_cols = list(target_cols,)
        descriptive_cols = [c for c in data.columns if c not in target_cols]
        if any(c not in data.columns for c in descriptive_cols + target_cols):
            raise ValueError("All specified columns should be present in the dataset")
        self.dataset_target = data[target_cols]
        self.target_columns = target_cols
    
    def subgroupify(self):
        subgroups = []
        for subgroup in self.beam.subgroups:
                for col in self.descriptive_cols:
                    newgroups = create_subgroup_lists(subgroup, col, self.settings)
                    subgroups = subgroups + newgroups
        self.candidates = subgroups
    
    def calc_score(self, p = True):
        for candidate in self.candidates:
            candidate_target = candidate.data[self.target_columns]
            candidate.score, candidate.target = regression(candidate_target, self.dataset_target, comparecache=self.regressioncache)
            self.beam.add(candidate)
        self.beam.select_cover_based()
        if p== True:
            self.beam.print()
        else:
            logging.info("finished an iteration")
    
    def increase_depth(self,iterations = 1):
        for _ in range(iterations):
            self.subgroupify()
            self.calc_score(p=False)
        self.beam.print()

In [1]:
def regression(subgroup_target, dataset_target, comparecache, use_complement=False):
    if len(subgroup_target) < 20: # less than 20 rows is not enough to build on.
        return 0, None
    if len(subgroup_target.columns) != 2:
        raise ValueError("Correlation metric expects exactly 2 columns as target variables")
    x_col, y_col = list(subgroup_target.columns)
    est = sm.OLS(subgroup_target[y_col], subgroup_target[x_col])
    est = est.fit()
    coef = est.summary2().tables[1]['Coef.'][x_col]
    p = est.summary2().tables[1]['P>|t|'][x_col]
    if math.isnan(p):
        return 0, 0
    if (1 - p) < 0.99:
        return 0, 0
    return entropy(subgroup_target, dataset_target) * abs(coef - comparecache), coef

def entropy(subgroup_target, dataset_target):
    n_c = max(1, len(dataset_target) - len(subgroup_target))
    n = len(subgroup_target)
    N = len(dataset_target)
    return -n/N * math.log(n/N) - n_c/N * math.log(n_c/N)

def create_subgroup_lists(subgroup: Subgroup, column: str, settings: dict):
    """This function takes a subgroup and column and makes all the possible subgroup splits on that column
    This is then returned as a list"""
    resultinggroups = []
    if column in subgroup.description:
        return []
    data = subgroup.data
    values = list(data[column].unique())
    if len(values) == 1:  # No need to make a split for a single value
        return []
    if column in settings['object_cols'] or len(values) < settings['n_bins']:
        while len(values) > 0:
            value = values.pop(0)
            subset = data[data[column] == value]
            resultinggroups.append( Subgroup(subset, deepcopy(subgroup.description).extend(column, value)))
    else:  # Float or Int
        if settings['bin_strategy'] == 'equidepth':
            _, intervals = pd.qcut(data[column].tolist(), q=min(settings['n_bins'], len(values)),
                                   duplicates='drop', retbins=True)
        else:
            raise ValueError(f"Invalid bin strategy `{settings['strategy']}`")
        intervals = list(intervals)
        lower_bound = intervals.pop(0)
        while len(intervals) > 0:
                upper_bound = intervals.pop(0)
                subset = data[(data[column] > lower_bound) & (data[column] <= upper_bound)]
                resultinggroups.append( Subgroup(subset, deepcopy(subgroup.description).extend(column, [lower_bound, upper_bound])) )
                lower_bound = upper_bound
    return resultinggroups

NameError: name 'Subgroup' is not defined

In [5]:
# SETTING DATA
target_cols = ['result','predictor']
data = df






2024-10-17 16:30:58,125 - INFO - Start
2024-10-17 16:30:58,134 - INFO - Memory usage before downsizing 117.31 MB
2024-10-17 16:30:58,139 - INFO - Memory usage after downsizing 33.33 MB


In [7]:
print('making subgroups', end='')



making subgroups done


In [9]:
# EVALUATE and add


2024-10-17 16:30:58,385 - DEBUG - --------------------
2024-10-17 16:30:58,386 - DEBUG - 1 = 0 0.008291245956183543 (1200)
2024-10-17 16:30:58,387 - DEBUG - 0 = 0 0.008274994520212456 (1200)
2024-10-17 16:30:58,387 - DEBUG - 0 = 1 0.00769718852873312 (800)
2024-10-17 16:30:58,388 - DEBUG - 1 = 1 0.007609853983744397 (800)
2024-10-17 16:30:58,388 - DEBUG - 3 = 1 0.0006933671099866324 (786)
2024-10-17 16:30:58,389 - DEBUG - 2 = 1 0.0006109023041132762 (812)
2024-10-17 16:30:58,389 - DEBUG - 5 = 1 0.0005381518640466143 (804)
2024-10-17 16:30:58,390 - DEBUG - 3 = 0 0.00046516364092260074 (1214)
2024-10-17 16:30:58,390 - DEBUG - 7 = 1 0.0004419369443377553 (790)
2024-10-17 16:30:58,391 - DEBUG - 2 = 0 0.0004255474017664676 (1188)
2024-10-17 16:30:58,391 - DEBUG - 5 = 0 0.00035221473462178614 (1196)
2024-10-17 16:30:58,391 - DEBUG - 7 = 0 0.0002867031434434397 (1210)
2024-10-17 16:30:58,392 - DEBUG - 4 = 1 0.0001686347946487329 (780)
2024-10-17 16:30:58,392 - DEBUG - 4 = 0 0.0001060936824480

In [10]:

subgroups = []
for subgroup in beam.subgroups:
        for col in descriptive_cols:
            newgroups = create_subgroup_lists(subgroup, col, settings)
            subgroups = subgroups + newgroups
# EVALUATE and add
for subgroup in subgroups:
    subgroup_target = subgroup.data[target_columns]
    subgroup.score, subgroup.target = regression(subgroup_target, dataset_target, comparecache=regressioncache)
    beam.add(subgroup)
beam.select_cover_based()
beam.print()

making subgroups done


2024-10-17 16:31:00,740 - DEBUG - --------------------
2024-10-17 16:31:00,740 - DEBUG - 0 = 1 AND 1 = 1 0.011073179339469363 (320)
2024-10-17 16:31:00,740 - DEBUG - 0 = 0 AND 2 = 0 0.008125899369548757 (748)
2024-10-17 16:31:00,740 - DEBUG - 1 = 0 AND 8 = 0 0.00811624858535081 (737)
2024-10-17 16:31:00,740 - DEBUG - 1 = 0 AND 3 = 0 0.00810906525765092 (738)
2024-10-17 16:31:00,740 - DEBUG - 0 = 0 AND 4 = 0 0.00808145088910405 (735)
2024-10-17 16:31:00,740 - DEBUG - 1 = 0 AND 7 = 0 0.008099322650937086 (733)
2024-10-17 16:31:00,740 - DEBUG - 0 = 0 AND 8 = 0 0.00806677039035524 (730)
2024-10-17 16:31:00,740 - DEBUG - 0 = 0 AND 3 = 0 0.008059772180217724 (726)
2024-10-17 16:31:00,740 - DEBUG - 1 = 0 AND 5 = 0 0.008060981183114926 (720)
2024-10-17 16:31:00,748 - DEBUG - 1 = 0 AND 4 = 0 0.00805223528804073 (721)
2024-10-17 16:31:00,748 - DEBUG - 1 = 0 AND 2 = 0 0.008009663592855602 (707)
2024-10-17 16:31:00,748 - DEBUG - 0 = 0 AND 7 = 0 0.007988196094674434 (704)
2024-10-17 16:31:00,748 - 

In [11]:
beam.decrypt_descriptions(translations)
beam.print()
cleanup()

2024-10-17 16:31:00,758 - DEBUG - --------------------
2024-10-17 16:31:00,762 - DEBUG - 0 = 1 AND 1 = 1 0.011073179339469363 (320)
2024-10-17 16:31:00,762 - DEBUG - 0 = 0 AND 2 = 0 0.008125899369548757 (748)
2024-10-17 16:31:00,763 - DEBUG - 1 = 0 AND 8 = 0 0.00811624858535081 (737)
2024-10-17 16:31:00,763 - DEBUG - 1 = 0 AND 3 = 0 0.00810906525765092 (738)
2024-10-17 16:31:00,764 - DEBUG - 0 = 0 AND 4 = 0 0.00808145088910405 (735)
2024-10-17 16:31:00,764 - DEBUG - 1 = 0 AND 7 = 0 0.008099322650937086 (733)
2024-10-17 16:31:00,765 - DEBUG - 0 = 0 AND 8 = 0 0.00806677039035524 (730)
2024-10-17 16:31:00,765 - DEBUG - 0 = 0 AND 3 = 0 0.008059772180217724 (726)
2024-10-17 16:31:00,766 - DEBUG - 1 = 0 AND 5 = 0 0.008060981183114926 (720)
2024-10-17 16:31:00,767 - DEBUG - 1 = 0 AND 4 = 0 0.00805223528804073 (721)
2024-10-17 16:31:00,767 - DEBUG - 1 = 0 AND 2 = 0 0.008009663592855602 (707)
2024-10-17 16:31:00,768 - DEBUG - 0 = 0 AND 7 = 0 0.007988196094674434 (704)
2024-10-17 16:31:00,769 - 