In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from tqdm import tqdm_notebook
import sys
import umap
import os

from sklearn.preprocessing import scale
from collections import defaultdict

%matplotlib inline

pd.set_option('display.max_columns', 500)

In [2]:
def sample_data(data):
    proportions = {-1: 85, 0: 10, 1: 5}
    n_size = 20000

    data_chunks = []

    for label in [-1, 0, 1]:
        this_label_rows_num = int(n_size / 100 * proportions[label])
        data_chunks.append(data[data.label == label].sample(this_label_rows_num))

    return pd.concat(data_chunks, axis=0)


def preprocess_for_mapper(data):
    NOT_USED_IN_MAPPER = ["P_OFFLINEOPERATIONID", "P_DOCCATEGORY", "P_EKNPCODE", "P_CURRENCYCODE", 
                          "operation_id", "label", "fp_model_prediction"]
    
    fp_model_prediction = data.fp_model_prediction
    data_operation_ids = data.operation_id
    data = data.drop(NOT_USED_IN_MAPPER, axis=1)
    data = pd.DataFrame(data=scale(data), columns=data.columns)
    
    return data, fp_model_prediction, data_operation_ids

In [4]:
data = pd.read_csv("/home/evgenii/tda/data_full.csv")
data_sample = sample_data(data)
data_sample, fp_model_prediction, data_operation_ids = preprocess_for_mapper(data_sample)



In [5]:
import kmapper as km
from sklearn.cluster import DBSCAN

mapper = km.KeplerMapper(verbose=1)
res = mapper.map(fp_model_prediction, data_sample, DBSCAN(eps=1), nr_cubes=10)
_ = mapper.visualize(res, path_html="make_circles_keplermapper_output.html", color_function=1-fp_model_prediction)

Mapping on data shaped (20000, 9) using lens shaped (20000,)

Creating 10 hypercubes.

Created 8 edges and 22 nodes in 0:00:04.315893.
Wrote visualization to: make_circles_keplermapper_output.html


  color_function = color_function.reshape(-1, 1)


In [6]:
def load_section_features():
    path_to_shared_data = 'home/shared_files/'
    path_to_baseamount_data = '/home/const.belev/spring_2018/beconstant/notebooks/datasets/baseamount_agg_mean_count_std_week/'
    path_to_graph_data = '/home/const.belev/spring_2018/beconstant/notebooks/datasets/transaction_graph/'
    path_to_agg_selection_models = '/home/const.belev/spring_2018/beconstant/notebooks/saved_models/agg_feature_selection/'

    with open(os.path.join(path_to_graph_data, 'graph_trans_df.pkl'), 'rb') as handle:
        graph_trans_df = pickle.load(handle)

    with open(os.path.join(path_to_baseamount_data, 'colnames_source_in.pkl'), 'rb') as handle:
        colnames_source_in = pickle.load(handle)

    with open(os.path.join(path_to_baseamount_data, 'colnames_source_out.pkl'), 'rb') as handle:
        colnames_source_out = pickle.load(handle)

    with open(os.path.join(path_to_baseamount_data, 'colnames_target_in.pkl'), 'rb') as handle:
        colnames_target_in = pickle.load(handle)

    with open(os.path.join(path_to_baseamount_data, 'colnames_target_out.pkl'), 'rb') as handle:
        colnames_target_out = pickle.load(handle)

    colnames_all = np.array(colnames_source_in + colnames_source_out + colnames_target_in + colnames_target_out)
    baseamount_agg_sparse = load_npz(os.path.join(path_to_baseamount_data, 'baseamount_agg_sparse.npz')).tocsr()
    
    return baseamount_agg_sparse, colnames_all

In [10]:
%%time
from scipy.sparse import load_npz

cleaned_off_ops = pd.read_csv("/home/const.belev/spring_2018/beconstant/notebooks/datasets/preprocessed/off_ops.csv")
section_features, section_colnames = load_section_features()

with open("/home/evgenii/tda/operation_ids.pkl", "rb") as f:
    operation_ids = pd.Series(pickle.load(f))



CPU times: user 2min 12s, sys: 29.1 s, total: 2min 41s
Wall time: 2min 45s


In [11]:
OFFLINE_OPS_FEATURES_TO_TAKE = ["P_CURRENCYCODE", "P_BASEAMOUNT", "P_CURRENCYAMOUNT", "P_EKNPCODE", "P_DOCCATEGORY"]

offline_operation_features = cleaned_off_ops[OFFLINE_OPS_FEATURES_TO_TAKE]

In [12]:
def get_features_for_indices(indices):
    this_op_ids = set(data_operation_ids.iloc[indices].values)
    indices_mask = operation_ids.isin(this_op_ids)
    
    submatrix = section_features[np.where(indices_mask)[0]]
    sparse_df = pd.SparseDataFrame(data=submatrix, columns=section_colnames).fillna(0)
    offline_operation_features_for_idx = offline_operation_features.loc[indices_mask].fillna(0)
    
    for col in offline_operation_features_for_idx.columns:
        sparse_df[col] = offline_operation_features_for_idx[col].values
        
    return sparse_df

def get_features_for_cluster(cluster_name):
    cluster_nodes = res["nodes"][cluster_name]
    
    non_cluster_size = min(data_sample.shape[0]-len(cluster_nodes), 2 * len(cluster_nodes))
    non_cluster_nodes = np.random.choice(list(set(range(data_sample.shape[0])) - set(cluster_nodes)), replace=False, size=non_cluster_size)

    cluster_features = get_features_for_indices(cluster_nodes)
    non_cluster_features = get_features_for_indices(non_cluster_nodes)
    
    return cluster_features, non_cluster_features

cluster_features, non_cluster_features = get_features_for_cluster("cube9_cluster1")

In [13]:
def add_sum_columns(data):
    for col in filter(lambda x: "mean" in x, data.columns):
        cnt_col = col.replace("mean", "count")
        sum_col = col.replace("mean", "sum")

        data[sum_col] = pd.SparseSeries(data[col].to_dense() * data[cnt_col].to_dense())

def remove_unnecessary_columns(data):
    COLS_MASKS_TO_DROP = [":std:"]
    cols_to_drop = list(filter(lambda x: any(mask in x for mask in COLS_MASKS_TO_DROP), data.columns))
    data.drop(cols_to_drop, axis=1, inplace=True)
    
def process_columns(data):
    #add_sum_columns(data)
    remove_unnecessary_columns(data)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import ks_2samp
from statsmodels.sandbox.stats.multicomp import multipletests

def get_importances(cluster_features, non_cluster_features):
    labels = np.hstack([np.ones_like(cluster_features.index), np.zeros_like(non_cluster_features.index)])
    data = pd.concat([cluster_features, non_cluster_features], axis=0)
    
    rf = RandomForestClassifier(n_estimators=250, n_jobs=-1)
    rf.fit(data, labels)
    
    return sorted(zip(cluster_features.columns, rf.feature_importances_), key=lambda x: x[1], reverse=True)

def get_ks_scores(cluster_features, non_cluster_features):
    ks_pvalues = []
    ks_scores = []

    for col in cluster_features.columns:
        ks_res = ks_2samp(cluster_features[col].to_dense(), non_cluster_features[col].to_dense())
        ks_pvalues.append(ks_res.pvalue)
        ks_scores.append(ks_res.statistic)
    fixed_pvalues = multipletests(ks_pvalues)
    
    return list(zip(cluster_features, ks_scores, fixed_pvalues[1], fixed_pvalues[0]))

In [32]:
from IPython.display import display, HTML

def get_feature_description(name, cluster_features, non_cluster_features):
    bigger = cluster_features[name].mean() > non_cluster_features[name].mean()
    relation_description = "bigger" if bigger else "lesser"
    
    tokens = name.split(":")
    
    if len(tokens) == 1:
        return "{name} is {relation} than usual".format(name=name, relation=relation_description)
    
    type_description = "Sum" if tokens[1] == "sum" else "Number"
    data_section_description = "{0}={1}".format(tokens[0], tokens[2])
    if float(tokens[2]) == -1000:
        data_section_description = "without {0}".format(tokens[0])
    account_role_description = "debit" if tokens[3] == "target" else "credit"
    
    description = "{type} of transactions with {section} in {role} account is {relation} that usual".format(
        type=type_description, section=data_section_description, role=account_role_description,
        relation=relation_description)
    
    return description

def _get_differences(cluster_features, non_cluster_features):
    imps = get_importances(cluster_features, non_cluster_features)
    #ks_res = get_ks_scores(cluster_features, non_cluster_features)
    
    samples = []
    for name, imp in imps[:5]:
        description = get_feature_description(name, cluster_features, non_cluster_features)
        samples.append({"description": description, "score": imp * 100})
    return pd.DataFrame(samples)

def explain_cluster(cluster_name, min_samples_to_explain=30):
    cluster_features, non_cluster_features = get_features_for_cluster(cluster_name)
    
    if cluster_features.shape[0] < min_samples_to_explain:
        return None
    
    process_columns(cluster_features)
    process_columns(non_cluster_features)
    
    return _get_differences(cluster_features, non_cluster_features)

In [16]:
import IPython

def display_tda(path_html):
    iframe = '<iframe src=' + path_html \
            + ' width=100%% height=800 frameBorder="0"></iframe>'
    IPython.core.display.display(IPython.core.display.HTML(iframe))

In [17]:
clusters = res["nodes"].items()

for name, samples in clusters:
    print(name, len(samples))
    
display_tda("make_circles_keplermapper_output.html")

cube0_cluster0 15580
cube0_cluster1 375
cube0_cluster2 142
cube0_cluster3 13
cube1_cluster0 2768
cube2_cluster0 12
cube3_cluster0 6
cube4_cluster0 14
cube5_cluster0 42
cube6_cluster0 42
cube7_cluster0 69
cube8_cluster0 241
cube8_cluster1 6
cube9_cluster0 600
cube9_cluster1 66
cube9_cluster2 20
cube9_cluster3 5
cube9_cluster4 25
cube9_cluster5 5
cube9_cluster8 5
cube9_cluster7 5
cube9_cluster6 6


In [21]:
%%time
cluster_diffs = {name:explain_cluster(name) for name, _ in clusters}

CPU times: user 2min 32s, sys: 9.66 s, total: 2min 42s
Wall time: 1min 38s


In [20]:
from IPython.display import display, HTML

with pd.option_context('display.max_colwidth', 500, "display.float_format", "{0:.2f}".format):
    display(cluster_diffs["cube9_cluster1"])

Unnamed: 0,description,score
0,Number of transactions with without P_EKNPCODE in credit account is bigger that usual,2.74
1,Number of transactions with P_EKNPCODE=681 in credit account is bigger that usual,2.12
2,P_CURRENCYAMOUNT is bigger than usual,1.86
3,Number of transactions with P_DOCCATEGORY=10 in credit account is bigger that usual,1.79
4,Number of transactions with P_EKNPCODE=490 in credit account is bigger that usual,1.78
