In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

# Show up to 500 columns when a wide DataFrame is rendered in the notebook.
pd.options.display.max_columns = 500

In [2]:
%%time

# Columns removed right after loading; none of them is read by the code in this notebook section.
cols_to_drop = ["P_SUM", "P_MESS_DATE", "P_USERNAME", "P_ORIG_OPER_REASON", "P_BANKOPERATIONID", "P_DATE_UPDATE",
                "P_MESS_STATUS", "P_OPERATIONEXTRAINFO", "P_OPERATIONREASON", "P_OPERATION_LIST", "P_MESS_NUMBER"]

# drop(columns=...) returns a new frame, so re-running the cell never mutates a half-processed `df`.
df = pd.read_csv("/home/shared_files/alfa/susp_ops.csv").drop(columns=cols_to_drop)

# Keep only rows whose P_SENDTOKFMBOOL is one of the listed codes. The explicit .copy() makes the
# result an independent frame, so adding the "label" column below does not raise
# SettingWithCopyWarning.
df = df.loc[df.P_SENDTOKFMBOOL.isin([1, 2, 7, 8])].copy()

# Boolean target: True for codes 1, 5, 7, 8.
# NOTE(review): code 5 can never occur here because the filter above keeps only 1, 2, 7, 8 --
# confirm whether 5 should be added to the filter or removed from this list.
df["label"] = df.P_SENDTOKFMBOOL.isin([1, 5, 7, 8])



CPU times: user 5.06 s, sys: 769 ms, total: 5.83 s
Wall time: 5.83 s


In [6]:
from itertools import chain

def find_rules_for_stc(df, column_list, stc, low=0.0001, high=0.9999, top_n=5):
    """Find single-column values for which ``label`` is (almost) constant.

    For every column in ``column_list`` the frame is grouped by that column and
    the mean of ``label`` is computed per value. A value becomes a rule when the
    mean is below ``low`` (label is almost always 0) or above ``high`` (label is
    almost always 1).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``label`` column and every column in ``column_list``.
    column_list : iterable of str
        Columns to scan for rules.
    stc : object
        Value stored unchanged under the ``"stc"`` key of every rule.
    low, high : float, optional
        Thresholds on the per-value mean of ``label``; defaults reproduce the
        original hard-coded 0.0001 / 0.9999.
    top_n : int, optional
        Maximum number of rules returned (default 5).

    Returns
    -------
    list of dict
        Up to ``top_n`` rules ordered by ``"size"`` descending. Each rule has the
        keys ``col``, ``col_value``, ``mean`` (0 or 1), ``size``, ``all_size``,
        ``cover_frac`` (``size / all_size``) and ``stc``.
    """
    total_rows = df.shape[0]
    rules = []

    for col in column_list:
        grouped = df.groupby(col)["label"].agg(["size", "mean", "sum"])
        candidates = grouped[(grouped["mean"] < low) | (grouped["mean"] > high)]
        for col_value, row in candidates.iterrows():
            rules.append({
                "col": col,
                "col_value": col_value,
                # Round, do not truncate: a mean such as 0.99995 passes the `high`
                # threshold and must be reported as class 1, whereas int() gives 0.
                "mean": int(round(row["mean"])),
                "size": row["size"],
                "all_size": total_rows,
                "cover_frac": row["size"] / total_rows,
                "stc": stc,
            })
    return sorted(rules, key=lambda rule: rule["size"], reverse=True)[:top_n]

def find_rules(df, column_list):
    """Collect rules separately for every value of ``P_SUSPICIOUSTYPECODE``.

    The frame is split by ``P_SUSPICIOUSTYPECODE``; subsets with fewer than
    1000 rows are ignored. For each remaining subset the rules returned by
    ``find_rules_for_stc`` are stored under that code.

    Returns a dict mapping suspicious-type code -> list of rule dicts.
    """
    stc_values = df["P_SUSPICIOUSTYPECODE"]
    rules_by_stc = {}

    for code in stc_values.unique():
        subset = df.loc[stc_values == code]
        # Only codes with at least 1000 rows are mined for rules.
        if len(subset) >= 1000:
            rules_by_stc[code] = find_rules_for_stc(subset, column_list, code)

    return rules_by_stc

In [7]:
# Columns scanned for values that (almost) fully determine the label.
to_look_into = [
    "P_BRANCH",
    "P_CURRENCYCODE",
    "P_EKNPCODE",
    "P_DOCCATEGORY",
    "P_SUSPIC_KIND",
    "P_CRITERIAFIRST",
    "P_CRITERIASECOND",
]

# Rules per suspicious-type code, then flattened into one table (one row per rule).
rules = find_rules(df, to_look_into)
stats = pd.DataFrame(list(chain(*rules.values())))

Рассмотрим полученные правила. В колонке `cover_frac` указана доля случаев, покрытых данным правилом, среди всех случаев с данным кодом подозрительности (`stc`). В колонке `size` указано абсолютное количество случаев, попадающих под данное правило.

In [10]:
# Ten rules covering the largest number of rows.
stats.sort_values("size", ascending=False).iloc[:10]

Unnamed: 0,all_size,col,col_value,cover_frac,mean,size,stc
55,203373,P_CRITERIAFIRST,201.0,0.738259,0,150142.0,20.0
56,203373,P_EKNPCODE,390.0,0.420366,0,85491.0,20.0
57,203373,P_CRITERIAFIRST,202.0,0.029217,0,5942.0,20.0
58,203373,P_EKNPCODE,911.0,0.013615,0,2769.0,20.0
59,203373,P_EKNPCODE,213.0,0.007277,0,1480.0,20.0
25,72651,P_BRANCH,4305.0,0.015237,0,1107.0,1671.0
75,36018,P_BRANCH,2805.0,0.024988,1,900.0,521.0
76,36018,P_BRANCH,2700.0,0.016436,1,592.0,521.0
70,3176,P_BRANCH,2765.0,0.185453,1,589.0,221.0
50,1873,P_BRANCH,2765.0,0.307528,1,576.0,211.0


In [12]:
# Hand-picked rules: suspicious-type code -> list of (column, value) pairs.
# A row matches when its value in `column` equals `value` for any listed pair.
alfa_rules = {
    20: [
        ("P_CRITERIAFIRST", 201),
        ("P_EKNPCODE", 390),
        ("P_CRITERIAFIRST", 202),
    ],
}

def check_fp(x, rules):
    """Tell whether row ``x`` matches any rule registered for its type code.

    ``rules`` maps a ``P_SUSPICIOUSTYPECODE`` value to a list of
    ``(column, value)`` pairs. Returns True when ``x[column] == value`` for at
    least one pair listed under the row's code; False when the code has no
    rules or none of them matches.
    """
    conditions = rules.get(x["P_SUSPICIOUSTYPECODE"], ())
    return any(x[col] == value for col, value in conditions)

# Flag every row that matches one of the hand-picked rules (evaluated row by row).
df["mask"] = df.apply(lambda row: check_fp(row, alfa_rules), axis=1)

In [13]:
# Share of label == 0 rows that are flagged by the rules.
df.loc[df["label"] == 0, "mask"].mean()

0.48707064210349

Итого правила, перечисленные в `alfa_rules`, позволяют отсеять около 49% false positive (доля 0.487).