In [24]:
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [25]:
def check_score(model, x_train, x_test, y_train, y_test):
    """Collect training-fit and held-out metrics for a fitted binary classifier.

    Parameters
    ----------
    model : object or None
        Fitted estimator exposing ``predict(X)`` and ``score(X, y)``.
    x_train, y_train :
        Training features / labels; only used for ``model.score``.
    x_test, y_test :
        Held-out features / labels; used for every other metric.

    Returns
    -------
    bool or dict
        ``False`` when ``model`` is ``None``. Otherwise a dict with the keys
        ``model_score``, ``accuracy_score``, ``f1_score``, ``precision_score``
        and ``recall_score``.
    """
    # Identity check on purpose: `model == None` would call the model's own
    # __eq__, which may answer True (or a non-boolean) for a real object.
    if model is None:
        return False

    y_predict = model.predict(x_test)
    return {
        # NOTE: scored on the TRAINING split, so this measures fit, not
        # generalisation; compare it with the test-split metrics below.
        "model_score": model.score(x_train, y_train),
        "accuracy_score": accuracy_score(y_test, y_predict),
        # average='binary' scores the positive class only.
        "f1_score": f1_score(y_test, y_predict, average='binary'),
        "precision_score": precision_score(y_test, y_predict),
        "recall_score": recall_score(y_test, y_predict),
    }

In [26]:
# Candidate feature names, grouped by theme.
# NOTE(review): "chosen", "AUM", "local_or_foreign" and "count_currency" are
# listed here but never generated by the synthetic-data cell, and this list is
# not referenced by any of the visible cells.
features_name = [
    # balances and trading activity
    "local_total", "foreign_total", "monthly_trade_vol", "stock_inventory_val",
    # customer profile
    "invest_type", "age", "KPI", "region",
    # fund / holdings descriptors
    "fund_type", "AUM", "local_or_foreign", "count_currency",
    # product flags and gains
    "chosen", "guaranteed", "realized_gain_loss", "unrealized_gain_loss",
    # deposits and remaining values
    "local_demand_deposit", "local_fixed_deposit", "deduction_num",
    "value", "value_diff",
]

In [27]:
# Number of synthetic rows to generate.
datanum = 100000

# Seed for the synthetic data so that Restart & Run All reproduces the same rows.
DATA_SEED = 42

# Inclusive (low, high) integer bounds for every generated column.
FEATURE_RANGES = {
    "local_total": (100, 10000000000),
    "foreign_total": (50, 20000000000),
    "monthly_trade_vol": (0, 500000000),
    "stock_inventory_val": (0, 20000000),
    "invest_type": (0, 6000000),
    "age": (0, 100),
    "KPI": (1, 4500),
    "region": (1, 248),
    "fund_type": (1, 8),
    "guaranteed": (0, 1),
    "realized_gain_loss": (0, 15),
    "unrealized_gain_loss": (0, 20000),
    "local_demand_deposit": (0, 20000),
    "local_fixed_deposit": (0, 20000),
    "deduction_num": (0, 20000),
    "value": (0, 700),
    "value_diff": (0, 755),
}


def generate_synthetic_features(n_rows, seed=DATA_SEED, ranges=None):
    """Draw uniformly distributed integer columns for the synthetic dataset.

    Parameters
    ----------
    n_rows : int
        Number of values to draw per column.
    seed : int, optional
        Seed for the NumPy generator; the same seed yields the same data.
    ranges : dict[str, tuple[int, int]], optional
        Column name -> inclusive (low, high) bounds. Defaults to
        ``FEATURE_RANGES``.

    Returns
    -------
    dict[str, list[int]]
        One list of ``n_rows`` Python ints per column, in ``ranges`` order.
    """
    if ranges is None:
        ranges = FEATURE_RANGES
    rng = np.random.default_rng(seed)
    return {
        # endpoint=True makes `high` inclusive, matching random.randint semantics.
        name: rng.integers(low, high, size=n_rows, endpoint=True).tolist()
        for name, (low, high) in ranges.items()
    }


df_dict = generate_synthetic_features(datanum)

In [28]:
# Assemble the generated columns into a single frame (one column per dict key).
df = pd.DataFrame(data=df_dict)

In [29]:
# Build the binary target `rule_value` from a hand-written rule over the features.

# One scalar offset shared by every row. It is drawn from a private, seeded
# generator so the labels are identical on every run and the global `random`
# state is left untouched.
noise_offset = random.Random(42).random()

# `**` is right-associative, so the original `a ** 1.5 ** b` already meant
# a ** (1.5 ** b); the parentheses only make that explicit.
guaranteed_term = df.guaranteed ** (1.5 ** df.value_diff)

# NOTE: rows with value == 0 divide by zero; the resulting non-finite score
# fails the upper-bound test below and is labelled 0.
rule_score = (
    (df.local_total / df.value)
    + (df.local_demand_deposit ** 2)
    - guaranteed_term
) / df.foreign_total + noise_offset

# Parenthesise the whole mask before casting: previously `.astype(int)` bound
# only to the right-hand comparison, not to the combined condition.
df['rule_value'] = ((rule_score > 0.097) & (rule_score < 1.2)).astype(int)

In [30]:
# Separate the target column from the feature matrix.
target_col = 'rule_value'
y = df[target_col]
x = df.drop(target_col, axis=1)

# Hold out 20% of the rows for evaluation; the split is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2
)

# 200 depth-3 boosting stages; verbose=1 prints the training-loss table below.
clf = GradientBoostingClassifier(
    verbose=1,
    random_state=42,
    max_depth=3,
    n_estimators=200,
).fit(x_train, y_train)
check_score(clf, x_train, x_test, y_train, y_test)

      Iter       Train Loss   Remaining Time 
         1           0.0917           46.44s
         2           0.0826           45.26s
         3           0.0759           45.01s
         4           0.0694           45.04s
         5           0.0650           44.60s
         6           0.0607           44.30s
         7           0.0562           43.96s
         8           0.0526           43.57s
         9           0.0496           43.28s
        10           0.0470           43.02s
        20           0.0305           40.47s
        30           0.0228           38.04s
        40           0.0187           35.83s
        50           0.0161           33.57s
        60           0.0145           31.31s
        70           0.0131           29.05s
        80           0.0120           26.82s
        90           0.0113           24.58s
       100           0.0105           22.33s
       200           0.0062            0.00s


{'model_score': 0.9995625,
 'accuracy_score': 0.99815,
 'f1_score': 0.999056483488461,
 'precision_score': 0.9988272486232919,
 'recall_score': 0.9992858235984288}

In [31]:
# Shallow single-tree baseline to compare against the boosted model.
clf_tree = DecisionTreeClassifier(
    max_features=14,
    max_depth=3,
    min_samples_split=10,
    min_samples_leaf=10,
    # max_features is below the number of columns, so the tree considers a
    # random subset of features at each split; seed it (same seed as `clf`)
    # so the reported scores are reproducible.
    random_state=42,
)
clf_tree.fit(x_train, y_train)
check_score(clf_tree, x_train, x_test, y_train, y_test)

{'model_score': 0.9919125,
 'accuracy_score': 0.9912,
 'f1_score': 0.9955179790159926,
 'precision_score': 0.9939486397152301,
 'recall_score': 0.997092281793603}

In [None]:
# NOTE(review): this cell repeats the previous DecisionTreeClassifier cell
# verbatim and has not been executed; it is a candidate for deletion.
clf_tree = DecisionTreeClassifier(
    max_features=14,
    max_depth=3,
    min_samples_split=10,
    min_samples_leaf=10,
    # Seeded because max_features < number of columns makes split selection
    # random; without it the scores change between runs.
    random_state=42,
)
clf_tree.fit(x_train, y_train)
check_score(clf_tree, x_train, x_test, y_train, y_test)