In [1]:
import os
import pickle

import azureml.core
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from azureml.core import Dataset
from azureml.core import Experiment, Workspace
from azureml.core.authentication import InteractiveLoginAuthentication
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from tqdm.notebook import tqdm


# Report which Azure ML SDK version this kernel is running
print("You are currently using version", azureml.core.VERSION, "of the Azure ML SDK")
print()

# Authenticate interactively against the tenant, then obtain the workspace
# through Workspace.from_config using that authentication object
interactive_auth = InteractiveLoginAuthentication(
    tenant_id="76f90eb1-fb9a-4446-9875-4d323d6455ad"
)
ws = Workspace.from_config(auth=interactive_auth)
print('Workspace name: ' + ws.name)

You are currently using version 1.5.0 of the Azure ML SDK

Workspace name: team06


In [247]:
# Look up the 'train_ds' dataset (latest version) in the workspace and
# materialise it as a pandas DataFrame
aml_dataset = Dataset.get_by_name(workspace=ws, name='train_ds', version='latest')
df = aml_dataset.to_pandas_dataframe()

In [312]:
def create_dataset(df, train_cluster, kmeans_path='kmeans.pickle'):
    """Engineer the model features on a copy of ``df`` and return the new frame.

    Steps performed:

    1. Encode ``gender`` as 0/1.
    2. When ``train_cluster`` is true, derive ``group_cat`` from ``group`` and a
       binary ``target`` that is 1 for responders in the test group and for
       non-responders in the control group.
    3. Add an ``uplift`` column initialised to 0.
    4. For each listed column-name prefix, add ``sum_<prefix>`` and
       ``max_<prefix>``: the row-wise sum / max over every column whose name
       starts with that prefix.
    5. Assign a KMeans cluster id (``kmeans``) from ten scaled columns and add
       ``gp_diff_<col>``: the column value minus the per-cluster mean that was
       computed when the clustering was fitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; all work happens on a copy, so the
        function can safely be called again on the same frame.
    train_cluster : bool
        True  -> fit KMeans on ``df`` and save the fitted state to ``kmeans_path``.
        False -> load the state from ``kmeans_path`` and only apply it.
    kmeans_path : str, optional
        File used to persist / restore the clustering state
        (scaling maxima, fill means, fitted KMeans, per-cluster means).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns added.
    """
    # Work on a copy: mapping 'gender' in place would turn the column into NaN
    # on a second call with the same frame, because 0/1 are not keys of the map.
    df = df.copy()

    df['gender'] = df['gender'].map({'Ж':0, 'М': 1})
    if train_cluster:
        df['group_cat'] = df['group'].map({'test':1, 'control':0})
        df['target'] = 0
        df.loc[(df['group_cat'] == 1) & (df['response_att'] == 1), 'target'] = 1
        df.loc[(df['group_cat'] == 0) & (df['response_att'] == 0), 'target'] = 1

    df['uplift'] = 0

    aggregate_prefixes = [
        'cheque_count_12m', 'cheque_count_3m', 'cheque_count_6m',
        'k_var_disc_share_15d', 'k_var_disc_share_1m', 'k_var_disc_share_3m', 'k_var_disc_share_6m',
        'k_var_sku_price_15d', 'k_var_sku_price_1m', 'k_var_sku_price_3m', 'k_var_sku_price_6m',
        'k_var_count_per_cheque_15d', 'k_var_count_per_cheque_1m',
        'k_var_count_per_cheque_3m', 'k_var_count_per_cheque_6m',
        'sale_count_12m', 'sale_count_3m', 'sale_count_6m',
        'sale_sum_12m', 'sale_sum_3m', 'sale_sum_6m',
    ]
    for name_col in aggregate_prefixes:
        need_col = [col for col in df.columns if col.startswith(name_col)]
        df['sum_' + name_col] = df.loc[:, need_col].sum(axis = 1)
        df['max_' + name_col] = df.loc[:, need_col].max(axis = 1)

    kmeans_columns = ['response_viber', 'k_var_days_between_visits_15d', 'max_cheque_count_6m', 'response_sms',
                      'perdelta_days_between_visits_15_30d', 'max_cheque_count_12m', 'k_var_days_between_visits_1m',
                      'stdev_days_between_visits_15d', 'sum_k_var_sku_price_6m', 'k_var_days_between_visits_3m']

    if train_cluster:
        km = KMeans(n_clusters = 5, random_state = 1)
        max_kmeans_columns = df[kmeans_columns].abs().max().values
        # A column that is entirely zero would otherwise be divided by 0 and
        # feed NaN into KMeans; scale such columns by 1 instead.
        max_kmeans_columns = np.where(max_kmeans_columns == 0, 1.0, max_kmeans_columns)
        mean_kmeans_columns = df[kmeans_columns].mean()
        df['kmeans'] = km.fit_predict(df[kmeans_columns].fillna(mean_kmeans_columns) / max_kmeans_columns)

        # Per-cluster mean of every clustering column (computed on unfilled values)
        groupby_dict_mean = {col: df.groupby(['kmeans'])[col].mean() for col in kmeans_columns}

        with open(kmeans_path, 'wb') as f:
            pickle.dump([max_kmeans_columns, mean_kmeans_columns, km, groupby_dict_mean], f)
    else:
        # NOTE(security): pickle.load can execute arbitrary code. Only load a
        # state file that this function itself wrote.
        with open(kmeans_path, 'rb') as f:
            max_kmeans_columns, mean_kmeans_columns, km, groupby_dict_mean = pickle.load(f)
        df['kmeans'] = km.predict(df[kmeans_columns].fillna(mean_kmeans_columns) / max_kmeans_columns)

    # Deviation of each row from the mean of its cluster
    for col in kmeans_columns:
        df['gp_diff_' + col] = df[col] - df['kmeans'].map(groupby_dict_mean[col])

    return df

In [249]:
# Build the training features; train mode also fits the clustering and writes its state to disk
df = create_dataset(df, train_cluster=True)

In [361]:
# Look up the 'test_ds' dataset (latest version) and materialise it as a pandas DataFrame
aml_dataset_test = Dataset.get_by_name(workspace=ws, name='test_ds', version='latest')
df_test = aml_dataset_test.to_pandas_dataframe()

In [363]:
# Build the test features; inference mode reloads the clustering state saved by the training call
df_test = create_dataset(df_test, train_cluster=False)

In [258]:
# Identifier, label and output columns that must not be used as model inputs
drop_cols = ['CardHolder', 'target', 'group', 'response_att', 'predict', 'group_cat', 'uplift']
train_cols = df.columns[~df.columns.isin(drop_cols)].tolist()

# Hold out 200000 rows, keeping the test/control proportion equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['target'],
    test_size=200000,
    stratify=df['group_cat'],
    random_state=322,
)

In [255]:
def custom_metric(answers, take_top_ratio=0.25):
    """Uplift, in percentage points, among the highest-scored rows.

    Rows are ranked by ``uplift`` in descending order and the top
    ``take_top_ratio`` share (rounded up) is kept. Within that slice the
    response rate (sum of ``response_att`` divided by the row count) is
    computed for ``group == 'test'`` and for ``group == 'control'``; the
    difference test - control, multiplied by 100, is returned.

    ``answers`` itself is not modified.
    """
    top_n = int(np.ceil(answers.shape[0] * take_top_ratio))
    ranked = answers.sort_values(by='uplift', ascending=False)
    top_slice = ranked.iloc[:top_n, :]

    def response_rate(group_name):
        # Share of responders among the rows of one experiment group
        members = top_slice[top_slice['group'] == group_name]
        return members['response_att'].sum() / members.shape[0]

    return (response_rate('test') - response_rate('control')) * 100

In [597]:
class WinModel():
    """Ensemble of LightGBM boosters that differ only in their random seed.

    ``fit_lgb`` trains ``n_seeds`` boosters on the same data with
    ``random_state`` set to 0 .. n_seeds-1; ``predict`` returns the SUM
    (not the mean) of the individual booster predictions.
    """

    def __init__(self, params, n_seeds, n_folds, train_cols):
        """Store the configuration.

        params     -- LightGBM parameter dict; never modified by this class.
        n_seeds    -- number of boosters to train, one per seed.
        n_folds    -- stored only; no method of this class reads it.
        train_cols -- names of the feature columns selected from input frames.
        """
        self.params = params
        self.n_seeds = n_seeds
        self.n_folds = n_folds
        self.train_cols = train_cols
        self.bst_list = None
        self.val_score_list = []

    def fit_lgb(self, X, y, num_boost_round=120):
        """Train one booster per seed and store them in ``self.bst_list``.

        X               -- frame containing at least ``self.train_cols``.
        y               -- labels aligned with the rows of ``X``.
        num_boost_round -- boosting rounds per booster (default 120).
        """
        # The arrays are identical for every seed, so convert them once
        features = np.array(X[self.train_cols])
        labels = np.array(y)

        bst_list = []
        self.val_score_list = []
        for seed in tqdm(range(self.n_seeds)):
            # Per-seed copy: writing 'random_state' into self.params would leak
            # into the dict object owned by the caller
            seed_params = {**self.params, 'random_state': seed}

            tr = lgb.Dataset(features, labels)
            bst = lgb.train(seed_params, tr, num_boost_round = num_boost_round)
            bst_list.append(bst)

        self.bst_list = bst_list

    def predict(self, df):
        """Return the element-wise sum of all boosters' predictions for ``df``.

        Raises RuntimeError when called before ``fit_lgb``.
        """
        if self.bst_list is None:
            raise RuntimeError("WinModel.predict called before fit_lgb")
        features = np.array(df[self.train_cols])
        pred = [bst.predict(features) for bst in self.bst_list]
        return np.sum(pred, axis = 0)

In [257]:
# Local path of the serialized model; the same string is reused as the artifact name when the file is uploaded to the run
model_file_name = 'outputs/model.pkl'

In [681]:
# Explicit feature list for the final models. This rebinds `train_cols`, replacing the
# list derived from df.columns in the earlier hold-out split cell.
train_cols = ['kmeans', 'gp_diff_max_cheque_count_6m', 'response_sms', 'gp_diff_max_cheque_count_12m', 
              'k_var_days_between_visits_1m', 'max_cheque_count_6m', 'k_var_days_between_visits_15d', 
              'perdelta_days_between_visits_15_30d', 'months_from_register', 'gp_diff_k_var_days_between_visits_3m', 
              'gp_diff_stdev_days_between_visits_15d', 'gp_diff_response_sms', 'gp_diff_k_var_days_between_visits_15d', 
              'max_cheque_count_12m', 'response_viber', 'gp_diff_k_var_days_between_visits_1m', 
              'gp_diff_perdelta_days_between_visits_15_30d', 'k_var_cheque_15d', 'stdev_days_between_visits_15d', 
              'gp_diff_response_viber', 'k_var_cheque_3m', 'k_var_days_between_visits_3m', 'k_var_cheque_category_width_15d', 
              'k_var_disc_per_cheque_15d', 'max_k_var_count_per_cheque_1m', 'food_share_15d', 'promo_share_15d', 
              'stdev_discount_depth_15d', 'max_k_var_sku_price_3m', 'age', 'max_cheque_count_3m', 'food_share_1m', 
              'max_k_var_disc_share_3m', 'k_var_sku_per_cheque_15d', 'sum_cheque_count_6m', 'gp_diff_sum_k_var_sku_price_6m', 
              'mean_discount_depth_15d', 'sum_k_var_disc_share_15d', 'max_k_var_count_per_cheque_3m', 
              'sum_k_var_disc_share_6m', 'max_k_var_disc_share_15d', 'max_k_var_sku_price_1m', 'cheque_count_12m_g48', 
              'k_var_cheque_group_width_15d', 'disc_sum_6m_g34', 'sale_sum_3m_g33', 'sum_k_var_sku_price_6m', 
              'k_var_sku_price_6m_g49', 'sum_sale_count_12m', 'main_format', 'sum_sale_sum_12m', 'sum_k_var_sku_price_3m',
              'k_var_discount_depth_15d', 'k_var_sku_price_6m_g27', 'k_var_sku_price_6m_g24', 'sale_sum_3m_g26', 
              'cheque_count_3m_g25', 'k_var_disc_share_6m_g54', 'cheque_count_3m_g20', 'k_var_disc_share_3m_g24', 
              'k_var_disc_share_15d_g24', 'max_k_var_disc_share_6m', 'max_k_var_disc_share_1m', 'k_var_sku_price_6m_g48',
              'sum_cheque_count_12m', 'cheque_count_6m_g41', 'k_var_sku_price_3m_g48', 'sale_sum_12m_g27', 
              'k_var_count_per_cheque_1m_g49', 'k_var_disc_share_6m_g27']


In [598]:
# Azure ML experiment in workspace `ws` under which the training run is started
experiment = Experiment(workspace=ws, name="fs70full_df-experiment")

In [683]:
# Four LightGBM configurations: max_depth in {5, 6} x min_data_in_leaf in {2500, 1500}
params_list = [
    {'learning_rate': 0.1, 'max_depth': 5, 'objective': 'binary', 'min_data_in_leaf': 2500},
    {'learning_rate': 0.1, 'max_depth': 5, 'objective': 'binary', 'min_data_in_leaf': 1500},
    {'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary', 'min_data_in_leaf': 2500},
    {'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary', 'min_data_in_leaf': 1500},
]

run = experiment.start_logging()

# Train 10 seeds per configuration on the full training frame and pool
# every booster into one flat list
model_list = []
for params in params_list:
    model = WinModel(params, 10, 2, train_cols)
    model.fit_lgb(df, df['target'])
    model_list += model.bst_list

# Persist the WHOLE pooled ensemble. Dumping `model.bst_list` would store only
# the boosters of the last configuration, while scoring uses `model_list`.
# The target directory is created first so the dump works on a fresh checkout.
os.makedirs(os.path.dirname(model_file_name) or '.', exist_ok=True)
joblib.dump(value=model_list, filename=model_file_name)
run.upload_file(name=model_file_name, path_or_stream=model_file_name)

In [682]:
# Score the test set: start from 0 and add each booster's prediction in turn.
# num_iteration=105 is passed to every predict call.
df_test['uplift'] = 0
for model in model_list:
    df_test['uplift'] = df_test['uplift'] + model.predict(df_test[train_cols], num_iteration=105)


In [619]:
# Write the submission: card holder id and uplift score, ';'-separated, without the index column
df_test.loc[:, ['CardHolder', 'uplift']].to_csv("final.csv", sep=';', index=None)

In [None]:

# Register the uploaded artifact as 'best_model'. `best_run` is not defined anywhere
# in this notebook; the run that received the upload is `run`, and the artifact
# was uploaded under the name held in `model_file_name`.
model = run.register_model(model_name='best_model', model_path=model_file_name)