In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import sparse 
import scipy as sp
import time
from scipy.sparse import hstack
import time
import os
# Map every file name found under the Kaggle input directory to its full path
# so later cells can look files up by name. Files that share a name in
# different folders overwrite each other (last one walked wins).
data_paths = {}
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        data_paths[name] = full_path
        print(full_path)

/kaggle/input/drug-switch-preprocessing/__results__.html
/kaggle/input/drug-switch-preprocessing/custom.css
/kaggle/input/drug-switch-preprocessing/train.parquet
/kaggle/input/drug-switch-preprocessing/__notebook__.ipynb
/kaggle/input/drug-switch-preprocessing/train_labels.parquet
/kaggle/input/drug-switch-preprocessing/test.parquet
/kaggle/input/drug-switch-preprocessing/__output__.json
/kaggle/input/drug-switch-classification/DS_ML_Recruitment_V2.0/_DS_Store
/kaggle/input/drug-switch-classification/DS_ML_Recruitment_V2.0/fitness_values_2.csv
/kaggle/input/drug-switch-classification/DS_ML_Recruitment_V2.0/train_data.csv
/kaggle/input/drug-switch-classification/DS_ML_Recruitment_V2.0/test_data.csv
/kaggle/input/drug-switch-classification/DS_ML_Recruitment_V2.0/train_labels.csv
/kaggle/input/drug-switch-classification/DS_ML_Recruitment_V2.0/Sample Submission.csv


### Utils:

In [2]:
def get_sparse_matrix(data):
    """
    Convert `data` (anything scipy's csr_matrix constructor accepts) into a
    CSR sparse matrix and return it.
    """
    csr = sparse.csr_matrix(data)
    return csr

def sparse_vars(a, axis=None):
    """
    Variance of sparse matrix `a` along `axis` (over all entries when
    `axis` is None), using the identity

        var = mean(a**2) - mean(a)**2

    Only the stored (non-zero) values need squaring; implicit zeros stay
    zero. The input matrix is not modified.
    """
    squared = a.copy()
    # square the stored values of the copy in place
    np.square(squared.data, out=squared.data)
    mean_of_squares = squared.mean(axis)
    square_of_mean = np.square(a.mean(axis))
    return mean_of_squares - square_of_mean

def sparse_stds(a, axis=None):
    """
    Standard deviation of sparse matrix `a` along `axis` (over all entries
    when `axis` is None).

        std = sqrt(var(a))

    The variance comes from sparse_vars, which evaluates
    mean(a**2) - mean(a)**2. That subtraction can round to a tiny negative
    number for (near-)constant data, and sqrt of a negative value is NaN.
    The variance is therefore clamped at zero before taking the root.
    """
    variance = sparse_vars(a, axis)
    # variance is non-negative by definition; anything below 0 is rounding noise
    return np.sqrt(np.maximum(variance, 0))

# to get feature stats
def create_fitness_stats(df, cols, pos_idx, neg_idx, nans = True):
    """
    Per-feature mean and standard deviation, computed separately for the
    rows listed in `neg_idx` (suffix `_0`) and in `pos_idx` (suffix `_1`).

    Parameters
    ----------
    df : 2-D matrix indexable as df[rows, :]
        With nans=True it must support .astype(float) and NaNs are ignored
        (np.nanmean / np.nanstd). With nans=False the matrix's own .mean()
        and sparse_stds are used instead.
    cols : sequence of feature names, one per column of `df`.
    pos_idx, neg_idx : row positions of the two groups.
    nans : selects the NaN-aware path (True) or the sparse path (False).

    Returns
    -------
    DataFrame with columns feature_name, avg_0, avg_1, sd_0, sd_1.
    """
    neg_rows = df[neg_idx, :]
    pos_rows = df[pos_idx, :]

    if nans:
        neg_rows = neg_rows.astype(float)
        pos_rows = pos_rows.astype(float)
        stats = {
            'avg_0': np.nanmean(neg_rows, axis = 0),
            'avg_1': np.nanmean(pos_rows, axis = 0),
            'sd_0': np.nanstd(neg_rows, axis = 0),
            'sd_1': np.nanstd(pos_rows, axis = 0),
        }
    else:
        stats = {
            'avg_0': np.ravel(neg_rows.mean(axis = 0)),
            'avg_1': np.ravel(pos_rows.mean(axis = 0)),
            'sd_0': np.ravel(sparse_stds(neg_rows, axis = 0)),
            'sd_1': np.ravel(sparse_stds(pos_rows, axis = 0)),
        }

    stat_df = pd.DataFrame(data = cols, columns=['feature_name'])
    for stat_name, values in stats.items():
        stat_df[stat_name] = values
    return stat_df

### recency feature

In [3]:
def create_recency_feature(df):
    """
    Build the recency features: for every id, the smallest `event_time`
    observed for each distinct value of `event_name`, `specialty` and
    `plan_type`.

    Parameters
    ----------
    df : DataFrame with columns id, event_time, event_name, specialty,
        plan_type. It is only read, never modified.

    Returns
    -------
    res_idx : np.ndarray of ids, one per output row.
    res_col : np.ndarray of feature names 'recency__<column>__<value>'.
    res_data : dense 2-D array (rows = ids, columns = features); NaN where
        an id never had that value.
    """
    start = time.time()
    cdfs = []
    for col in ['event_name', 'specialty', 'plan_type']:
        # Use the "min" alias: passing np.min to groupby.agg is deprecated
        # in recent pandas releases and yields the same result.
        cat_df = df.groupby(["id", col]).agg({"event_time": "min"}).unstack(level=col)
        cat_df.columns = ['__'.join(['recency', col, name]) for name in cat_df.columns.droplevel()]
        cdfs.append(cat_df)
    res_df = pd.concat(cdfs, axis = 1)
    end = time.time()
    print('time taken (in secs) for recency features creation:', end-start)

    return np.array(res_df.index), np.array(res_df.columns), res_df.values

### frequency feature

In [4]:
def create_frequency_feature(temp_df):
    """
    Build the frequency features: for every cut-off in 1080, 1050, ..., 30
    count, per id, the events with `event_time <= cut-off` for each distinct
    value of `event_name`, `specialty` and `plan_type`.

    The caller's frame is NOT modified. The cut-off loop blanks out
    `event_time` values as it goes, so it runs on a private copy; without
    that, every `event_time > 30` in the caller's data would be replaced by
    NaN once this function returns.

    Parameters
    ----------
    temp_df : DataFrame with id, event_time, event_name, specialty,
        plan_type.

    Returns
    -------
    res_idx : np.ndarray of ids, one per output row.
    res_col : np.ndarray of names 'frequency__<column>__<value>__<cut-off>'.
    res_data : sparse matrix of counts (missing combinations are 0).
    """
    start = time.time()

    work_df = temp_df.copy()
    # float dtype so that masked entries can hold NaN without a dtype change
    work_df['event_time'] = work_df['event_time'].astype(float)

    cat_dfs = []
    # Cut-offs are descending, so masking cumulatively on the working copy is
    # equivalent to masking the original data afresh for each cut-off.
    for num in np.arange(1080,0,-30):
        work_df.loc[work_df['event_time'] > int(num), 'event_time'] = np.nan
        for col in ['event_name', 'specialty', 'plan_type']:
            # 'count' ignores NaN, i.e. counts only events still inside the window
            cat_df = work_df.groupby(["id", col]).agg({"event_time": 'count'}).unstack(level=col)
            cat_df.columns = ['__'.join(['frequency', col, name, str(int(num))]) for name in cat_df.columns.droplevel()]
            cat_dfs.append(cat_df)
    res_df = pd.concat(cat_dfs, axis = 1)
    res_df = res_df.fillna(0)
    end = time.time()
    print('time taken (in secs) for frequency feature creation:', end-start)

    res_idx, res_col = np.array(res_df.index), np.array(res_df.columns)
    res_data = get_sparse_matrix(res_df.values)

    return res_idx, res_col, res_data

### NormChange

In [5]:
def get_post_df(temp_post_df):
    """
    Build the "post" rate matrix used by the norm-change comparison.

    For every cut-off `num` in 540, 510, ..., 30 only events with
    `event_time <= num` are kept. They are counted per id for each distinct
    value of `event_name`, `specialty` and `plan_type`, and the count is
    divided by `num`.

    The caller's frame is NOT modified: masking is done on a private copy.

    Returns
    -------
    DataFrame indexed by id with one column per
    'normChange__<column>__<value>__<num>'; missing combinations are 0.
    """
    work_df = temp_post_df.copy()
    # float dtype so that masked entries can hold NaN without a dtype change
    work_df['event_time'] = work_df['event_time'].astype(float)

    cat_dfs = []
    for num in np.arange(1080/2,0,-30):
        # blank out everything later than the cut-off, i.e. keep event_time <= num
        work_df.loc[work_df['event_time'] > int(num), 'event_time'] = np.nan
        for col in ['event_name', 'specialty', 'plan_type']:
            cat_df = work_df.groupby(["id", col]).agg({"event_time": 'count'}).unstack(level=col)
            # turn the count into a rate over the window length
            cat_df = cat_df/num
            cat_df.columns = ['__'.join(['normChange', col, name, str(int(num))]) for name in cat_df.columns.droplevel()]
            cat_dfs.append(cat_df)
    post_df = pd.concat(cat_dfs, axis = 1)
    return post_df.fillna(0)


def get_pre_df(temp_pre_df):
    """
    Build the "pre" rate matrix used by the norm-change comparison.

    For every cut-off `num` in 30, 60, ..., 540 only events with
    `event_time > num` are kept. They are counted per id for each distinct
    value of `event_name`, `specialty` and `plan_type`, and the count is
    divided by `max(event_time) - num`, where the maximum is taken over the
    whole input before any masking.

    The caller's frame is NOT modified: masking is done on a private copy.

    Returns
    -------
    DataFrame indexed by id with one column per
    'normChange__<column>__<value>__<num>'; missing combinations are 0.
    """
    work_df = temp_pre_df.copy()
    # float dtype so that masked entries can hold NaN without a dtype change
    work_df['event_time'] = work_df['event_time'].astype(float)

    event_time_max = work_df['event_time'].max()
    cat_dfs = []
    for num in np.arange(0,(1080/2)+1,30)[1:]:
        # blank out everything up to the cut-off, i.e. keep event_time > num
        work_df.loc[work_df['event_time'] <= int(num), 'event_time'] = np.nan
        for col in ['event_name', 'specialty', 'plan_type']:
            cat_df = work_df.groupby(["id", col]).agg({"event_time": 'count'}).unstack(level=col)
            # NOTE: the denominator is zero/negative when event_time_max <= num;
            # 0/0 results become NaN and are replaced by 0 in the fillna below.
            cat_df = cat_df/(event_time_max-num)
            cat_df.columns = ['__'.join(['normChange', col, name, str(int(num))]) for name in cat_df.columns.droplevel()]
            cat_dfs.append(cat_df)
    pre_df = pd.concat(cat_dfs, axis = 1)
    return pre_df.fillna(0)


def create_norm_feature(temp_df):
    """
    Build the norm-change features: a 0/1 flag per id and per
    (column, value, cut-off) that is 1 when the "post" rate from
    get_post_df is strictly greater than the "pre" rate from get_pre_df.

    Each helper receives its own copy of the input. Both of them blank out
    `event_time` values while they work; sharing one frame would make
    get_pre_df run on data that get_post_df had already masked (leaving the
    pre window empty) and would also corrupt the caller's frame.

    Returns
    -------
    res_idx : np.ndarray of ids, one per output row.
    res_col : np.ndarray of names 'normChange__<column>__<value>__<cut-off>'.
    res_data : sparse 0/1 matrix.
    """
    start = time.time()

    post_df = get_post_df(temp_df.copy())
    pre_df = get_pre_df(temp_df.copy())

    res_col = np.array(pre_df.columns)
    # put the post columns in the same order as the pre columns before comparing
    post_df = post_df[res_col]
    r = np.where(post_df > pre_df, 1, 0)

    res_idx = np.array(post_df.index)
    res_data = get_sparse_matrix(r)

    end = time.time()
    print('time taken (in secs) for norm change feature creation:', end-start)

    return res_idx, res_col, res_data


### transform & save train/test data

In [6]:
def transform_data(data_df, target_df = None):
    """
    Turn the event table into the final feature matrix.

    Recency, frequency and norm-change features are built separately and
    stacked side by side (in that order).

    Parameters
    ----------
    data_df : DataFrame with id, event_time, event_name, specialty,
        plan_type.
    target_df : optional DataFrame with `id` and `outcome_flag`. When given,
        per-feature class statistics are returned as well.
        NOTE(review): the index labels of `target_df` are used as row
        positions in the feature matrix, so it presumably must carry a
        default 0..n-1 index in the same order as the feature rows - confirm.

    Returns
    -------
    (feat_df, feat_names) or, when `target_df` is a DataFrame,
    (feat_df, feat_names, fit_df), where feat_df is a sparse matrix,
    feat_names the list of column names and fit_df the table produced by
    create_fitness_stats for all three feature groups (index 0..n-1).

    Raises
    ------
    ValueError if the three feature groups do not list the same ids in the
    same order.
    """
    rec_idx, rec_col, rec_data = create_recency_feature(data_df)
    freq_idx, freq_col, freq_data = create_frequency_feature(data_df)
    norm_idx, norm_col, norm_data = create_norm_feature(data_df)

    # hstack pairs rows purely by position, so the three groups must agree on
    # which id sits in which row; otherwise features of different ids get mixed.
    if not (np.array_equal(rec_idx, freq_idx) and np.array_equal(rec_idx, norm_idx)):
        raise ValueError('recency, frequency and norm change features are not aligned on the same ids')

    # hstack concatenates the dense recency block with the two sparse blocks
    feat_df = hstack((rec_data, freq_data, norm_data))
    print('Final feature matrix shape:', feat_df.shape)

    # merge all the feature names
    feat_names = list(rec_col) + list(freq_col) + list(norm_col)

    if isinstance(target_df, pd.DataFrame):
        # get +ve & -ve indices
        one_idx = target_df[target_df['outcome_flag'] == 1]['id'].index.tolist()
        zero_idx = target_df[target_df['outcome_flag'] == 0]['id'].index.tolist()

        # calculate fitness values of features
        rcdf = create_fitness_stats(rec_data, rec_col, one_idx, zero_idx, nans = True)
        fqdf = create_fitness_stats(freq_data, freq_col, one_idx, zero_idx, nans = False)
        nrdf = create_fitness_stats(norm_data, norm_col, one_idx, zero_idx, nans=False)
        # pd.concat replaces DataFrame.append (removed in pandas 2.0);
        # ignore_index gives the fresh 0..n-1 index that the original
        # un-assigned reset_index call was meant to produce.
        fit_df = pd.concat([rcdf, fqdf, nrdf], ignore_index=True)
        return feat_df, feat_names, fit_df

    return feat_df, feat_names

In [7]:
def save_data(data, path):
    """
    Write the sparse matrix `data` to `path` in scipy's .npz format.
    Returns whatever sparse.save_npz returns.
    """
    return sparse.save_npz(file=path, matrix=data)

def load_data(path):
    """
    Read a sparse matrix previously stored in scipy's .npz format at `path`.
    """
    matrix = sparse.load_npz(file=path)
    return matrix

### train data

In [8]:
# Load the training table and its label table from the parquet files indexed
# in data_paths; shapes and column names are printed as a quick sanity check.
train_df = pd.read_parquet(data_paths['train.parquet'])
print('train data:', train_df.shape, train_df.columns)

target_df = pd.read_parquet(data_paths['train_labels.parquet'])
print('train labels', target_df.shape, target_df.columns)

train data: (14446880, 7) Index(['patient_id', 'event_name', 'event_time', 'specialty', 'plan_type',
       'patient_payment', 'id'],
      dtype='object')
train labels (16683, 3) Index(['patient_id', 'outcome_flag', 'id'], dtype='object')


In [9]:
# Build the training features. Passing target_df makes transform_data also
# return the per-feature class statistics (train_fit_df).
train_feat, train_feat_names, train_fit_df = transform_data(train_df, target_df)
# cell output: matrix shape next to the number of feature names
train_feat.shape, len(train_feat_names)

time taken (in secs) for recency features creation: 7.36030387878418
time taken (in secs) for frequency feature creation: 267.0937509536743
time taken (in secs) for norm change feature creation: 249.55682754516602
Final feature matrix shape: (16683, 41525)


  keepdims=keepdims)


((16683, 41525), 41525)

In [10]:
# Persist the sparse training features, then drop the raw frames
# (presumably to release memory before the test data is loaded).
save_data(train_feat, 'train_features.npz')
del train_df, target_df

### test data

In [11]:
# Load the test table from the parquet file indexed in data_paths and print
# its shape and column names.
test_df = pd.read_parquet(data_paths['test.parquet'])
print(test_df.shape, test_df.columns)

(6256130, 7) Index(['patient_id', 'event_name', 'event_time', 'specialty', 'plan_type',
       'patient_payment', 'id'],
      dtype='object')


In [12]:
# Build the test features; no labels are passed, so only the matrix and the
# feature names come back.
test_feat, test_feat_names = transform_data(test_df)
# cell output: matrix shape next to the number of feature names
test_feat.shape, len(test_feat_names)

time taken (in secs) for recency features creation: 2.2142653465270996
time taken (in secs) for frequency feature creation: 104.43709707260132
time taken (in secs) for norm change feature creation: 114.7191002368927
Final feature matrix shape: (7148, 41525)


((7148, 41525), 41525)

In [13]:
# Persist the sparse test features and drop the raw test frame.
save_data(test_feat, 'test_features.npz')
del test_df

In [14]:
# The saved matrices carry no column labels, so train and test must have
# produced exactly the same feature names in exactly the same order.
assert(train_feat_names == test_feat_names)

In [15]:
# Write the feature names of both matrices to CSV (single column 'feature',
# no index column).
pd.DataFrame(train_feat_names, columns = ['feature']).to_csv('train_feature_names.csv', index = False)
pd.DataFrame(test_feat_names, columns = ['feature']).to_csv('test_feature_names.csv', index = False)

### fitness calculation

In [16]:
def fitness_calculation(data):
    """
    Fitness value of one feature from its class statistics.

    `data` is a mapping (e.g. a DataFrame row) with keys sd_0, sd_1,
    avg_0 and avg_1.

    Rules:
      * both sds are 0: 9999999999 if exactly one of the averages is 0,
        otherwise 1
      * exactly one sd is 0: 1
      * neither sd is 0:
          - avg_0 != 0: (avg_1 / sd_1) / (avg_0 / sd_0)
          - avg_0 == 0 and avg_1 != 0: 9999999999
          - both averages 0: 1

    A NaN statistic counts as "not zero" in every test above.
    """
    sd0_is_zero = data['sd_0'] == 0
    sd1_is_zero = data['sd_1'] == 0
    avg0_is_zero = data['avg_0'] == 0
    avg1_is_zero = data['avg_1'] == 0

    if sd0_is_zero and sd1_is_zero:
        # constant in both classes: decisive only if one class is all zero
        if bool(avg0_is_zero) != bool(avg1_is_zero):
            return 9999999999
        return 1

    if sd0_is_zero or sd1_is_zero:
        # exactly one class is constant
        return 1

    # from here on neither sd is zero
    if not avg0_is_zero:
        return ((data['avg_1']/data['sd_1'])/(data['avg_0']/data['sd_0']))
    if not avg1_is_zero:
        return 9999999999
    return 1

In [17]:
# train_fitness_val
# Score every feature (one row of train_fit_df each) with fitness_calculation
# and write the resulting table to CSV.
train_fit_df['fitness_value'] = train_fit_df.apply(fitness_calculation, axis = 1)
train_fit_df.to_csv('train_fitness_values.csv', index = None)