In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import sparse 
import scipy as sp
import time
import random
import os
# Index every file found under the Kaggle input directory by its bare file name
# and echo its full path. If two directories hold a file with the same name,
# the one visited last wins.
data_paths = {}
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        data_paths[name] = full_path
        print(full_path)


/kaggle/input/drug-switch-classification/DS_ML_Recruitment_V2.0/_DS_Store
/kaggle/input/drug-switch-classification/DS_ML_Recruitment_V2.0/fitness_values_2.csv
/kaggle/input/drug-switch-classification/DS_ML_Recruitment_V2.0/train_data.csv
/kaggle/input/drug-switch-classification/DS_ML_Recruitment_V2.0/test_data.csv
/kaggle/input/drug-switch-classification/DS_ML_Recruitment_V2.0/train_labels.csv
/kaggle/input/drug-switch-classification/DS_ML_Recruitment_V2.0/Sample Submission.csv


In [2]:
def create_num_id(df):
    """
    Add an integer ``id`` column derived from ``patient_id``.

    Each ``patient_id`` string is split on ``'_'`` and its second piece is
    converted to ``int``. The column is written onto ``df`` itself (the
    caller's frame is modified) and that same frame is returned.
    """
    def _numeric_part(patient_id):
        # The numeric part is whatever follows the first underscore.
        return int(patient_id.split('_')[1])

    df['id'] = df['patient_id'].map(_numeric_part)
    return df

def sort_data(df, col_order=["id", 'event_name', 'specialty', 'plan_type']):
    """
    Order the rows of ``df`` by ``col_order`` and renumber the index from zero.

    Both steps are applied in place, so the caller's frame is modified; the
    same frame is returned for convenience. ``col_order`` lists the sort keys
    from most to least significant.
    """
    df.sort_values(by=col_order, inplace=True)
    # Throw away the pre-sort index labels rather than keeping them as a column.
    df.reset_index(drop=True, inplace=True)
    return df

In [3]:
# Read the training labels CSV and report its dimensions and column names.
labels_csv_path = data_paths['train_labels.csv']
target_df = pd.read_csv(labels_csv_path)
print(target_df.shape, target_df.columns)

(16683, 2) Index(['patient_id', 'outcome_flag'], dtype='object')


In [4]:
# Attach the numeric patient id, order the labels by it, and store the result
# as parquet without writing the row index.
target_df = sort_data(create_num_id(target_df), col_order=['id'])
target_df.to_parquet('train_labels.parquet', index=False)

In [5]:
# Read the training event table and report its dimensions and column names.
train_csv_path = data_paths['train_data.csv']
train_df = pd.read_csv(train_csv_path)
print(train_df.shape, train_df.columns)

(14446880, 6) Index(['patient_id', 'event_name', 'event_time', 'specialty', 'plan_type',
       'patient_payment'],
      dtype='object')


In [6]:
# Attach the numeric patient id, then sort with sort_data's default key order
# (id, event_name, specialty, plan_type).
train_df = sort_data(create_num_id(train_df))

In [7]:
# Read the test event table and report its dimensions and column names.
test_csv_path = data_paths['test_data.csv']
test_df = pd.read_csv(test_csv_path)
print(test_df.shape, test_df.columns)

(6256395, 6) Index(['patient_id', 'event_name', 'event_time', 'specialty', 'plan_type',
       'patient_payment'],
      dtype='object')


In [8]:
# Attach the numeric patient id to the test events. The frame is not sorted
# here; sort_data is applied to it later in the notebook.
test_df = create_num_id(df=test_df)

In [9]:
# Categorical columns whose sets of distinct values ("levels") are compared
# between the train and test event tables.
cat_columns = ['event_name', 'specialty', 'plan_type']
train_unique_col_values = {col: train_df[col].unique() for col in cat_columns}
test_unique_col_values = {col: test_df[col].unique() for col in cat_columns}


def _levels_absent_from(candidates, reference):
    """Return the non-null values of ``candidates`` that never occur in ``reference``.

    The order of ``candidates`` is preserved. Null entries (NaN/None) are never
    reported as absent: NaN compares unequal to itself, so a plain ``in`` test
    against an array would flag it as a missing level even when both sides
    contain missing values.
    """
    # A set gives constant-time membership instead of scanning the array per value.
    known = set(pd.Series(reference, dtype=object).dropna())
    return [
        value for value in candidates
        if not pd.isna(value) and value not in known
    ]


# Levels that appear in test but never in train.
train_missed = {
    col: _levels_absent_from(test_unique_col_values[col], train_unique_col_values[col])
    for col in cat_columns
}
for col in cat_columns:
    print('missing types in train', col, len(train_missed[col]))

# Levels that appear in train but never in test.
test_missed = {
    col: _levels_absent_from(train_unique_col_values[col], test_unique_col_values[col])
    for col in cat_columns
}
for col in cat_columns:
    print('missing types in test', col, len(test_missed[col]))

missing types in train event_name 0
missing types in train specialty 13
missing types in train plan_type 0
missing types in test event_name 0
missing types in test specialty 36
missing types in test plan_type 0


In [10]:
# patient_id of every test event row (one entry per row, so ids repeat).
test_patient_ids = test_df['patient_id'].values

# Dedicated seeded generator: the placeholder rows, and therefore the saved
# test set, come out identical on every Restart & Run All.
placeholder_rng = random.Random(0)

# For every level that occurs in train but not in test, fabricate one test row
# carrying that level (presumably so both splits end up exposing the same set
# of levels downstream). The row is attached to a randomly drawn test patient;
# the other categorical columns get a random level seen in train, and
# event_time / patient_payment are left missing.
dup_values = []
for col in cat_columns:
    for item in test_missed[col]:
        patient = placeholder_rng.choice(test_patient_ids)
        # The missing level goes into its own column; every other categorical
        # column is filled with a random train level.
        levels = {
            other: item if other == col
            else placeholder_rng.choice(train_unique_col_values[other])
            for other in cat_columns
        }
        # Field order must match test_df's columns: patient_id, event_name,
        # event_time, specialty, plan_type, patient_payment, id.
        dup_values.append([
            patient,
            levels['event_name'],
            np.nan,
            levels['specialty'],
            levels['plan_type'],
            np.nan,
            int(patient.split('_')[1]),  # same parsing rule as create_num_id
        ])

In [11]:
# Discard test events whose specialty was never observed in train, then
# renumber the remaining rows from zero.
unseen_specialty = test_df['specialty'].isin(train_missed['specialty'])
test_df = test_df[~unseen_specialty].reset_index(drop=True)

In [12]:
# Turn the fabricated rows into a frame that uses test_df's column layout,
# then show its dimensions and column names.
dup_df = pd.DataFrame(dup_values, columns=test_df.columns)
dup_df.shape, dup_df.columns

((36, 7),
 Index(['patient_id', 'event_name', 'event_time', 'specialty', 'plan_type',
        'patient_payment', 'id'],
       dtype='object'))

In [13]:
# Stack the placeholder rows under the real test events. pd.concat is used
# because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# it performs the same row-wise concatenation.
test_df = pd.concat([test_df, dup_df])
# sort_data also resets the index, so the duplicate labels left by the
# concatenation do not survive.
test_df = sort_data(test_df)
test_df.shape, test_df.columns

((6256130, 7),
 Index(['patient_id', 'event_name', 'event_time', 'specialty', 'plan_type',
        'patient_payment', 'id'],
       dtype='object'))

In [14]:
# Persist both prepared event tables as parquet, leaving out the row index.
for frame, out_name in ((train_df, 'train.parquet'), (test_df, 'test.parquet')):
    frame.to_parquet(out_name, index=False)