In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input
# directory, then print one path per line (same order as os.walk yields them).
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load train_data.csv from the Kaggle input directory into a DataFrame.
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/armanik-patient-drugswitch/Drug_Switch_Prediction_ParticipantsData/train_data.csv',
)

In [None]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.
    use_float16 : bool, default True
        When true (the historical behaviour) float columns whose min/max fit
        the float16 range are stored as float16. float16 keeps only about
        3 significant decimal digits, so values are rounded; pass ``False``
        to use float32 as the narrowest float type instead (existing float16
        columns are then widened to float32).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target types, narrowest first.
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Integer columns that fit nowhere (e.g. empty column, min/max are
            # NaN) keep their current dtype.
            limits, candidates, fallback = np.iinfo, int_types, None
        else:
            # Float columns that fit no narrower type (inf, all-NaN, huge
            # values) are stored as float64.
            limits, candidates, fallback = np.finfo, float_types, np.float64

        # Bounds are inclusive: a value equal to a dtype's min/max is
        # representable in that dtype.
        target = next(
            (t for t in candidates
             if limits(t).min <= c_min and c_max <= limits(t).max),
            fallback,
        )
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-sized frame so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast the numeric columns of the training frame and print the memory report.
train = reduce_mem_usage(df=train, verbose=True)

In [None]:
from joblib import Parallel, delayed
import multiprocessing

from datetime import datetime 

In [None]:
import gc

In [None]:
def frequency_feature_generator(train, time_points=range(30, 1110, 30), n_jobs=4):
    """Build per-patient cumulative event-count features for a grid of time cutoffs.

    For every cutoff ``t`` in ``time_points`` and every categorical column in
    ``event_name``, ``specialty`` and ``plan_type``, the rows of ``train`` with
    ``event_time <= t`` are counted per ``(patient_id, category value)``.
    Combinations with no matching rows get a count of 0.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``patient_id``, ``event_time``,
        ``event_name``, ``specialty`` and ``plan_type``.
    time_points : iterable of numbers, default ``range(30, 1110, 30)``
        Cutoffs applied to ``event_time``; one group of feature columns is
        produced per cutoff, in the given order.
    n_jobs : int, default 4
        Number of joblib workers. With ``n_jobs == 1`` the cutoffs are
        processed in a plain loop in the current process.

    Returns
    -------
    pandas.DataFrame
        One row per unique ``patient_id`` (in order of first appearance in
        ``train``) with a ``patient_id`` column followed by columns named
        ``frequency_<cutoff>_<column>__<value>``.
    """
    cat_cols = ['event_name', 'specialty', 'plan_type']
    patient_ids = train['patient_id'].unique()
    patient_uni_df = pd.DataFrame({'patient_id': patient_ids, 'key': 1})

    # For each categorical column, the full (patient x category value) grid,
    # built as a cross join through a constant helper column. Left-joining the
    # observed counts onto this grid guarantees every patient gets every
    # feature column, whatever the cutoff.
    df_selector = {}
    for cat in cat_cols:
        cat_uni_df = pd.DataFrame({cat: train[cat].unique(), 'key': 1})
        df_selector[cat] = patient_uni_df.merge(cat_uni_df, on='key')

    def frequency_calculator(time):
        """Return the count features of all patients for a single cutoff."""
        new_df = pd.DataFrame({'patient_id': patient_ids}).set_index('patient_id')
        t1 = train[train['event_time'] <= time]

        for cat in cat_cols:
            grouped_df = t1.groupby(['patient_id', cat])['event_time'].count().reset_index()
            temp_df = pd.merge(df_selector[cat], grouped_df, on=['patient_id', cat], how='left')
            # Wide layout: one row per patient, one column per category value.
            temp_1 = temp_df.pivot(index='patient_id', columns=cat, values='event_time')
            temp_1 = temp_1.rename_axis(None, axis=1).fillna(0)
            temp_2 = temp_1.add_prefix('frequency_' + str(time) + '_' + cat + '__')
            new_df = pd.merge(new_df, temp_2, left_index=True, right_index=True, how='left')
            # Release the intermediates before the next (potentially large) pivot.
            del grouped_df, temp_df, temp_1, temp_2
            gc.collect()
        return new_df.reset_index()

    time_points = list(time_points)
    if n_jobs == 1:
        # Serial path: no worker processes, nothing has to be pickled.
        results = [frequency_calculator(t) for t in time_points]
    else:
        results = Parallel(n_jobs=n_jobs)(delayed(frequency_calculator)(t) for t in time_points)

    # Iterate over the results themselves so the number of merges always
    # matches the number of cutoffs.
    final_frequency = pd.DataFrame({'patient_id': patient_ids})
    for result in results:
        final_frequency = pd.merge(final_frequency, result, on='patient_id', how='left')

    return final_frequency

In [None]:
# Time the feature generation: print the start timestamp, build the
# frequency features, then report the wall-clock duration in minutes.
start_time = datetime.now()
print(start_time)
freq_df = frequency_feature_generator(train)
elapsed_minutes = (datetime.now() - start_time).total_seconds() / 60
print("{} minutes".format(elapsed_minutes))

In [None]:
# Preview the first five rows of the generated feature table.
freq_df.head(n=5)