# Genentech Cervical Cancer - Big Table Merge

https://www.kaggle.com/c/cervical-cancer-screening/

In [1]:
# imports
import sys # for stderr
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# settings 
%logstop
%logstart  -o 'ipynb.log' rotate
plt.style.use('ggplot')
# constants
# plt.rcParams['figure.figsize'] = (10.0, 10.0)
# pd.set_option('display.max_rows', 50)
# pd.set_option('display.max_columns', 50)

Logging hadn't been started.
Activating auto-logging. Current session state plus future input saved.
Filename       : ipynb.log
Mode           : rotate
Output logging : True
Raw input log  : False
Timestamping   : False
State          : active


In [3]:
# versions 
import sys
print(pd.datetime.now())
print('Python: '+sys.version)
print('numpy: '+np.__version__)
print('pandas: '+pd.__version__)
print('sklearn: '+skl.__version__)

2016-01-31 07:56:14.306282
Python: 2.7.11 |Anaconda 2.4.0 (x86_64)| (default, Dec  6 2015, 18:57:58) 
[GCC 4.2.1 (Apple Inc. build 5577)]
numpy: 1.10.2
pandas: 0.17.1
sklearn: 0.17


## Load Train/Test

In [4]:
train_file = './input/patients_train.csv.gz'
train = pd.read_csv(train_file)
train.set_index('patient_id', inplace=True)
train.drop('patient_gender', axis = 1, inplace = True )
train_exclude = pd.read_csv('./input/train_patients_to_exclude.csv', header=None, names=['patient_id'])
train.drop(train_exclude.patient_id, inplace=True)
train.shape

(1157817, 6)

In [5]:
original_train_rows = train.shape[0]

In [6]:
train[:3]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1
94237712,39-41,NE,ALL OTHER,UNKNOWN,UNKNOWN,1
186124512,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,0


In [7]:
test_file = './input/patients_test.csv.gz'
test = pd.read_csv(test_file)
test.set_index('patient_id', inplace=True)
test.drop( 'patient_gender', axis = 1, inplace = True )
test_exclude = pd.read_csv('./input/test_patients_to_exclude.csv', header=None, names=['patient_id'])
test.drop(test_exclude.patient_id, inplace=True)
test.shape

(1701813, 5)

In [8]:
test[:3]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
148341312,66-68,TX,ALL OTHER,UNKNOWN,UNKNOWN
130010912,45-47,IN,ALL OTHER,UNKNOWN,UNKNOWN
103994412,27-29,CA,ALL OTHER,UNKNOWN,UNKNOWN


In [9]:
original_test_rows = test.shape[0]

## Load Features

In [13]:
def fheaders(filename):
    feature = pd.read_csv(filename, nrows=2)
    print(filename, feature.columns)
    return

In [9]:
fdir = './features/'

In [65]:

train_features = [
    'activity/activity_min_max.csv',
    'visits.csv.gz',
    'train_diagnosis_cbsa_counts.csv.gz',
    'train_patient_cbsa.csv.gz',
    'train_rx_payment.csv',
    'train_procedure_counts.csv.gz',
    'train_obg_pct.csv',
    'train_fmfp.csv',
#    'procedure/procedure_counts_selected.csv.gz',
]


In [66]:
for f in train_features:
    fheaders(fdir+f)

('./features/activity/activity_min_max.csv', Index([u'patient_id', u'first_visit', u'last_visit', u'date_delta',
       u'first_after_2012'],
      dtype='object'))
('./features/visits.csv.gz', Index([u'patient_id', u'visits'], dtype='object'))
('./features/train_diagnosis_cbsa_counts.csv.gz', Index([u'patient_id', u'cbsa'], dtype='object'))
('./features/train_patient_cbsa.csv.gz', Index([u'patient_id', u'cbsa', u'num_visits'], dtype='object'))
('./features/train_rx_payment.csv', Index([u'patient_id', u'RX_ASSISTANCE', u'RX_CASH', u'RX_COMMERCIAL',
       u'RX_MANAGED_MEDICAID', u'RX_MEDICAID', u'RX_MEDICARE', u'num_rx'],
      dtype='object'))
('./features/train_procedure_counts.csv.gz', Index([u'patient_id', u'num_procedures'], dtype='object'))
('./features/train_obg_pct.csv', Index([u'patient_id', u'obg_id', u'obg_diagnosis_count', u'obg_code',
       u'obg_cbsa', u'obg_screen_pct', u'obg_patient_count',
       u'obg_month_frequency', u'obg_claim_count', u'visited_obg'],
      dtype

In [17]:
xxx = pd.read_csv('./features/train_physician_family.csv.gz')
train_physician_family[:3]

Unnamed: 0,patient_id,FM,FP,FPG,FPP,FSM,PLN
0,84548607,13098491,,,,,
1,84548626,13181786,,,,,
2,84548666,24257215,29088164.0,,,,


In [67]:
fm_cols = ['patient_id','FM','FP']

In [51]:
train_physician_family[fm_cols].to_csv('./features/train_fmfp.csv', index=False)

In [23]:
test_physician_family.count()

patient_id    430246
FM            407828
FP            307532
FPG            14067
FPP              714
FSM            34677
PLN               21
dtype: int64

In [59]:
rx_cols = list(test_physician_family.columns).extend(['num_rx'])
print(rx_cols)
#rx_cols.extend(['num_rx'])
rx_cols

None


In [57]:
test_physician_family.columns

Index([u'patient_id', u'FM', u'FP', u'FPG', u'FPP', u'FSM', u'PLN'], dtype='object')

In [22]:
test_physician_family = pd.read_csv('./features/test_physician_family.csv.gz')
test_physician_family[:3]

Unnamed: 0,patient_id,FM,FP,FPG,FPP,FSM,PLN
0,84548780,47413525,,,,35477000.0,
1,84548841,13183196,,,,,
2,84548915,42659493,,,,,


In [69]:
test_physician_family[fm_cols].to_csv('./features/test_fmfp.csv', index=False)

In [27]:
activity = pd.read_csv('./features/activity/activity_min_max_month_year.csv.gz')

In [41]:
activity[:10]

Unnamed: 0,patient_id,max_month,max_year,min_month,min_year,first_visit,last_visit,date_delta,first_after_2012
0,84548607,2,2011,4,2008,2008-04-01,2011-02-01,34,0
1,84548626,12,2014,12,2008,2008-12-01,2014-12-01,73,0
2,84548666,12,2014,11,2008,2008-11-01,2014-12-01,74,0
3,84548780,12,2014,3,2008,2008-03-01,2014-12-01,82,0
4,84548805,12,2014,10,2008,2008-10-01,2014-12-01,75,0
5,84548821,12,2014,1,2008,2008-01-01,2014-12-01,84,0
6,84548841,12,2014,1,2008,2008-01-01,2014-12-01,84,0
7,84548915,12,2014,1,2008,2008-01-01,2014-12-01,84,0
8,84549017,11,2014,2,2008,2008-02-01,2014-11-01,82,0
9,84549024,11,2014,1,2008,2008-01-01,2014-11-01,83,0


In [29]:
activity.dtypes

patient_id    int64
max_month     int64
max_year      int64
min_month     int64
min_year      int64
dtype: object

In [33]:
activity['first_visit'] = pd.to_datetime(activity.min_year.astype(str)+activity.min_month.astype(str),format='%Y%m')
activity['last_visit'] = pd.to_datetime(activity.max_year.astype(str)+activity.max_month.astype(str),format='%Y%m')

In [35]:
activity['date_delta'] = ((activity.last_visit - activity.first_visit).dt.days / 30).astype(int)

In [40]:
activity['first_after_2012'] = (activity.first_visit > pd.datetime(2012,1,1)).astype(int)

In [50]:
acols = ['patient_id','first_visit','last_visit','date_delta','first_after_2012']
activity[acols].to_csv('./features/activity/activity_min_max.csv', index=False)

In [11]:
# Train Files
fdir = './features/'
visits = fdir+'visits.csv.gz'
train_diagnosis_cbsa_counts = fdir+'train_diagnosis_cbsa_counts.csv.gz'
train_patient_cbsa = fdir+'train_patient_cbsa.csv.gz'
train_procedure_counts = fdir+'train_procedure_counts.csv.gz'
train_surgical_claim_type = fdir+'train_surgical_claim_type.csv.gz'
train_surgical_place_of_service = fdir+'train_surgical_place_of_service.csv.gz'
train_surgical_primary_physician_role = fdir+'train_surgical_primary_physician_role.csv.gz'
train_surgical_procedure_type_code = fdir+'train_surgical_procedure_type_code.csv.gz'

In [12]:
train_features = [
    visits, 
    train_patient_cbsa,
    train_procedure_counts,
    train_surgical_place_of_service,
    train_surgical_procedure_type_code
]

In [13]:
train[:4]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1
94237712,39-41,NE,ALL OTHER,UNKNOWN,UNKNOWN,1
186124512,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,0
767144212,27-29,NY,ALL OTHER,UNKNOWN,UNKNOWN,1


In [14]:
for f in train_features:
    train = fmerge(train, f)

In [15]:
import gc
gc.collect()

157

In [16]:
if train.shape[0] != original_train_rows:
    raise Exception('bad merge?')

In [17]:
train[:4]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener,visits,cbsa,num_visits,num_procedures,...,0003,0004,0005,0006,HX01,HX02,HX03,HX04,HX05,HXPR
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1,37,43940,8,49,...,,,,,,,,,,
94237712,39-41,NE,ALL OTHER,UNKNOWN,UNKNOWN,1,129,36540,20,247,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0
186124512,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,0,26,31080,5,18,...,,,,,,,,,,
767144212,27-29,NY,ALL OTHER,UNKNOWN,UNKNOWN,1,2,35620,43,293,...,0.0,0.0,0.0,0.0,6.0,4.0,0.0,0.0,0.0,14.0


In [18]:
# Test Files
fdir = './features/'
visits = fdir+'visits.csv.gz'
test_diagnosis_cbsa_counts = fdir+'test_diagnosis_cbsa_counts.csv.gz'
test_patient_cbsa = fdir+'test_patient_cbsa.csv.gz'
test_procedure_counts = fdir+'test_procedure_counts.csv.gz'
test_surgical_claim_type = fdir+'test_surgical_claim_type.csv.gz'
test_surgical_place_of_service = fdir+'test_surgical_place_of_service.csv.gz'
test_surgical_primary_physician_role = fdir+'test_surgical_primary_physician_role.csv.gz'
test_surgical_procedure_type_code = fdir+'test_surgical_procedure_type_code.csv.gz'

In [19]:
test_features = [
    visits, 
    test_patient_cbsa,
    test_procedure_counts,
    test_surgical_place_of_service,
    test_surgical_procedure_type_code
]

In [20]:
for f in test_features:
    test = fmerge(test, f)

In [21]:
if test.shape[0] != original_test_rows:
    raise Exception('bad merge?')

In [22]:
test.shape

(1701813, 26)

In [23]:
test[:4]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,visits,cbsa,num_visits,num_procedures,CLINIC,...,0003,0004,0005,0006,HX01,HX02,HX03,HX04,HX05,HXPR
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
148341312,66-68,TX,ALL OTHER,UNKNOWN,UNKNOWN,144,19100,11,135,,...,,,,,,,,,,
130010912,45-47,IN,ALL OTHER,UNKNOWN,UNKNOWN,92,26900,8,85,,...,,,,,,,,,,
103994412,27-29,CA,ALL OTHER,UNKNOWN,UNKNOWN,168,31080,37,417,,...,,,,,,,,,,
318658812,27-29,TN,ALL OTHER,UNKNOWN,UNKNOWN,72,32820,14,150,0.0,...,0.0,0.0,0.0,0.0,6.0,2.0,2.0,0.0,0.0,6.0


## num_diagnosis

In [24]:
diagnosis_cbsa_count_train = pd.read_csv(fdir+'diagnosis_cbsa_count_train.csv.gz')
diagnosis_cbsa_count_train[:3]

Unnamed: 0,patient_id,cbsa,count
0,84548607,18620,1
1,84548607,19100,1
2,84548607,46100,1


In [25]:
train_diagnosis_count = diagnosis_cbsa_count_train.groupby('patient_id')['count'].sum()
train['num_diagnosis'] = train_diagnosis_count

In [26]:
diagnosis_cbsa_count_test = pd.read_csv(fdir+'diagnosis_cbsa_count_test.csv.gz')
diagnosis_cbsa_count_test[:3]

Unnamed: 0,patient_id,cbsa,count
0,84548780,21500,1
1,84548780,26180,18
2,84548841,11460,1


In [27]:
test_diagnosis_count = diagnosis_cbsa_count_test[['patient_id','count']].groupby('patient_id')['count'].sum()
test['num_diagnosis'] = test_diagnosis_count

In [28]:
test[:2]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,visits,cbsa,num_visits,num_procedures,CLINIC,...,0004,0005,0006,HX01,HX02,HX03,HX04,HX05,HXPR,num_diagnosis
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
148341312,66-68,TX,ALL OTHER,UNKNOWN,UNKNOWN,144,19100,11,135,,...,,,,,,,,,,11
130010912,45-47,IN,ALL OTHER,UNKNOWN,UNKNOWN,92,26900,8,85,,...,,,,,,,,,,8


In [29]:
gc.collect()

239

In [30]:
if train.shape[0] != original_train_rows:
    raise Exception('bad merge?')

## rx_payment

In [31]:
rx_payment = pd.read_csv(fdir+'rx_payment.csv.gz')
rx_payment[:5]

Unnamed: 0,patient_id,payment
0,84548607,COMMERCIAL
1,84548626,CASH
2,84548626,COMMERCIAL
3,84548666,COMMERCIAL
4,84548780,CASH


In [32]:
rx_count = rx_payment.groupby('patient_id').payment.count()

In [33]:
rx_count[:10]

patient_id
84548607    1
84548626    2
84548666    1
84548780    3
84548805    3
84548821    2
84548841    2
84548915    3
84549017    5
84549024    2
Name: payment, dtype: int64

In [34]:
train['num_rx'] = rx_count
test['num_rx'] = rx_count

In [35]:
rx_pivot = rx_payment.pivot(index='patient_id', columns='payment', values='payment')

In [36]:
rx_pivot.columns = ['RX_ASSISTANCE','RX_CASH','RX_COMMERCIAL','RX_MANAGED_MEDICAID','RX_MEDICAID','RX_MEDICARE']

In [37]:
rx_pivot[:10]

Unnamed: 0_level_0,RX_ASSISTANCE,RX_CASH,RX_COMMERCIAL,RX_MANAGED_MEDICAID,RX_MEDICAID,RX_MEDICARE
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
84548607,,,COMMERCIAL,,,
84548626,,CASH,COMMERCIAL,,,
84548666,,,COMMERCIAL,,,
84548780,,CASH,COMMERCIAL,"MANAGED MEDICAID""""",,
84548805,,CASH,COMMERCIAL,"MANAGED MEDICAID""""",,
84548821,,CASH,COMMERCIAL,,,
84548841,,CASH,COMMERCIAL,,,
84548915,,,COMMERCIAL,"MANAGED MEDICAID""""",MEDICAID,
84549017,,CASH,COMMERCIAL,"MANAGED MEDICAID""""",MEDICAID,MEDICARE
84549024,,CASH,COMMERCIAL,,,


In [38]:
rx_plans = pd.get_dummies(rx_pivot)

In [39]:
rx_plans.columns = ['RX_ASSISTANCE','RX_CASH','RX_COMMERCIAL','RX_MANAGED_MEDICAID','RX_MEDICAID','RX_MEDICARE']

In [40]:
rx_plans[:10]

Unnamed: 0_level_0,RX_ASSISTANCE,RX_CASH,RX_COMMERCIAL,RX_MANAGED_MEDICAID,RX_MEDICAID,RX_MEDICARE
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
84548607,0,0,1,0,0,0
84548626,0,1,1,0,0,0
84548666,0,0,1,0,0,0
84548780,0,1,1,1,0,0
84548805,0,1,1,1,0,0
84548821,0,1,1,0,0,0
84548841,0,1,1,0,0,0
84548915,0,0,1,1,1,0
84549017,0,1,1,1,1,1
84549024,0,1,1,0,0,0


In [41]:
train = pd.merge(train, rx_plans, left_index=True, right_index=True, how ='left')

In [42]:
test = pd.merge(test, rx_plans, left_index=True, right_index=True, how ='left')

In [43]:
train[:3]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener,visits,cbsa,num_visits,num_procedures,...,HX05,HXPR,num_diagnosis,num_rx,RX_ASSISTANCE,RX_CASH,RX_COMMERCIAL,RX_MANAGED_MEDICAID,RX_MEDICAID,RX_MEDICARE
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1,37,43940,8,49,...,,,14,2,0,1,1,0,0,0
94237712,39-41,NE,ALL OTHER,UNKNOWN,UNKNOWN,1,129,36540,20,247,...,0.0,2.0,21,2,0,1,1,0,0,0
186124512,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,0,26,31080,5,18,...,,,6,2,0,1,1,0,0,0


In [44]:
train.shape, test.shape

((1157817, 35), (1701813, 34))

In [45]:
gc.collect()

362

In [46]:
if train.shape[0] != original_train_rows:
    raise Exception('bad merge?')

## pract_screen_pct

In [47]:
patient_pract = pd.read_csv(fdir+'diagnosis_patient_practitioner_train.csv.gz')
patient_pract.set_index('patient_id', inplace=True)
patient_pract[:2]

Unnamed: 0_level_0,primary_practitioner_id,cbsa
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
84548607,12847096,46340
84548607,12923026,46340


In [48]:
patient_pract.shape

(25364706, 2)

In [49]:
prime_pract = pd.merge(pd.DataFrame(train.is_screener), patient_pract, left_index=True, right_index=True, how='left')
prime_pract.drop(['cbsa'], axis=1, inplace=True)
prime_pract[:3]

Unnamed: 0_level_0,is_screener,primary_practitioner_id
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
84548607,0,12847096
84548607,0,12923026
84548607,0,12930342


In [50]:
prime_pract.reset_index(inplace=True)
prime_pract.drop('patient_id', axis=1, inplace=True)

In [51]:
prime_pract = prime_pract.sort_values(by='primary_practitioner_id')
prime_pract[:5]

Unnamed: 0,is_screener,primary_practitioner_id
11820982,1,12468727
18167464,1,12468727
24566777,1,12468727
19494255,1,12468727
19802299,1,12469219


In [52]:
pract_g = prime_pract.groupby('primary_practitioner_id')
ppp = pd.DataFrame(pract_g.is_screener.mean())  # primary practioner percentage
ppp.iloc[:5]

Unnamed: 0_level_0,is_screener
primary_practitioner_id,Unnamed: 1_level_1
12468727,1.0
12469219,0.333333
12469795,1.0
12470070,1.0
12470221,1.0


In [53]:
patient_prime = pd.merge(patient_pract, ppp, left_on='primary_practitioner_id', right_index=True, how='left')
patient_prime.iloc[:5]

Unnamed: 0_level_0,primary_practitioner_id,cbsa,is_screener
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
84548607,12847096,46340,0.494
84548607,12923026,46340,0.534884
84548607,12930342,46100,0.437838
84548607,12993738,46340,0.463895
84548607,13001412,46340,0.453581


In [54]:
patient_prime_screen_pct = pd.DataFrame(patient_prime.groupby(level=0).is_screener.max())  # COULD TAKE mean()
patient_prime_screen_pct.columns=['pract_screen_pct']
patient_prime_screen_pct.iloc[:5]

Unnamed: 0_level_0,pract_screen_pct
patient_id,Unnamed: 1_level_1
84548607,0.647059
84548626,0.965714
84548666,0.877778
84548805,1.0
84548821,0.97426


In [55]:
train['pract_screen_pct'] = patient_prime_screen_pct.pract_screen_pct

In [56]:
test_patient_pract = pd.read_csv(fdir+'diagnosis_patient_practitioner_test.csv.gz')
test_patient_pract.set_index('patient_id', inplace=True)
test_patient_pract[:2]

Unnamed: 0_level_0,primary_practitioner_id,cbsa
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
84548780,12755099,26180
84548780,14102147,26180


In [57]:
test_patient_prime = pd.merge(test_patient_pract, ppp, left_on='primary_practitioner_id', right_index=True, how='left')
test_patient_prime.iloc[:5]

Unnamed: 0_level_0,primary_practitioner_id,cbsa,is_screener
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
84548780,12755099,26180,0.476636
84548780,14102147,26180,0.490196
84548780,14142454,26180,0.934579
84548780,14357789,26180,0.465517
84548780,16955346,26180,0.503876


In [58]:
test_patient_prime_screen_pct = pd.DataFrame(test_patient_prime.groupby(level=0).is_screener.max()) # COULD TAKE mean()
test_patient_prime_screen_pct.columns=['pract_screen_pct']
test_patient_prime_screen_pct.iloc[:5]

Unnamed: 0_level_0,pract_screen_pct
patient_id,Unnamed: 1_level_1
84548780,1.0
84548841,1.0
84548915,1.0
84549024,0.75
84549114,0.888889


In [59]:
test['pract_screen_pct'] = test_patient_prime_screen_pct.pract_screen_pct

In [60]:
gc.collect()

164

In [61]:
if train.shape[0] != original_train_rows:
    raise Exception('bad merge?')

## cbsa_screen_pct

In [62]:
patient_cbsas = pd.merge(pd.DataFrame(train.is_screener), patient_pract, left_index=True, right_index=True, how='left')
patient_cbsas[:3]

Unnamed: 0_level_0,is_screener,primary_practitioner_id,cbsa
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
84548607,0,12847096,46340
84548607,0,12923026,46340
84548607,0,12930342,46100


In [63]:
patient_cbsas.reset_index(inplace=True)
patient_cbsas.drop(['patient_id','primary_practitioner_id'], axis=1, inplace=True)
patient_cbsas[:3]

Unnamed: 0,is_screener,cbsa
0,0,46340
1,0,46340
2,0,46100


In [64]:
cbsa_g = patient_cbsas.groupby('cbsa')
cbsa_pct = pd.DataFrame(cbsa_g.is_screener.mean())  # cbsa percentage screened at that location
cbsa_pct.iloc[:5]

Unnamed: 0_level_0,is_screener
cbsa,Unnamed: 1_level_1
10100,0.59273
10140,0.335737
10180,0.390896
10220,0.571952
10260,0.369565


In [65]:
cbsa_pct.columns = ['cbsa_pct']

In [66]:
train = pd.merge(train, cbsa_pct, left_on='cbsa', right_index=True, how='left')

In [67]:
test = pd.merge(test, cbsa_pct, left_on='cbsa', right_index=True, how='left')

In [68]:
if train.shape[0] != original_train_rows:
    raise Exception('bad merge?')

## age_pct

In [69]:
# age_pct = train[['patient_age_group','is_screener']].groupby('patient_age_group').is_screener.mean()
# age_pct

In [70]:
age_pct_file = fdir+'age_pct.csv'

In [71]:
train = fjoin(train, age_pct_file, 'patient_age_group')
test = fjoin(test, age_pct_file, 'patient_age_group')

In [72]:
if train.shape[0] != original_train_rows:
    raise Exception('bad merge?')

## state_pct

In [73]:
state_pct = fdir+'state_screen_percent.csv'

In [74]:
train = fjoin(train, state_pct, 'patient_state')
test = fjoin(test, state_pct, 'patient_state')

In [75]:
if train.shape[0] != original_train_rows:
    raise Exception('bad merge?')

##  diagnosis_code features

In [76]:
# from sqlalchemy import create_engine
# engine = create_engine('postgresql://paulperry:@localhost:5432/ccancer') 

In [77]:
# q1 = "select t1.patient_id, diagnosis_code from diagnosis t1 \
#     right join patients_train t2 on (t1.patient_id=t2.patient_id) where diagnosis_code in ('632','650')"

In [78]:
# diagf = pd.read_sql_query(q1, engine)

In [79]:
train_key_diagnosis = pd.read_csv(fdir+'train_key_diagnosis.csv.gz')
train_key_d = pd.crosstab(train_key_diagnosis.patient_id,train_key_diagnosis.diagnosis_code) 
train_key_d[:5]

diagnosis_code,632,650
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
84552398,0,1
84553164,1,0
84553713,0,1
84554799,2,2
84557447,1,0


In [80]:
# spot check this
# train_key_diagnosis[train_key_diagnosis.patient_id == 84554799]

In [81]:
# set dummies
train_key_d[train_key_d > 0] = 1

In [82]:
train = pd.merge(train, train_key_d, left_index=True, right_index=True, how='left')
train[:4]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener,visits,cbsa,num_visits,num_procedures,...,RX_COMMERCIAL,RX_MANAGED_MEDICAID,RX_MEDICAID,RX_MEDICARE,pract_screen_pct,cbsa_pct,age_pct,state_pct,632,650
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1,37,43940,8,49,...,1,0,0,0,0.851852,0.603501,0.538345,0.556098,,
94237712,39-41,NE,ALL OTHER,UNKNOWN,UNKNOWN,1,129,36540,20,247,...,1,0,0,0,1.0,0.705413,0.624289,0.608479,1.0,0.0
186124512,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,0,26,31080,5,18,...,1,0,0,0,0.777778,0.442985,0.718529,0.526563,,
767144212,27-29,NY,ALL OTHER,UNKNOWN,UNKNOWN,1,2,35620,43,293,...,0,1,0,0,1.0,0.730394,0.703938,0.741726,1.0,1.0


In [83]:
test_key_diagnosis = pd.read_csv(fdir+'test_key_diagnosis.csv.gz')
test_key_d = pd.crosstab(test_key_diagnosis.patient_id,test_key_diagnosis.diagnosis_code) 
test_key_d[:5]

diagnosis_code,632,650
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
84548780,0,1
84549024,0,4
84549728,0,2
84549953,0,4
84550351,0,3


In [84]:
# set dummies
test_key_d[test_key_d > 0] = 1

In [85]:
test = pd.merge(test, test_key_d, left_index=True, right_index=True, how='left')

In [86]:
if train.shape[0] != original_train_rows:
    raise Exception('bad merge?')

## procedure_code features

In [87]:
train_key_procedure = pd.read_csv(fdir+'train_key_procedure.csv.gz')
train_key_p = pd.crosstab(train_key_procedure.patient_id,train_key_procedure.procedure_code) 
train_key_p[:2]

procedure_code,57452,57454,57455,57456,81252,90696,G0143,S4020,S4023
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
84553111,0,0,0,1,0,0,0,0,0
84553567,1,0,0,0,0,0,0,0,0


In [88]:
# set dummies
train_key_p[train_key_p > 0] = 1

In [89]:
train = pd.merge(train, train_key_p, left_index=True, right_index=True, how='left')
train[:4]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener,visits,cbsa,num_visits,num_procedures,...,650,57452,57454,57455,57456,81252,90696,G0143,S4020,S4023
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
336201912,51-53,SD,ALL OTHER,UNKNOWN,UNKNOWN,1,37,43940,8,49,...,,,,,,,,,,
94237712,39-41,NE,ALL OTHER,UNKNOWN,UNKNOWN,1,129,36540,20,247,...,0.0,,,,,,,,,
186124512,24-26,CA,ALL OTHER,UNKNOWN,UNKNOWN,0,26,31080,5,18,...,,,,,,,,,,
767144212,27-29,NY,ALL OTHER,UNKNOWN,UNKNOWN,1,2,35620,43,293,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
test_key_procedure = pd.read_csv(fdir+'test_key_procedure.csv.gz')
test_key_p = pd.crosstab(test_key_procedure.patient_id,test_key_procedure.procedure_code) 
test_key_p[:2]

procedure_code,57452,57454,57455,57456,81252,90696,G0143,S4020,S4023
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
84549453,0,1,0,0,0,0,0,0,0
84549821,0,1,0,0,0,0,0,0,0


In [91]:
# set dummies
test_key_p[test_key_p > 0] = 1

In [92]:
test = pd.merge(test, test_key_p, left_index=True, right_index=True, how='left')
test[:4]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,visits,cbsa,num_visits,num_procedures,CLINIC,...,650,57452,57454,57455,57456,81252,90696,G0143,S4020,S4023
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
148341312,66-68,TX,ALL OTHER,UNKNOWN,UNKNOWN,144,19100,11,135,,...,,,,,,,,,,
130010912,45-47,IN,ALL OTHER,UNKNOWN,UNKNOWN,92,26900,8,85,,...,,,,,,,,,,
103994412,27-29,CA,ALL OTHER,UNKNOWN,UNKNOWN,168,31080,37,417,,...,,,,,,,,,,
318658812,27-29,TN,ALL OTHER,UNKNOWN,UNKNOWN,72,32820,14,150,0.0,...,,,,,,,,,,


In [93]:
if train.shape[0] != original_train_rows:
    raise Exception('bad merge? : '+str(train.shape)+' should be '+str(original_train_rows))

In [94]:
if test.shape[0] != original_test_rows:
    raise Exception('bad merge? : '+str(test.shape)+' should be '+str(original_test_rows))

## Output

In [95]:
train.columns

Index([  u'patient_age_group',       u'patient_state',          u'ethinicity',
          u'household_income',     u'education_level',         u'is_screener',
                    u'visits',                u'cbsa',          u'num_visits',
            u'num_procedures',              u'CLINIC',           u'INPATIENT',
                     u'OTHER',          u'OUTPATIENT',             u'UNKNOWN',
                      u'0001',                u'0002',                u'0003',
                      u'0004',                u'0005',                u'0006',
                      u'HX01',                u'HX02',                u'HX03',
                      u'HX04',                u'HX05',                u'HXPR',
             u'num_diagnosis',              u'num_rx',       u'RX_ASSISTANCE',
                   u'RX_CASH',       u'RX_COMMERCIAL', u'RX_MANAGED_MEDICAID',
               u'RX_MEDICAID',         u'RX_MEDICARE',    u'pract_screen_pct',
                  u'cbsa_pct',             u'age_pct

In [96]:
train.shape, test.shape

((1157817, 50), (1701813, 49))

In [97]:
train.to_csv('./features/train_big_table.csv')

In [98]:
test.to_csv('./features/test_big_table.csv')

In [99]:
train_encoded = train.copy()

In [100]:
# patient_age encode
patient_age_dict = \
{
'24-26': 1,
'27-29': 2,
'30-32': 3,
'36-38': 4,
'39-41': 5,
'42-44': 6,
'45-47': 7,
'48-50': 8,
'33-35': 9,
'51-53': 10,
'54-56': 11,
'57-59': 12,
'60-62': 13,
'63-65': 14,
'66-68': 15,
'69-71': 16
}

train_encoded.patient_age_group  = [ patient_age_dict[i]  for i in train_encoded.patient_age_group.values ]

In [101]:
household_income_dict = {'UNKNOWN': 0,  '<=$49K': 1, '<$50-99K': 2, '$100K+': 3}
train_encoded.household_income  = [ household_income_dict[i]  for i in train_encoded.household_income.values ]

In [102]:
from sklearn.preprocessing import LabelEncoder
le_patient_state = LabelEncoder().fit(train.patient_state.values)
le_ethinicity    = LabelEncoder().fit(train.ethinicity.values)
le_education_level = LabelEncoder().fit(train.education_level.values)
train_encoded.patient_state = le_patient_state.transform(train.patient_state.values)
train_encoded.ethinicity    = le_ethinicity.transform(train.ethinicity.values)
train_encoded.education_level = le_education_level.transform(train.education_level.values)

In [103]:
train_encoded[:3]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,is_screener,visits,cbsa,num_visits,num_procedures,...,650,57452,57454,57455,57456,81252,90696,G0143,S4020,S4023
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
336201912,10,41,1,0,3,1,37,43940,8,49,...,,,,,,,,,,
94237712,5,29,1,0,3,1,129,36540,20,247,...,0.0,,,,,,,,,
186124512,1,4,1,0,3,0,26,31080,5,18,...,,,,,,,,,,


In [104]:
train_encoded.to_csv('./features/train_big_table_encoded.csv')

In [105]:
test_encoded = test.copy()

In [106]:
test_encoded.patient_age_group  = [ patient_age_dict[i]  for i in test_encoded.patient_age_group.values ]
test_encoded.household_income  = [ household_income_dict[i]  for i in test_encoded.household_income.values ]

In [107]:
test_encoded.patient_state = le_patient_state.transform(test.patient_state.values)
test_encoded.ethinicity    = le_ethinicity.transform(test.ethinicity.values)
test_encoded.education_level = le_education_level.transform(test.education_level.values)

In [108]:
test_encoded[:3]

Unnamed: 0_level_0,patient_age_group,patient_state,ethinicity,household_income,education_level,visits,cbsa,num_visits,num_procedures,CLINIC,...,650,57452,57454,57455,57456,81252,90696,G0143,S4020,S4023
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
148341312,15,43,1,0,3,144,19100,11,135,,...,,,,,,,,,,
130010912,7,15,1,0,3,92,26900,8,85,,...,,,,,,,,,,
103994412,2,4,1,0,3,168,31080,37,417,,...,,,,,,,,,,


In [109]:
test_encoded.to_csv('./features/test_big_table_encoded.csv')

In [110]:
gc.collect()

674

# DONE !!!

In [111]:
! gzip -f ./features/train_big_table.csv
! gzip -f ./features/test_big_table.csv
! gzip -f ./features/train_big_table_encoded.csv
! gzip -f ./features/test_big_table_encoded.csv