# Data Science Project: Persistency of Drug

## Notebook about data cleansing and transformation

## Load the dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import requests as rq
from io import BytesIO

# Download the raw Excel workbook and load it into a DataFrame.
url = "https://raw.githubusercontent.com/EniasVontas/DataSets/main/ntm_dataset.xlsx"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = rq.get(url, timeout=60)
# Fail loudly on an HTTP error instead of handing an error page to read_excel.
response.raise_for_status()
data = pd.read_excel(BytesIO(response.content))

# Column overview, followed by the class balance of the target variable.
data.info()
data.groupby(["Persistency_Flag"]).Persistency_Flag.count()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3424 entries, 0 to 3423
Data columns (total 69 columns):
 #   Column                                                              Non-Null Count  Dtype 
---  ------                                                              --------------  ----- 
 0   Ptid                                                                3424 non-null   object
 1   Persistency_Flag                                                    3424 non-null   object
 2   Gender                                                              3424 non-null   object
 3   Race                                                                3424 non-null   object
 4   Ethnicity                                                           3424 non-null   object
 5   Region                                                              3424 non-null   object
 6   Age_Bucket                                                          3424 non-null   object
 7   Ntm_Speciality          

Persistency_Flag
Non-Persistent    2135
Persistent        1289
Name: Persistency_Flag, dtype: int64

#### Split Dataset
####  Label the target variable and drop ID variable

In [2]:
# 'Ptid' is dropped from the frame before features and target are separated.
data = data.drop(columns=['Ptid'])

# Target column on one side, every remaining column on the other.
y = data['Persistency_Flag']
X = data.drop(columns=['Persistency_Flag'])

#### Split the dataset into Train and Test

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Report the shape of each split.
print(f"Train {X_train.shape} {y_train.shape}")
print(f"Test {X_test.shape} {y_test.shape}")

Train (2739, 67) (2739,)
Test (685, 67) (685,)


In [4]:
# We encode the target variable
from sklearn import preprocessing

# The label -> integer mapping is learned on the training target only and then
# reused unchanged for the test target. `le.classes_` lists the labels in the
# order of their integer codes.
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

####     Choose 'object' and 'non-object' columns

In [5]:
# Object-typed columns are copied out of each split.
data_obj_train = X_train.select_dtypes(include='object').copy()
data_obj_test = X_test.select_dtypes(include='object').copy()

# Everything that is not object-typed forms the numeric part of each split.
data_numeric_train = X_train.select_dtypes(exclude='object')
data_numeric_test = X_test.select_dtypes(exclude='object')

#### We keep the indices of the Train and Test sets, because scaling and imputation return plain arrays and the original indices must be restored afterwards

In [6]:
# Remember the row labels of both splits so they can be re-attached later.
indices_train, indices_test = data_numeric_train.index, data_numeric_test.index

#### We check some variables for Unknown values

In [7]:
# Number of training rows per Ntm_Speciality value.
data_obj_train.groupby(["Ntm_Speciality"])["Ntm_Speciality"].count()

Ntm_Speciality
CARDIOLOGY                                                          20
CLINICAL NURSE SPECIALIST                                            1
EMERGENCY MEDICINE                                                   1
ENDOCRINOLOGY                                                      353
GASTROENTEROLOGY                                                     2
GENERAL PRACTITIONER                                              1236
GERIATRIC MEDICINE                                                   2
HEMATOLOGY & ONCOLOGY                                               11
HOSPICE AND PALLIATIVE MEDICINE                                      2
NEPHROLOGY                                                           3
NEUROLOGY                                                            1
NUCLEAR MEDICINE                                                     1
OBSTETRICS & OBSTETRICS & GYNECOLOGY & OBSTETRICS & GYNECOLOGY       1
OBSTETRICS AND GYNECOLOGY                                     

In [8]:
# Number of training rows per Ntm_Specialist_Flag value.
data_obj_train.groupby(["Ntm_Specialist_Flag"])["Ntm_Specialist_Flag"].count()

Ntm_Specialist_Flag
Others        1633
Specialist    1106
Name: Ntm_Specialist_Flag, dtype: int64

In [9]:
# Number of training rows per Ntm_Speciality_Bucket value.
data_obj_train.groupby(["Ntm_Speciality_Bucket"])["Ntm_Speciality_Bucket"].count()

Ntm_Speciality_Bucket
Endo/Onc/Uro                  556
OB/GYN/Others/PCP/Unknown    1706
Rheum                         477
Name: Ntm_Speciality_Bucket, dtype: int64

In [10]:
# Number of training rows per Adherent_Flag value.
data_obj_train.groupby(["Adherent_Flag"])["Adherent_Flag"].count()

Adherent_Flag
Adherent        2599
Non-Adherent     140
Name: Adherent_Flag, dtype: int64

In [11]:
# Number of training rows per Tscore_Bucket_During_Rx value (including 'Unknown').
data_obj_train.groupby(["Tscore_Bucket_During_Rx"])["Tscore_Bucket_During_Rx"].count()

Tscore_Bucket_During_Rx
<=-2.5      805
>-2.5       711
Unknown    1223
Name: Tscore_Bucket_During_Rx, dtype: int64

In [12]:
# Number of training rows per Change_T_Score value (including 'Unknown').
data_obj_train.groupby(["Change_T_Score"])["Change_T_Score"].count()

Change_T_Score
Improved       77
No change    1299
Unknown      1223
Worsened      140
Name: Change_T_Score, dtype: int64

In [13]:
# Number of training rows per Change_Risk_Segment value (including 'Unknown').
data_obj_train.groupby(["Change_Risk_Segment"])["Change_Risk_Segment"].count()

Change_Risk_Segment
Improved       17
No change     820
Unknown      1802
Worsened      100
Name: Change_Risk_Segment, dtype: int64

#### We drop the two columns, because Count of Risks is a linear combination of all other Risk variables
#### and Change in Risk Segment has about 65% missing values

In [14]:
# Count_Of_Risks is removed from the numeric part of both splits.
data_numeric_train = data_numeric_train.drop(columns='Count_Of_Risks')
data_numeric_test = data_numeric_test.drop(columns='Count_Of_Risks')

# Change_Risk_Segment is removed from the categorical part of both splits.
data_obj_train = data_obj_train.drop(columns=['Change_Risk_Segment'])
data_obj_test = data_obj_test.drop(columns=['Change_Risk_Segment'])

## Variable Treatment, Missing Values and Imputation

### Scale Numerical Variable in both Train and Test sets

In [15]:

from sklearn.preprocessing import RobustScaler

# The scaler is fitted on the training split only, so no test-set statistics
# leak into the transformation.
transformer = RobustScaler().fit(data_numeric_train)

# transform() returns a bare array; rebuild the DataFrame with the column labels
# and row index of the input frame instead of a hard-coded column name, so the
# cell keeps working if the set of numeric columns changes.
data_numeric_train = pd.DataFrame(
    transformer.transform(data_numeric_train),
    columns=data_numeric_train.columns,
    index=data_numeric_train.index,
)

In [16]:
# Summary statistics of the scaled numeric training data.
data_numeric_train.describe()

Unnamed: 0,Dexa_Freq_During_Rx
count,2739.0
mean,0.988317
std,2.766387
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,48.666667


In [17]:

# Apply the scaler fitted on the training split to the test split.
# transform() returns a bare array; rebuild the DataFrame with the column labels
# and row index of the input frame instead of a hard-coded column name.
data_numeric_test = pd.DataFrame(
    transformer.transform(data_numeric_test),
    columns=data_numeric_test.columns,
    index=data_numeric_test.index,
)

In [18]:
# Summary statistics of the scaled numeric test data.
data_numeric_test.describe()

Unnamed: 0,Dexa_Freq_During_Rx
count,685.0
mean,1.073479
std,2.48438
min,0.0
25%,0.0
50%,0.0
75%,1.333333
max,29.333333


### We save the columns that will be One Hot Encoded

In [19]:
# Names of the categorical columns that are one-hot encoded in the next step.
cols = [
    "Gender",
    "Gluco_Record_Prior_Ntm",
    "Gluco_Record_During_Rx",
    "Dexa_During_Rx",
    "Frag_Frac_Prior_Ntm",
    "Frag_Frac_During_Rx",
    "Adherent_Flag",
    "Race",
    "Ethnicity",
    "Region",
    "Age_Bucket",
    "Ntm_Specialist_Flag",
    "Ntm_Speciality_Bucket",
    "Risk_Segment_Prior_Ntm",
    "Tscore_Bucket_Prior_Ntm",
    "Idn_Indicator",
    "Injectable_Experience_During_Rx",
    # Comorbidity columns
    "Comorb_Personal_History_Of_Other_Diseases_And_Conditions",
    "Comorb_Encounter_For_Screening_For_Malignant_Neoplasms",
    "Comorb_Encounter_For_Immunization",
    "Comorb_Encntr_For_General_Exam_W_O_Complaint,_Susp_Or_Reprtd_Dx",
    "Comorb_Vitamin_D_Deficiency",
    "Comorb_Other_Joint_Disorder_Not_Elsewhere_Classified",
    "Comorb_Encntr_For_Oth_Sp_Exam_W_O_Complaint_Suspected_Or_Reprtd_Dx",
    "Comorb_Long_Term_Current_Drug_Therapy",
    "Comorb_Dorsalgia",
    "Comorb_Other_Disorders_Of_Bone_Density_And_Structure",
    "Comorb_Disorders_of_lipoprotein_metabolism_and_other_lipidemias",
    "Comorb_Osteoporosis_without_current_pathological_fracture",
    "Comorb_Personal_history_of_malignant_neoplasm",
    "Comorb_Gastro_esophageal_reflux_disease",
    # Concomitant-medication columns
    "Concom_Cephalosporins",
    "Concom_Cholesterol_And_Triglyceride_Regulating_Preparations",
    "Concom_Narcotics",
    "Concom_Systemic_Corticosteroids_Plain",
    "Concom_Anti_Depressants_And_Mood_Stabilisers",
    "Concom_Fluoroquinolones",
    "Concom_Macrolides_And_Similar_Types",
    "Concom_Broad_Spectrum_Penicillins",
    "Concom_Anaesthetics_General",
    "Concom_Viral_Vaccines",
    # Risk-factor columns
    "Risk_Type_1_Insulin_Dependent_Diabetes",
    "Risk_Osteogenesis_Imperfecta",
    "Risk_Rheumatoid_Arthritis",
    "Risk_Untreated_Chronic_Hypogonadism",
    "Risk_Untreated_Chronic_Hyperthyroidism",
    "Risk_Untreated_Early_Menopause",
    "Risk_Patient_Parent_Fractured_Their_Hip",
    "Risk_Smoking_Tobacco",
    "Risk_Chronic_Malnutrition_Or_Malabsorption",
    "Risk_Chronic_Liver_Disease",
    "Risk_Family_History_Of_Osteoporosis",
    "Risk_Low_Calcium_Intake",
    "Risk_Vitamin_D_Insufficiency",
    "Risk_Excessive_Thinness",
    "Risk_Poor_Health_Frailty",
    "Risk_Hysterectomy_Oophorectomy",
    "Risk_Estrogen_Deficiency",
    "Risk_Immobilization",
    "Risk_Recurring_Falls",
]

In [20]:
from feature_engine.encoding import OneHotEncoder

# Dummy-encode every column listed in `cols`. The categories are learned from
# the training split only; drop_last=True leaves out one dummy per variable.
encoder = OneHotEncoder(drop_last=True, variables=cols)
encoder.fit(data_obj_train)

# Both splits go through the same fitted encoder.
data_obj_train = encoder.transform(data_obj_train)
data_obj_test = encoder.transform(data_obj_test)

####   We one-hot encode the Ntm_Speciality variable, which turns the 'Unknown' category into its own dummy column. For rows where that column equals 1, we set all other speciality dummy columns to np.nan so that they can be imputed later

In [21]:
def _blank_unknown_speciality(frame, dummy_cols, unknown_col):
    """Mark the speciality of 'Unknown' rows as missing.

    Parameters
    ----------
    frame : pandas.DataFrame
        One-hot encoded data containing ``dummy_cols`` and ``unknown_col``.
    dummy_cols : list of str
        Speciality dummy columns other than the 'Unknown' dummy.
    unknown_col : str
        Name of the dummy column that flags an unknown speciality.

    Returns
    -------
    pandas.DataFrame
        A copy of ``frame`` in which every row with ``unknown_col == 1`` has NaN
        in all ``dummy_cols``, and ``unknown_col`` itself is removed.
    """
    frame = frame.copy()
    # Cast first so NaN can be stored without relying on an implicit upcast.
    frame[dummy_cols] = frame[dummy_cols].astype(float)
    frame.loc[frame[unknown_col] == 1, dummy_cols] = np.nan
    return frame.drop(columns=[unknown_col])


# One-hot encode Ntm_Speciality; the categories are learned from the training split only.
encoder = OneHotEncoder(variables=["Ntm_Speciality"], drop_last=True)
encoder.fit(data_obj_train)
data_obj_train = encoder.transform(data_obj_train)
data_obj_test = encoder.transform(data_obj_test)

# Resolve the speciality dummy columns by name rather than by position.
# Positional slices ([72:76] + [78:103]) depend on the column order and leave
# out two positions, although only the 'Unknown' dummy has to be excluded.
unknown_col = "Ntm_Speciality_Unknown"
speciality_cols = [
    f"Ntm_Speciality_{category}"
    for category in encoder.encoder_dict_["Ntm_Speciality"]
    if category != "Unknown"
]

data_obj_train = _blank_unknown_speciality(data_obj_train, speciality_cols, unknown_col)
data_obj_test = _blank_unknown_speciality(data_obj_test, speciality_cols, unknown_col)


### We ordinal encode the following variables so that we can impute them later

In [22]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order: '>-2.5' -> 0, '<=-2.5' -> 1, 'Unknown' -> 2.
# The encoder is fitted on the training split and reused for the test split.
oe = OrdinalEncoder(categories=[['>-2.5', '<=-2.5', 'Unknown']])
data_obj_train[['Tscore_Bucket_During_Rx']] = oe.fit_transform(data_obj_train[['Tscore_Bucket_During_Rx']])
data_obj_test[['Tscore_Bucket_During_Rx']] = oe.transform(data_obj_test[['Tscore_Bucket_During_Rx']])

# The code for 'Unknown' (2) becomes NaN so the value can be imputed later.
data_obj_train['Tscore_Bucket_During_Rx'] = data_obj_train['Tscore_Bucket_During_Rx'].map({0: 0, 1: 1, 2: np.nan})
data_obj_test['Tscore_Bucket_During_Rx'] = data_obj_test['Tscore_Bucket_During_Rx'].map({0: 0, 1: 1, 2: np.nan})

# Remaining (non-missing) codes in the training split.
data_obj_train.groupby(['Tscore_Bucket_During_Rx'])['Tscore_Bucket_During_Rx'].count()

Tscore_Bucket_During_Rx
0.0    711
1.0    805
Name: Tscore_Bucket_During_Rx, dtype: int64

In [23]:
# Explicit category order: 'Worsened' -> 0, 'No change' -> 1, 'Improved' -> 2, 'Unknown' -> 3.
# The encoder is fitted on the training split and reused for the test split.
oe = OrdinalEncoder(categories=[['Worsened', 'No change', 'Improved', 'Unknown']])
data_obj_train[['Change_T_Score']] = oe.fit_transform(data_obj_train[['Change_T_Score']])
data_obj_test[['Change_T_Score']] = oe.transform(data_obj_test[['Change_T_Score']])

# The code for 'Unknown' (3) becomes NaN so the value can be imputed later.
data_obj_train['Change_T_Score'] = data_obj_train['Change_T_Score'].map({0: 0, 1: 1, 2: 2, 3: np.nan})
data_obj_test['Change_T_Score'] = data_obj_test['Change_T_Score'].map({0: 0, 1: 1, 2: 2, 3: np.nan})

# Remaining (non-missing) codes in the training split.
data_obj_train.groupby(['Change_T_Score'])['Change_T_Score'].count()

Change_T_Score
0.0     140
1.0    1299
2.0      77
Name: Change_T_Score, dtype: int64

In [24]:
# Explicit category order: 'VLR_LR' -> 0, 'HR_VHR' -> 1, 'Unknown' -> 2.
# The encoder is fitted on the training split and reused for the test split.
oe = OrdinalEncoder(categories=[['VLR_LR', 'HR_VHR', 'Unknown']])
data_obj_train[['Risk_Segment_During_Rx']] = oe.fit_transform(data_obj_train[['Risk_Segment_During_Rx']])
data_obj_test[['Risk_Segment_During_Rx']] = oe.transform(data_obj_test[['Risk_Segment_During_Rx']])

# The code for 'Unknown' (2) becomes NaN so the value can be imputed later.
data_obj_train['Risk_Segment_During_Rx'] = data_obj_train['Risk_Segment_During_Rx'].map({0: 0, 1: 1, 2: np.nan})
data_obj_test['Risk_Segment_During_Rx'] = data_obj_test['Risk_Segment_During_Rx'].map({0: 0, 1: 1, 2: np.nan})

# Remaining (non-missing) codes in the training split.
data_obj_train.groupby(['Risk_Segment_During_Rx'])['Risk_Segment_During_Rx'].count()

Risk_Segment_During_Rx
0.0    753
1.0    763
Name: Risk_Segment_During_Rx, dtype: int64

## Imputation with the KNNImputer method

In [25]:
from sklearn.impute import KNNImputer

# With n_neighbors=1 each missing entry is taken from the single closest row of
# the data the imputer was fitted on (the training split).
imputer = KNNImputer(n_neighbors=1)

# The imputer returns bare arrays, so the column labels and the saved row
# indices are re-attached while rebuilding each DataFrame.
data_obj_train = pd.DataFrame(
    imputer.fit_transform(data_obj_train),
    columns=data_obj_train.columns,
    index=indices_train,
)
data_obj_test = pd.DataFrame(
    imputer.transform(data_obj_test),
    columns=data_obj_test.columns,
    index=indices_test,
)

## We reorganise our dataset

In [26]:
# Re-join the scaled numeric part and the encoded/imputed categorical part of
# each split column-wise. verify_integrity=True makes concat raise if the two
# parts share a column label; it is now applied to both splits so the training
# matrix gets the same check as the test matrix.
X_train = pd.concat([data_numeric_train, data_obj_train], axis=1, verify_integrity=True)
X_test = pd.concat([data_numeric_test, data_obj_test], axis=1, verify_integrity=True)

In [27]:
# Shapes of the reassembled feature matrices and their targets.
print(f"Train {X_train.shape} {y_train.shape}")
print(f"Test {X_test.shape} {y_test.shape}")

Train (2739, 103) (2739,)
Test (685, 103) (685,)


In [28]:
# Preview the first rows of the reassembled training feature matrix.
X_train.head()

Unnamed: 0,Dexa_Freq_During_Rx,Risk_Segment_During_Rx,Tscore_Bucket_During_Rx,Change_T_Score,Gender_Female,Gluco_Record_Prior_Ntm_Y,Gluco_Record_During_Rx_N,Dexa_During_Rx_Y,Frag_Frac_Prior_Ntm_Y,Frag_Frac_During_Rx_N,...,Ntm_Speciality_PSYCHIATRY AND NEUROLOGY,Ntm_Speciality_PATHOLOGY,Ntm_Speciality_OTOLARYNGOLOGY,Ntm_Speciality_PULMONARY MEDICINE,Ntm_Speciality_PAIN MEDICINE,Ntm_Speciality_GASTROENTEROLOGY,Ntm_Speciality_ORTHOPEDICS,Ntm_Speciality_RADIOLOGY,Ntm_Speciality_PODIATRY,Ntm_Speciality_EMERGENCY MEDICINE
445,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2570,8.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
627,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1785,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
909,3.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Preview the first rows of the reassembled test feature matrix.
X_test.head()

Unnamed: 0,Dexa_Freq_During_Rx,Risk_Segment_During_Rx,Tscore_Bucket_During_Rx,Change_T_Score,Gender_Female,Gluco_Record_Prior_Ntm_Y,Gluco_Record_During_Rx_N,Dexa_During_Rx_Y,Frag_Frac_Prior_Ntm_Y,Frag_Frac_During_Rx_N,...,Ntm_Speciality_PSYCHIATRY AND NEUROLOGY,Ntm_Speciality_PATHOLOGY,Ntm_Speciality_OTOLARYNGOLOGY,Ntm_Speciality_PULMONARY MEDICINE,Ntm_Speciality_PAIN MEDICINE,Ntm_Speciality_GASTROENTEROLOGY,Ntm_Speciality_ORTHOPEDICS,Ntm_Speciality_RADIOLOGY,Ntm_Speciality_PODIATRY,Ntm_Speciality_EMERGENCY MEDICINE
2543,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2243,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1673,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
551,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2433,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
