In [1]:
import pandas as pd

In [2]:
# loading the data

train_data = pd.read_parquet("train.parquet")
train_data.head()

Unnamed: 0,Patient-Uid,Date,Incident
0,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2019-03-09,PRIMARY_DIAGNOSIS
1,a0dc93f2-1c7c-11ec-9cd2-16262ee38c7f,2015-05-16,PRIMARY_DIAGNOSIS
3,a0dc94c6-1c7c-11ec-a3a0-16262ee38c7f,2018-01-30,SYMPTOM_TYPE_0
4,a0dc950b-1c7c-11ec-b6ec-16262ee38c7f,2015-04-22,DRUG_TYPE_0
8,a0dc9543-1c7c-11ec-bb63-16262ee38c7f,2016-06-18,DRUG_TYPE_1


In [3]:
# sorting ascending 

train_data = train_data.sort_values(['Patient-Uid', 'Date'])
train_data.head()

Unnamed: 0,Patient-Uid,Date,Incident
1750087,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2015-09-22,DRUG_TYPE_7
1473893,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-04-13,SYMPTOM_TYPE_2
1387922,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-05-02,DRUG_TYPE_7
223191,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-11-23,SYMPTOM_TYPE_0
557302,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-11-23,DRUG_TYPE_9


In [4]:
# Create binary labels for training (1 for eligible, 0 for not eligible)

train_data['Target'] = (train_data['Incident'] == 'TARGET DRUG').astype(int)
train_data

Unnamed: 0,Patient-Uid,Date,Incident,Target
1750087,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2015-09-22,DRUG_TYPE_7,0
1473893,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-04-13,SYMPTOM_TYPE_2,0
1387922,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-05-02,DRUG_TYPE_7,0
223191,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-11-23,SYMPTOM_TYPE_0,0
557302,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-11-23,DRUG_TYPE_9,0
...,...,...,...,...
26581536,a0f0d582-1c7c-11ec-a6c1-16262ee38c7f,2020-06-19,DRUG_TYPE_6,0
27737944,a0f0d582-1c7c-11ec-a6c1-16262ee38c7f,2020-07-09,TARGET DRUG,1
20027927,a0f0d582-1c7c-11ec-a6c1-16262ee38c7f,2020-07-10,DRUG_TYPE_1,0
14145873,a0f0d582-1c7c-11ec-a6c1-16262ee38c7f,2020-08-05,TARGET DRUG,1


In [5]:
# Feature engineering: extract time-based features from the date column

train_data['Month'] = train_data['Date'].dt.month
train_data['Year'] = train_data['Date'].dt.year

train_data.head()

Unnamed: 0,Patient-Uid,Date,Incident,Target,Month,Year
1750087,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2015-09-22,DRUG_TYPE_7,0,9,2015
1473893,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-04-13,SYMPTOM_TYPE_2,0,4,2018
1387922,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-05-02,DRUG_TYPE_7,0,5,2018
223191,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-11-23,SYMPTOM_TYPE_0,0,11,2018
557302,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-11-23,DRUG_TYPE_9,0,11,2018


In [6]:
# One-hot encoding categorical features

train_data = pd.get_dummies(train_data, columns=['Incident'])

train_data.head(4)

Unnamed: 0,Patient-Uid,Date,Target,Month,Year,Incident_DRUG_TYPE_0,Incident_DRUG_TYPE_1,Incident_DRUG_TYPE_10,Incident_DRUG_TYPE_11,Incident_DRUG_TYPE_12,...,Incident_SYMPTOM_TYPE_7,Incident_SYMPTOM_TYPE_8,Incident_SYMPTOM_TYPE_9,Incident_TARGET DRUG,Incident_TEST_TYPE_0,Incident_TEST_TYPE_1,Incident_TEST_TYPE_2,Incident_TEST_TYPE_3,Incident_TEST_TYPE_4,Incident_TEST_TYPE_5
1750087,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2015-09-22,0,9,2015,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1473893,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-04-13,0,4,2018,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1387922,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-05-02,0,5,2018,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223191,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2018-11-23,0,11,2018,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# creating features for train test split
X = train_data.drop(['Patient-Uid', 'Date', 'Target','Year','Month'],axis=1)
y = train_data['Target']

In [8]:
# Splitting the data into training

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [9]:
# Balancing minority class using oversampling minority class

from imblearn.over_sampling import SMOTE

smt =SMOTE(random_state = 42)
X_smt, y_smt = smt.fit_resample(X_train, y_train)


In [10]:
# Build a predictive model using a random forest classifier

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_smt, y_smt)
preds = model.predict(X_test)

In [11]:
print(preds)

[0 0 0 ... 0 0 0]


In [12]:
# Evaluate the model

from sklearn.metrics import f1_score

f1=f1_score(y_test, preds)
print("F1-Score on validation set:", f1)

F1-Score on validation set: 1.0


In [13]:
# prediction for Test dataset

test_df = pd.read_parquet("test.parquet")
test_df.head(1)

Unnamed: 0,Patient-Uid,Date,Incident
0,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2016-12-08,SYMPTOM_TYPE_0


In [15]:
# Encoding Incident feature

test_df = pd.get_dummies(test_df, columns=['Incident'])

In [17]:
test_df.head(2)

Unnamed: 0,Patient-Uid,Date,Incident_DRUG_TYPE_0,Incident_DRUG_TYPE_1,Incident_DRUG_TYPE_10,Incident_DRUG_TYPE_11,Incident_DRUG_TYPE_12,Incident_DRUG_TYPE_13,Incident_DRUG_TYPE_14,Incident_DRUG_TYPE_15,...,Incident_SYMPTOM_TYPE_6,Incident_SYMPTOM_TYPE_7,Incident_SYMPTOM_TYPE_8,Incident_SYMPTOM_TYPE_9,Incident_TEST_TYPE_0,Incident_TEST_TYPE_1,Incident_TEST_TYPE_2,Incident_TEST_TYPE_3,Incident_TEST_TYPE_4,Incident_TEST_TYPE_5
0,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2016-12-08,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-10-17,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Adding column features to same column as Train dataset

test_df["Incident_DRUG_TYPE_18"]= 0

In [23]:
test_df["Incident_TARGET DRUG"]= 0

In [28]:
Test_data=test_df.drop(["Patient-Uid","Date"],axis=1)
Test_data.head(2)

Unnamed: 0,Incident_DRUG_TYPE_0,Incident_DRUG_TYPE_1,Incident_DRUG_TYPE_10,Incident_DRUG_TYPE_11,Incident_DRUG_TYPE_12,Incident_DRUG_TYPE_13,Incident_DRUG_TYPE_14,Incident_DRUG_TYPE_15,Incident_DRUG_TYPE_16,Incident_DRUG_TYPE_17,...,Incident_SYMPTOM_TYPE_8,Incident_SYMPTOM_TYPE_9,Incident_TEST_TYPE_0,Incident_TEST_TYPE_1,Incident_TEST_TYPE_2,Incident_TEST_TYPE_3,Incident_TEST_TYPE_4,Incident_TEST_TYPE_5,Incident_TARGET DRUG,Incident_DRUG_TYPE_18
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# test data model building & prediction

preds = model.predict(Test_data)

Feature names must be in the same order as they were in fit.



In [31]:
# prediction values

pred = pd.DataFrame(preds)
pred

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
1065519,0
1065520,0
1065521,0
1065522,0
