In [304]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics
import xgboost as xgb

## Importing Data

In [305]:
df = pd.read_csv('./data/train.csv')

In [306]:
df.head()

Unnamed: 0,enc_id,patient_id,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmission_id
0,88346340,2488608,Caucasian,Male,[60-70),,1,2,6,3,...,No,Steady,No,No,No,No,No,Ch,Yes,2
1,92001408,52133202,Caucasian,Male,[70-80),[100-125),2,6,1,7,...,No,No,No,No,No,No,No,No,Yes,1
2,169424316,40945509,Caucasian,Female,[70-80),,3,2,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,1
3,272987082,38850777,Caucasian,Female,[50-60),,1,1,7,1,...,No,No,No,No,No,No,No,No,Yes,2
4,150600612,72738225,Caucasian,Female,[80-90),,1,6,7,6,...,No,Down,No,No,No,No,No,Ch,Yes,2


In [307]:
categorical_col = [col for col in df.columns 
                   if df[col].dtype == 'object' 
                   or df[col].dtype == 'bool' or df[col].dtype == 'category']
numerical_col = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
dropped_cols = set(df.columns) - set(categorical_col) - set(numerical_col)

In [308]:
print(len(df.columns), len(categorical_col) + len(numerical_col))

50 50


In [309]:
print(categorical_col)
print(numerical_col)

['race', 'gender', 'age', 'weight', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']
['enc_id', 'patient_id', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'readmission_id']


In [310]:
def giveNullCounts(df, columns):
    for col in columns:
        if col not in dropped_cols:
            if df[col].isnull().sum() > 0:
                print(col, df[col].isnull().sum())

In [311]:
giveNullCounts(df, numerical_col)

In [312]:
giveNullCounts(df, categorical_col)

race 1621
weight 68986
payer_code 28178
medical_specialty 34930
diag_1 15
diag_2 244
diag_3 989
max_glu_serum 67515
A1Cresult 59356


## Handling Missing Values

In [313]:
# fill race null values with 'Other'
df['race'] = df['race'].replace(np.nan, 'Other')

In [314]:
df.drop(['weight', 'payer_code', 'medical_specialty'], inplace=True, axis=1)
# add these columns to dropped_cols
dropped_cols.update(['weight', 'payer_code', 'medical_specialty'])

In [315]:
giveNullCounts(df, categorical_col)

diag_1 15
diag_2 244
diag_3 989
max_glu_serum 67515
A1Cresult 59356


In [316]:
df.drop(['diag_1', 'diag_2', 'diag_3'], inplace=True, axis=1)
# add these columns to dropped_cols
dropped_cols.update(['diag_1', 'diag_2', 'diag_3'])

In [317]:
df['max_glu_serum'] = df['max_glu_serum'].replace(np.nan, 'None')
df['A1Cresult'] = df['A1Cresult'].replace(np.nan, 'None')

In [318]:
giveNullCounts(df, categorical_col)

## Label Encoding / One-Hot Encoding / Feature Engineering

In [319]:
drugs = ['metformin', 'repaglinide','glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone','insulin','citoglipton','nateglinide','chlorpropamide','acarbose','miglitol','glyburide-metformin', 'tolazamide','metformin-pioglitazone', 'metformin-rosiglitazone', 'glimepiride-pioglitazone', 'glipizide-metformin','troglitazone', 'tolbutamide', 'acetohexamide','examide']

In [320]:
df["number_of_no"] = 0
df["number_of_steady"] = 0
df["number_of_up"] = 0
df["number_of_down"] = 0

for drug in drugs:
    df["number_of_no"] += (df[drug] == 'No').astype(int)
    df["number_of_steady"] += (df[drug] == 'Steady').astype(int)
    df["number_of_up"] += (df[drug] == 'Up').astype(int)
    df["number_of_down"] += (df[drug] == 'Down').astype(int)

In [321]:
df.drop(drugs, inplace=True, axis=1)
# add these columns to dropped_cols
dropped_cols.update(drugs)

In [322]:
# new_cat_cols = categorical_col - dropped_cols
new_cat_cols = list(set(categorical_col) - dropped_cols)
for col in new_cat_cols:
    print(col)

race
change
A1Cresult
diabetesMed
max_glu_serum
age
gender


In [323]:
# drop gender rows with value Unknown/Invalid
df = df[df['gender'] != 'Unknown/Invalid']

In [324]:
# use one-hot encoding from pandas for new_cat_cols
# also drop_first = True to avoid dummy trap
# df = pd.get_dummies(df, columns=new_cat_cols, drop_first=True)
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False, drop='first')  # Use drop='first' to handle multicollinearity

data_to_encode = df[new_cat_cols]
encoded_data = encoder.fit_transform(data_to_encode)
encoded_column_names = encoder.get_feature_names_out(input_features=new_cat_cols)

df = df.drop(columns=new_cat_cols)  # Drop the original columns
df[encoded_column_names] = encoded_data  # Add the one-hot encoded columns



## Feature Engineering For Patient ID

In [325]:
tmp_test_df = pd.read_csv('./data/test.csv')
lst = pd.concat([df['patient_id'], tmp_test_df['patient_id']]).tolist()
pat = {}
for i in lst:
    if i in pat:
        pat[i] += 1
    else:
        pat[i] = 1

In [277]:
# on = [i for i in pat if pat[i] == 1]
# tw = [i for i in pat if pat[i] == 2]
# mrth = [i for i in pat if pat[i] > 2]

In [326]:
def calc_bin(id):
    return pat[id]
    # if id in on:
    #     return 1
    # elif id in tw:
    #     return 2
    # elif id in mrth:
    #     return 3
    # else:
    #     return 0

In [327]:
df['new_patient_id'] = df['patient_id'].apply(lambda x: calc_bin(x)).astype(int)
df['new_patient_id'].value_counts()

new_patient_id
1     38422
2     14541
3      6965
4      3966
5      2503
6      1471
7       993
8       620
9       446
10      308
12      161
11      141
13      126
15       94
20       86
18       78
23       58
14       48
19       41
16       41
22       33
17       31
40       26
28       21
21       13
Name: count, dtype: int64

## Feature Selection

In [328]:
def give_corr_features(corr_matrix: pd.DataFrame, threshold = 0.9):
    corr_features = set()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                corr_features.add(corr_matrix.columns[i])

    return corr_features

In [329]:
give_corr_features(df.corr(), threshold=0.85)
# ! so we don't have any highly correlated features

{'number_of_steady'}

## Outlier Detection

In [330]:
# df.plot(kind='box', subplots=True, layout=(5,4), sharex=False, sharey=False, figsize=(10,10))
# plt.show()

## Trying Things - Can Comment Out If Doesn't work

In [331]:
# interactionTerms = [
#     ('num_medications','time_in_hospital'),
#     ('num_medications','num_procedures'),
#     ('time_in_hospital','num_lab_procedures'),
#     ('num_medications','num_lab_procedures'),
#     ('num_medications','number_diagnoses'),
#     # ('age','number_diagnoses'), # ! age not present
#     # ('change','num_medications'), # ! change not present
#     ('number_diagnoses','time_in_hospital')
# ]

In [332]:
# for inter in interactionTerms:
#     name = inter[0] + '|' + inter[1]
#     df[name] = df[inter[0]] * df[inter[1]]

In [333]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71233 entries, 0 to 71235
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   enc_id                    71233 non-null  int64  
 1   patient_id                71233 non-null  int64  
 2   admission_type_id         71233 non-null  int64  
 3   discharge_disposition_id  71233 non-null  int64  
 4   admission_source_id       71233 non-null  int64  
 5   time_in_hospital          71233 non-null  int64  
 6   num_lab_procedures        71233 non-null  int64  
 7   num_procedures            71233 non-null  int64  
 8   num_medications           71233 non-null  int64  
 9   number_outpatient         71233 non-null  int64  
 10  number_emergency          71233 non-null  int64  
 11  number_inpatient          71233 non-null  int64  
 12  number_diagnoses          71233 non-null  int64  
 13  readmission_id            71233 non-null  int64  
 14  number_of_n

## Splitting Data

In [334]:
from sklearn.model_selection import train_test_split

In [335]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('readmission_id', axis=1), 
    df['readmission_id'], test_size=0.2, random_state=42)

In [336]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(56986, 40) (56986,)
(14247, 40) (14247,)


## Model Building

In [337]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier

In [338]:
# models = [
#     ('random_forest', RandomForestClassifier()),
#     ('gradient_boosting', GradientBoostingClassifier()),
#     ('xgboost', XGBClassifier()),
#     ('catboost', CatBoostClassifier())
# ]

### Creating Pipeline
Right now we don't need to do any preprocessing, but we will create a pipeline for future use.

In [339]:
# from sklearn.pipeline import Pipeline


# def createPipeline(model_name, model) -> Pipeline:
#     """
#     We can add more steps to the pipeline if you want
#     """
#     return Pipeline([
#         (model_name, model)
#     ])

### Model selection using K-Fold Cross Validation
`N_SPLITS`: we will use 5-fold cross-validation. You can try different values for N_SPLITS and see how it affects the results.

**K Fold Validation**: Use StratifiedKFold for classification problems and KFold for regression problems. `N_SPLITS` is the number of folds. shuffle is used to shuffle the data before splitting into batches. random_state is used to set the seed for random shuffling.

In [340]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

In [341]:
# N_SPLITS = 5 # ! number of KFold splits
# stratified_k_fold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

In [342]:
# best_accuracy = float('-inf')
# best_model_pipeline = None
# best_model_name = None

# for model_name, model in models:
#     print("-"*40)
#     print(f"Training MODEL: {model_name}")
#     print("-"*40)

#     accuracy_scores = [] # ! will use this to find average accuracy score on validation set

#     for fold, (train_index, val_index) in enumerate(stratified_k_fold.split(X_train, y_train)):
#         print(f"Fold: {fold+1}/{N_SPLITS}")

#         # ! get training and validation set using the fold indices
#         X_train_fold = X_train.iloc[train_index]
#         X_val_fold = X_train.iloc[val_index]
#         y_train_fold = y_train.iloc[train_index]
#         y_val_fold = y_train.iloc[val_index]


#         # ! pipeline.fit(X, y) -> will first call steps in pipeline and then model.fit(X, y) for us
#         pipeline = createPipeline(model_name, model)

#         # ! fit the pipeline on training data
#         pipeline.fit(X_train_fold, y_train_fold)

#         # ! get predictions on validation set
#         y_pred_val = pipeline.predict(X_val_fold)

#         # ! calculate validation accuracy score
#         accuracy_val = accuracy_score(y_val_fold, y_pred_val)
#         accuracy_scores.append(accuracy_val)

#         print(f"Validation Accuracy Score: {accuracy_val:.4f}")
#         print()

#     average_accuracy_score = np.mean(accuracy_scores)
#     print(f"Average Validation Accuracy Score: {average_accuracy_score:.4f}")

#     # ! select best model based on average validation loss
#     if average_accuracy_score > best_accuracy:
#         best_accuracy = average_accuracy_score
#         best_model_pipeline = pipeline
#         best_model_name = model_name

# print("-"*40)
# print(f"Best Validation Accuracy Score: {best_accuracy:.4f}")
# print(f"Best model: {best_model_name}")
        

### Using GridSearchCV to find the best parameters

In [343]:
# from sklearn.model_selection import GridSearchCV

# # ! we can add more parameters to the grid search
# params = {
#     "max_depth": [300, 500, 700, 800],
#     "max_leaf_nodes": [1000, 1300, 1600]
# }

# grid_search = GridSearchCV(
#     estimator=RandomForestClassifier(),
#     param_grid=params,
#     scoring="accuracy",
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )

# grid_search.fit(X_train, y_train)

In [344]:
# grad_boost = GradientBoostingClassifier()
# grad_boost.fit(X_train, y_train)
# model = grid_search.best_estimator_
# model = RandomForestClassifier()
# model = GradientBoostingClassifier()
model = RandomForestClassifier()
model.fit(X_train, y_train)

### Grouping by Patient ID

## Model Evaluation

In [345]:
y_pred = model.predict(X_test)
print("Accuracy is {0:.2f}".format(accuracy_score(y_test, y_pred)))
print("micro: {:.2f}".format(metrics.precision_score(y_test, y_pred, average='micro')))
print("macro: {:.2f} ".format( metrics.precision_score(y_test, y_pred, average='macro')))
print("weighted: {:.2f} ".format( metrics.precision_score(y_test, y_pred, average='weighted'))) 
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy is 0.72
micro: 0.72
macro: 0.66 
weighted: 0.71 
              precision    recall  f1-score   support

           0       0.55      0.06      0.10      1581
           1       0.63      0.69      0.66      5036
           2       0.79      0.89      0.83      7630

    accuracy                           0.72     14247
   macro avg       0.66      0.54      0.53     14247
weighted avg       0.71      0.72      0.69     14247

[[  87 1168  326]
 [  58 3457 1521]
 [  13  832 6785]]


## Submission File

In [346]:
test_df = pd.read_csv('./data/test.csv')

In [347]:
test_df['race'] = test_df['race'].replace(np.nan, 'Other')
test_df.drop(['weight', 'payer_code', 'medical_specialty'], inplace=True, axis=1)
test_df.drop(['diag_1', 'diag_2', 'diag_3'], inplace=True, axis=1)
test_df['max_glu_serum'] = test_df['max_glu_serum'].replace(np.nan, 'None')
test_df['A1Cresult'] = test_df['A1Cresult'].replace(np.nan, 'None')

In [348]:
test_df["number_of_no"] = 0
test_df["number_of_steady"] = 0
test_df["number_of_up"] = 0
test_df["number_of_down"] = 0

for drug in drugs:
    test_df["number_of_no"] += (test_df[drug] == 'No').astype(int)
    test_df["number_of_steady"] += (test_df[drug] == 'Steady').astype(int)
    test_df["number_of_up"] += (test_df[drug] == 'Up').astype(int)
    test_df["number_of_down"] += (test_df[drug] == 'Down').astype(int)

In [349]:
test_df.drop(drugs, inplace=True, axis=1)

In [350]:
# drop gender rows with value Unknown/Invalid
test_df = test_df[test_df['gender'] != 'Unknown/Invalid']

In [351]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False, drop='first')  # Use drop='first' to handle multicollinearity

data_to_encode = test_df[new_cat_cols]
encoded_data = encoder.fit_transform(data_to_encode)
encoded_column_names = encoder.get_feature_names_out(input_features=new_cat_cols)

test_df = test_df.drop(columns=new_cat_cols)  # Drop the original columns
test_df[encoded_column_names] = encoded_data



### Feature Engineering for Patient ID

In [352]:
# lst = test_df['patient_id'].to_list()
# pat = {}
# for i in lst:
#     if i in pat:
#         pat[i] += 1
#     else:
#         pat[i] = 1
# on = [i for i in pat if pat[i] == 1]
# tw = [i for i in pat if pat[i] == 2]
# mrth = [i for i in pat if pat[i] > 2]
# def calc_bin(id):
#     return pat[id]
    # if id in on:
    #     return 1
    # elif id in tw:
    #     return 2
    # elif id in mrth:
    #     return 3
    # else:
    #     return 0
test_df['new_patient_id'] = test_df['patient_id'].apply(lambda x: calc_bin(x)).astype(int)

In [353]:
# for inter in interactionTerms:
#     name = inter[0] + '|' + inter[1]
#     test_df[name] = test_df[inter[0]] * test_df[inter[1]]

In [354]:
test_pred = model.predict(test_df)

In [None]:
result_df = pd.DataFrame(test_pred)
result_df['readmission_id']=test_pred
result_df['enc_id']=test_df['enc_id']
cvfile = pd.concat((result_df["enc_id"], result_df["readmission_id"]), axis=1)
cvfile.to_csv('finale.csv', index=False)