# Data overview

In [32]:
# Import necessary libraries
import pandas as pd

# Load the dataset
file_path = '/content/train.csv'
df = pd.read_csv(file_path)

# Display basic information and a preview of the data for EDA
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28800 entries, 0 to 28799
Data columns (total 60 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      28800 non-null  int64  
 1   dri_score               28646 non-null  object 
 2   psych_disturb           26738 non-null  object 
 3   cyto_score              20732 non-null  object 
 4   diabetes                26681 non-null  object 
 5   hla_match_c_high        24180 non-null  float64
 6   hla_high_res_8          22971 non-null  float64
 7   tbi_status              28800 non-null  object 
 8   arrhythmia              26598 non-null  object 
 9   hla_low_res_6           25530 non-null  float64
 10  graft_type              28800 non-null  object 
 11  vent_hist               28541 non-null  object 
 12  renal_issue             26885 non-null  object 
 13  pulm_severe             26665 non-null  object 
 14  prim_disease_hct        28800 non-null

In [33]:
# Summary statistics for numerical and categorical columns
df.describe(include='all')

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
count,28800.0,28646,26738,20732,26681,24180.0,22971.0,28800,26598,25530.0,...,17404,28642,27395,25147.0,26258,25448.0,26753,23736.0,28800.0,28800.0
unique,,11,3,7,3,,,8,3,,...,4,3,2,,3,,3,,,
top,,Intermediate,No,Poor,No,,,No TBI,No,,...,Permissive mismatched,Related,"N/A, Mel not given",,No,,No,,,
freq,,10436,23005,8802,22201,,,18861,25203,,...,12936,16208,20135,,24592,,21338,,,
mean,14399.5,,,,,1.764516,6.876801,,,5.143322,...,,,,6.903448,,1.707128,,8.664687,0.539306,23.237678
std,8313.988213,,,,,0.431941,1.564313,,,1.207757,...,,,,1.565017,,0.461179,,1.882746,0.498461,24.799748
min,0.0,,,,,0.0,2.0,,,2.0,...,,,,2.0,,0.0,,4.0,0.0,0.333
25%,7199.75,,,,,2.0,6.0,,,4.0,...,,,,6.0,,1.0,,7.0,0.0,5.61975
50%,14399.5,,,,,2.0,8.0,,,6.0,...,,,,8.0,,2.0,,10.0,1.0,9.7965
75%,21599.25,,,,,2.0,8.0,,,6.0,...,,,,8.0,,2.0,,10.0,1.0,35.1


In [34]:
df.isnull().sum()

Unnamed: 0,0
ID,0
dri_score,154
psych_disturb,2062
cyto_score,8068
diabetes,2119
hla_match_c_high,4620
hla_high_res_8,5829
tbi_status,0
arrhythmia,2202
hla_low_res_6,3270


In [35]:
df.head()  # First few rows of the dataset

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,...,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793
3,3,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349
4,4,High,No,,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223


# Data Cleaning and Encoding


In [36]:
# Check for missing values in the dataset
missing_values = df.isnull().sum()

# Handle missing values (imputation strategy: mean for numeric, mode for categorical)
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].fillna(df[column].mode()[0])  # Fill with mode for categorical
    else:
        df[column] = df[column].fillna(df[column].mean())  # Fill with mean for numerical

# mode:  replaces missing values with the most frequent value of the feature
# mean: replaces missing values with the average value of the entire column of the feature

missing_values

Unnamed: 0,0
ID,0
dri_score,154
psych_disturb,2062
cyto_score,8068
diabetes,2119
hla_match_c_high,4620
hla_high_res_8,5829
tbi_status,0
arrhythmia,2202
hla_low_res_6,3270


In [37]:
#Seperate Categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Encoding categorical variables using one-hot encoding
encoded_df = pd.get_dummies(df, columns = categorical_columns, drop_first=True)

# Ensure all boolean values are converted to integers (1/0)
encoded_df = encoded_df.astype(int)

# Final check for missing values and dataset preview
encoded_df.head()


Unnamed: 0,ID,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,...,tce_div_match_GvH non-permissive,tce_div_match_HvG non-permissive,tce_div_match_Permissive mismatched,donor_related_Related,donor_related_Unrelated,"melphalan_dose_N/A, Mel not given",cardiac_Not done,cardiac_Yes,pulm_moderate_Not done,pulm_moderate_Yes
0,0,1,6,6,6,8,2,6,2,2,...,0,0,1,0,1,1,0,0,0,0
1,1,2,8,6,6,10,2,6,2,2,...,0,0,1,1,0,1,0,0,0,1
2,2,2,8,6,6,10,2,6,2,2,...,0,0,1,1,0,1,0,0,0,0
3,3,2,8,6,6,10,2,6,2,2,...,0,0,1,0,1,1,0,0,0,0
4,4,2,8,6,6,10,2,5,2,2,...,0,0,1,1,0,0,0,0,0,0


In [38]:
with pd.option_context('display.max_columns', None):
    print(encoded_df.head())

   ID  hla_match_c_high  hla_high_res_8  hla_low_res_6  hla_high_res_6  \
0   0                 1               6              6               6   
1   1                 2               8              6               6   
2   2                 2               8              6               6   
3   3                 2               8              6               6   
4   4                 2               8              6               6   

   hla_high_res_10  hla_match_dqb1_high  hla_nmdp_6  hla_match_c_low  \
0                8                    2           6                2   
1               10                    2           6                2   
2               10                    2           6                2   
3               10                    2           6                2   
4               10                    2           5                2   

   hla_match_drb1_low  hla_match_dqb1_low  year_hct  hla_match_a_high  \
0                   2                   2      20