In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
df=pd.read_csv('dff.csv')
df.head()

Unnamed: 0,Patient_uid,SiteCode,Region,Gender,LastVisit,EducationLevel,Occupation,MaritalStatus,AgeLastVisit,AgeARTStart,...,ArtAdherence,VisitDate,NextAppointmentDate,NextVisit,Status,TotalVisits,IITPercentage,IITLast3Percentage,Viral Load,PHQ_9_rating
0,537FC85BA9ECD3C17D8E91F468A333FA41D424866D747E...,13657,Kisumu,Male,2023-05-08,SECONDARY,OTHER NON-CODED,SINGLE,12.0,5.0,...,good,2023-05-04,2023-05-09,2023-05-08,Transfer Out,7.0,0.0,0.0,540.0,Unscreened
1,DFC18D10FF868F0161C02027ACBB57E9A4139753CA1AF4...,18463,Nairobi,Female,2022-12-14,NONE,Student,SINGLE,1.0,1.0,...,good,2022-11-03,2022-12-01,2022-12-14,Died,5.0,0.0,0.0,216000.0,Unscreened
2,4E607EDF89FB38AB660919E1D18C4CA0A0CD385DB90E1D...,14634,West Pokot,Female,2023-12-07,NONE,Farmer,WIDOWED,38.0,31.0,...,good,2023-06-08,2023-09-06,2023-08-17,In Treatment,6.0,16.666667,0.0,0.0,Unscreened
3,A453D55812C144933C396AF311A8964415245049B927D3...,15156,Nakuru,Female,2023-10-31,PRIMARY,Farmer,MARRIED MONOGAMOUS,38.0,29.0,...,good,2023-04-26,2023-07-26,2023-07-27,In Treatment,4.0,0.0,0.0,0.0,Depression Unlikely
4,7092117F28568B174200EFD899181234C129FB40D1CC83...,12905,Nairobi,Female,2023-09-11,SECONDARY,Trader,UNKNOWN,58.0,51.0,...,good,2022-12-08,2023-06-12,2023-06-12,In Treatment,3.0,0.0,0.0,0.0,Unscreened


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Patient_uid          20000 non-null  object 
 1   SiteCode             20000 non-null  int64  
 2   Region               20000 non-null  object 
 3   Gender               20000 non-null  object 
 4   LastVisit            20000 non-null  object 
 5   EducationLevel       20000 non-null  object 
 6   Occupation           20000 non-null  object 
 7   MaritalStatus        20000 non-null  object 
 8   AgeLastVisit         20000 non-null  float64
 9   AgeARTStart          20000 non-null  float64
 10  StartRegimen         20000 non-null  object 
 11  LastRegimen          20000 non-null  object 
 12  ArtAdherence         20000 non-null  object 
 13  VisitDate            20000 non-null  object 
 14  NextAppointmentDate  20000 non-null  object 
 15  NextVisit            20000 non-null 

In [10]:
# Define the mapping for risk categories
risk_mapping = {
    'In Treatment': 'Low',
    'IIT': 'High',
    'Transfer Out': 'Medium'
}

# Transform the 'Status' column based on the risk mapping
df['Status'] = df['Status'].map(risk_mapping)

# Drop rows where 'Status' is 'Died'
df = df[df['Status'].notnull()]


In [11]:
df.head()

Unnamed: 0,Patient_uid,SiteCode,Region,Gender,LastVisit,EducationLevel,Occupation,MaritalStatus,AgeLastVisit,AgeARTStart,...,ArtAdherence,VisitDate,NextAppointmentDate,NextVisit,Status,TotalVisits,IITPercentage,IITLast3Percentage,Viral Load,PHQ_9_rating
0,537FC85BA9ECD3C17D8E91F468A333FA41D424866D747E...,13657,Kisumu,Male,2023-05-08,SECONDARY,OTHER NON-CODED,SINGLE,12.0,5.0,...,good,2023-05-04,2023-05-09,2023-05-08,Medium,7.0,0.0,0.0,540.0,Unscreened
2,4E607EDF89FB38AB660919E1D18C4CA0A0CD385DB90E1D...,14634,West Pokot,Female,2023-12-07,NONE,Farmer,WIDOWED,38.0,31.0,...,good,2023-06-08,2023-09-06,2023-08-17,Low,6.0,16.666667,0.0,0.0,Unscreened
3,A453D55812C144933C396AF311A8964415245049B927D3...,15156,Nakuru,Female,2023-10-31,PRIMARY,Farmer,MARRIED MONOGAMOUS,38.0,29.0,...,good,2023-04-26,2023-07-26,2023-07-27,Low,4.0,0.0,0.0,0.0,Depression Unlikely
4,7092117F28568B174200EFD899181234C129FB40D1CC83...,12905,Nairobi,Female,2023-09-11,SECONDARY,Trader,UNKNOWN,58.0,51.0,...,good,2022-12-08,2023-06-12,2023-06-12,Low,3.0,0.0,0.0,0.0,Unscreened
5,C0DCF72FC2CC06001C224652195C69584C618FF1B482A9...,10688,Muranga,Male,2022-08-16,SECONDARY,Farmer,MARRIED MONOGAMOUS,30.0,26.0,...,good,2022-05-12,2022-08-02,2022-08-16,Low,3.0,0.0,0.0,0.0,Unscreened


In [13]:
df.Status.unique()

array(['Medium', 'Low', 'High'], dtype=object)

In [14]:
# Drop irrelevant columns from the main dataframe
df.drop(columns=['Patient_uid', 'EducationLevel','Region','SiteCode', 'LastVisit', 'VisitDate', 'NextAppointmentDate', 'NextVisit'], inplace=True)

In [15]:
# Custom Transformer to convert boolean columns to integers
class BooleanConverter(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        boolean_columns = X.select_dtypes(include='bool').columns
        X[boolean_columns] = X[boolean_columns].astype(int)
        return X
from sklearn.preprocessing import LabelEncoder

# label encoder
label_encoder = LabelEncoder()

# Encoding the target variable
df['Status'] = label_encoder.fit_transform(df['Status'])        

In [16]:
# Select columns for one-hot encoding
columns_to_encode = ['Gender', 'Occupation', 'MaritalStatus', 'StartRegimen', 'LastRegimen', 'ArtAdherence', 'PHQ_9_rating']

# Perform one-hot encoding with boolean dtype
df_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)
# Convert boolean values to integers (0 and 1)
df_encoded = df_encoded.astype(int)

In [17]:
df_encoded

Unnamed: 0,AgeLastVisit,AgeARTStart,Status,TotalVisits,IITPercentage,IITLast3Percentage,Viral Load,Gender_Male,Occupation_Driver,Occupation_Employee,...,LastRegimen_TDF+3TC+ATV/r,LastRegimen_TDF+3TC+DTG+DRV/r,LastRegimen_unknown,ArtAdherence_good,ArtAdherence_poor,PHQ_9_rating_Mild Depression,PHQ_9_rating_Moderate Depression,PHQ_9_rating_Moderate Severe Depression,PHQ_9_rating_Severe Depression,PHQ_9_rating_Unscreened
0,12,5,2,7,0,0,540,1,0,0,...,0,0,0,1,0,0,0,0,0,1
2,38,31,1,6,16,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,38,29,1,4,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,58,51,1,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
5,30,26,1,3,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,39,36,1,5,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
19996,31,26,1,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
19997,34,24,1,8,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
19998,58,49,1,2,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
