In [10]:
# Location of the raw CSV that the loader cell reads.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# only runs where this exact file exists -- consider a path relative to a data folder.
path = r"C:\Users\Data Professor\Desktop\Projects\Kidney Project\Chronic_Kidney_Dsease_data.csv"

In [11]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [12]:
def import_csv(url):
    """
    Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    url : str or path-like
        Location handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.

    Notes
    -----
    As a side effect the global pandas display limits for both columns and
    rows are removed, so frames shown afterwards are never truncated.
    """
    import pandas as pd

    # Switch off display truncation for columns and rows (global setting).
    for option_name in ("display.max_columns", "display.max_rows"):
        pd.set_option(option_name, None)

    return pd.read_csv(url)

# Load the raw data set from the configured path.
df = import_csv(path)

In [13]:
def config_data(data):

    """
    Return a copy of ``data`` with normalised column labels.

    Every column label has its surrounding whitespace stripped and is
    converted to lower case.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column labels are strings. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the same values under the cleaned labels.
    """

    # Copy *before* renaming: assigning to ``data.columns`` directly would
    # rename the caller's frame in place and make the copy pointless.
    cleaned = data.copy()
    cleaned.columns = cleaned.columns.str.strip().str.lower()
    return cleaned
# Re-bind df to the frame with normalised (stripped, lower-case) column labels.
df = config_data(df)


In [14]:
# Display the column labels to confirm how they look after normalisation.
df.columns

Index(['patientid', 'age', 'gender', 'ethnicity', 'socioeconomicstatus',
       'educationlevel', 'bmi', 'smoking', 'alcoholconsumption',
       'physicalactivity', 'dietquality', 'sleepquality',
       'familyhistorykidneydisease', 'familyhistoryhypertension',
       'familyhistorydiabetes', 'previousacutekidneyinjury',
       'urinarytractinfections', 'systolicbp', 'diastolicbp',
       'fastingbloodsugar', 'hba1c', 'serumcreatinine', 'bunlevels', 'gfr',
       'proteininurine', 'acr', 'serumelectrolytessodium',
       'serumelectrolytespotassium', 'serumelectrolytescalcium',
       'serumelectrolytesphosphorus', 'hemoglobinlevels', 'cholesteroltotal',
       'cholesterolldl', 'cholesterolhdl', 'cholesteroltriglycerides',
       'aceinhibitors', 'diuretics', 'nsaidsuse', 'statins',
       'antidiabeticmedications', 'edema', 'fatiguelevels', 'nauseavomiting',
       'musclecramps', 'itching', 'qualityoflifescore', 'heavymetalsexposure',
       'occupationalexposurechemicals', 'waterqu

In [16]:
def inspect(data):

    """
    Print a quick structural overview of a DataFrame.

    Reports, in order: the column labels, the first five rows, the
    (rows, columns) shape, the ``DataFrame.info()`` summary, the columns
    that contain missing values, and the number of duplicated rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """

    print("checking the columns in the dataset")
    print()
    print(data.columns)

    print()
    print('The first five(5) rows in the dataset')
    print(data.head(5))

    print()
    print("The Number of rows and columns in the dataset")
    print(data.shape)
    print()

    print('The structure and datatype of the dataset')

    print()
    # info() writes its own report and returns None, so it must not be
    # wrapped in print() -- that would emit a stray "None" line.
    data.info()
    print()

    # Report missing values once per affected column, naming the column,
    # instead of printing one anonymous line for every column in the frame.
    missing_counts = data.isna().sum()
    missing_counts = missing_counts[missing_counts > 0]
    if missing_counts.empty:
        print('There is no missing value detected')
    else:
        for column, count in missing_counts.items():
            print(f"There are {count} missing values detected in column '{column}'")


    print()
    print(f"There are {data.duplicated().sum()} duplicated rows in the dataset")

    print()

# Print the structural overview of the loaded data set.
inspect(df)
    



checking the columns in the dataset

Index(['patientid', 'age', 'gender', 'ethnicity', 'socioeconomicstatus',
       'educationlevel', 'bmi', 'smoking', 'alcoholconsumption',
       'physicalactivity', 'dietquality', 'sleepquality',
       'familyhistorykidneydisease', 'familyhistoryhypertension',
       'familyhistorydiabetes', 'previousacutekidneyinjury',
       'urinarytractinfections', 'systolicbp', 'diastolicbp',
       'fastingbloodsugar', 'hba1c', 'serumcreatinine', 'bunlevels', 'gfr',
       'proteininurine', 'acr', 'serumelectrolytessodium',
       'serumelectrolytespotassium', 'serumelectrolytescalcium',
       'serumelectrolytesphosphorus', 'hemoglobinlevels', 'cholesteroltotal',
       'cholesterolldl', 'cholesterolhdl', 'cholesteroltriglycerides',
       'aceinhibitors', 'diuretics', 'nsaidsuse', 'statins',
       'antidiabeticmedications', 'edema', 'fatiguelevels', 'nauseavomiting',
       'musclecramps', 'itching', 'qualityoflifescore', 'heavymetalsexposure',
       'occ

In [17]:
# Features: every column except the target and the patient identifier.
x = df.drop(columns=['diagnosis', 'patientid'])
# Target: the diagnosis label.
y = df['diagnosis']

In [19]:
# Check the class balance of the target; the recorded output below shows
# 1524 rows of class 1 against 135 rows of class 0.
y.value_counts()

diagnosis
1    1524
0     135
Name: count, dtype: int64

In [22]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
# Seed both samplers so the rows they drop or duplicate -- and every result
# derived from the resampled sets -- are the same on each run.
# NOTE(review): resampling is applied to the full data set, before any
# train/test split. Consider splitting first and resampling only the training
# part, so the evaluation data keeps the real class ratio.
sampler1 = RandomUnderSampler(random_state=42)
sampler2 = RandomOverSampler(random_state=42)

# x1/y1: majority class reduced to the minority size; x2/y2: minority class
# re-drawn up to the majority size.
resampled_x1, resampled_y1 = sampler1.fit_resample(x,y)
resampled_x2, resampled_y2 = sampler2.fit_resample(x,y)

print(resampled_y1.value_counts())
print(resampled_y2.value_counts())

diagnosis
0    135
1    135
Name: count, dtype: int64
diagnosis
1    1524
0    1524
Name: count, dtype: int64


In [23]:
def train(x, y, test_size=0.25, random_state=42, stratify=None):

    """
    Split features and target into training and test subsets.

    Parameters
    ----------
    x : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Target values aligned with ``x``.
    test_size : float or int, default 0.25
        Share (or absolute number) of samples placed in the test subset.
    random_state : int or None, default 42
        Seed for the shuffling, so the same split is produced on every run.
    stratify : array-like or None, default None
        If given (typically ``y``), the split preserves its class
        proportions in both subsets. ``None`` keeps a plain random split.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """

    from sklearn.model_selection import train_test_split

    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return x_train, x_test, y_train, y_test

# Split the undersampled set (resampled_x1 / resampled_y1); the oversampled
# set (resampled_x2 / resampled_y2) is not used for this split.
x_train,x_test,y_train,y_test = train(resampled_x1, resampled_y1)


In [24]:
# Sanity check: training features and target must have the same number of rows.
x_train.shape, y_train.shape

((202, 52), (202,))

In [25]:
# Sanity check: test features and target must have the same number of rows.
x_test.shape, y_test.shape

((68, 52), (68,))

In [36]:
def process(data):

    """
    Build an unfitted ColumnTransformer for the columns of ``data``.

    * Numeric columns: missing values replaced by the median, then
      standardised.
    * ``object`` / ``category`` columns: missing values replaced by the most
      frequent value, then ordinal-encoded; categories not seen during
      fitting are encoded as -1.
    * Any other column is passed through untouched.

    Only the dtypes of ``data`` are inspected to choose the column lists;
    nothing is fitted here.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns define the numeric / categorical groups.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        The configured, not yet fitted, preprocessing transformer.
    """

    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OrdinalEncoder, StandardScaler

    numeric_columns = data.select_dtypes(include=["number"]).columns
    categorical_columns = data.select_dtypes(include=["object", "category"]).columns

    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]

    # The second step keeps the name 'scaler' although it holds an encoder;
    # the name is part of the transformer's parameter paths, so it is kept.
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('scaler', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ]

    return ColumnTransformer(
        transformers=[
            ('num_pipe', Pipeline(numeric_steps), numeric_columns),
            ('cat_pipe', Pipeline(categorical_steps), categorical_columns),
        ],
        remainder='passthrough',
    )

# Column groups are derived from the dtypes of the training features; the
# returned transformer is still unfitted at this point.
transformer = process(x_train)


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [37]:
# Chain preprocessing and classifier so both are fitted together on whatever
# data is passed to fit().
rf_model = Pipeline([
    ("transformer", transformer),
    # Fixed seed: a random forest draws random samples/features while training,
    # so without it the fitted model and its scores differ from run to run.
    ("random_forest_model", RandomForestClassifier(random_state=42))
])

rf_model

In [40]:
# Fit the preprocessing steps and the forest on the training split only.
rf_model.fit(x_train, y_train)

In [41]:
# Predict labels for the held-out test features with the fitted pipeline.
rf_prediction = rf_model.predict(x_test)

In [43]:
from sklearn.metrics import classification_report, confusion_matrix

In [44]:
# Per-class precision / recall / F1 and the raw confusion matrix for the
# test-set predictions.
report = classification_report(y_test, rf_prediction)
matrix_report = confusion_matrix(y_test, rf_prediction)

# One print call: report, a 100-character rule, then the matrix.
print(report, "-" * 100, matrix_report, sep="\n")

              precision    recall  f1-score   support

           0       0.79      0.66      0.72        35
           1       0.69      0.82      0.75        33

    accuracy                           0.74        68
   macro avg       0.74      0.74      0.73        68
weighted avg       0.74      0.74      0.73        68

----------------------------------------------------------------------------------------------------
[[23 12]
 [ 6 27]]


In [None]:
# NOTE(review): this repeats the fit call from an earlier cell, and the cell
# has no execution count -- it looks like a leftover; consider removing it.
rf_model.fit(x_train, y_train)