In [2]:
import pandas as pd # Library to load and manipulate data and for One-Hot Encoding
import numpy as np # Library to calculate the mean and standard deviation
from sklearn import preprocessing # Library for data preprocessing
from sklearn.model_selection import train_test_split # Module for splitting data into training and testing sets
from sklearn.tree import DecisionTreeClassifier # Module for a classification tree
from sklearn.tree import plot_tree # Module to plot decision tree
from sklearn.metrics import accuracy_score, confusion_matrix # Modules for evaluating the model
import graphviz # Library for rendering DOT format data

In [3]:
# Read the diagnosis records (v2 CSV) into a DataFrame and preview the first rows.
# NOTE(review): the preview shows a leading 'Unnamed: 0' column, which looks like
# a row index written out by an earlier export -- confirm whether it should be
# kept as a column before it is used as a model feature.
df = pd.read_csv(filepath_or_buffer='data/DIAGNOSIS_RECORDv2.csv')
df.head(5)

Unnamed: 0,RecordNumber,Age,Gender,BMI,Asymptomatic,Increased thirst,Polydipsia,Polyuria,lethargy,Weight loss,...,Smokers,Not Balanced diet,First degree relative with diabetes,Dyslipidaemia,Hypertension,Pancreatic damage or surgery,"genetic, haematologic and illness-related factors",Anaemia,CKD,Diagnosis
0,1,30,0,26,1.0,0.0,0.0,0.0,0.0,0.0,...,1,1.0,1.0,1.0,0,0.0,1.0,0.0,1.0,2
1,2,39,0,36,1.0,0.0,0.0,0.0,0.0,0.0,...,1,0.0,0.0,0.0,0,1.0,1.0,1.0,1.0,2
2,20,38,1,40,0.0,1.0,0.0,0.0,0.0,1.0,...,0,1.0,1.0,0.0,0,0.0,0.0,0.0,0.0,2
3,22,37,1,22,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,2
4,23,44,0,34,0.0,0.0,1.0,1.0,0.0,1.0,...,1,1.0,1.0,0.0,0,0.0,0.0,0.0,0.0,2


In [4]:
# Report how many rows (records) the loaded dataset contains.
data_count = len(df)

# Header and count are emitted on separate lines.
print("Dataset size:", data_count, sep="\n")

Dataset size:
4056


In [9]:
# Separate the data into features (X) and target variable (y).
# 'Diagnosis' is the target column; every other column of df is kept as a feature.
X = df.drop('Diagnosis', axis=1)
y = df['Diagnosis']

# Print class distribution before balancing
print("Class distribution before balancing:")
print(y.value_counts())

# Split the (still imbalanced) data into training and testing sets.
# NOTE(review): the undersampling below draws from the full `df`, not from
# X_train / y_train, so the balanced set can contain rows that are also in
# X_test. Confirm which of the two data sets later cells actually train and
# evaluate on before trusting any reported test score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Balance the data by undersampling the majority classes: keep at most
# MAX_PER_CLASS rows of each class. A class with fewer rows is kept whole
# (sampling all of its rows) instead of requesting more rows than exist, which
# would make DataFrame.sample raise ValueError. Classes are processed in sorted
# label order with the same seed for every class, so the result is reproducible.
MAX_PER_CLASS = 528
undersampled_df = pd.concat([
    class_rows.sample(n=min(len(class_rows), MAX_PER_CLASS), random_state=42)
    for class_rows in (df[df['Diagnosis'] == label] for label in sorted(y.unique()))
])

# Separate the undersampled data into features (X_under) and target variable (y_under)
X_under = undersampled_df.drop('Diagnosis', axis=1)
y_under = undersampled_df['Diagnosis']

# Print class distribution after balancing
print("\nClass distribution after balancing:")
print(y_under.value_counts())

# Save the balanced dataset to a new CSV file (row index is not written)
undersampled_df.to_csv('data/DIAGNOSIS_RECORDv3.csv', index=False)

Class distribution before balancing:
Diagnosis
1    1776
4    1440
2     528
3     312
Name: count, dtype: int64

Class distribution after balancing:
Diagnosis
1    528
2    528
4    528
3    312
Name: count, dtype: int64
