Načtení dat

In [2]:
import pandas as pd
data = pd.read_csv("diabetes_prediction_dataset.csv")
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


# Rozdělení na training a testing set

In [3]:
from sklearn.model_selection import train_test_split
X = data.drop('diabetes', axis=1)
y = data['diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set: ", X_train.shape)
print("Testing set: ", X_test.shape)

Training set:  (80000, 8)
Testing set:  (20000, 8)


# Doplnění chybějících pozorování pro smoking_history

In [4]:
print(X_train['smoking_history'].value_counts())
print("celkem: " ,X_train['smoking_history'].shape[0])

smoking_history
No Info        28667
never          28065
former          7510
current         7383
not current     5173
ever            3202
Name: count, dtype: int64
celkem:  80000


In [5]:
import numpy as np
value_counts = X_train[X_train['smoking_history'] != 'No Info']['smoking_history'].value_counts()
total_count = value_counts.sum()
distribution = (X_train['smoking_history'].value_counts() / total_count).drop('No Info')
random= np.random.choice(distribution.index,X_train[X_train['smoking_history'] == 'No Info'].shape[0] ,p=distribution.values)

X_train['smoking_history'].loc[X_train['smoking_history'] == 'No Info'] = random


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  X_train['smoking_history'].loc[X_train['smoking_history'] == 'No Info'] = random
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

In [6]:
print(X_train['smoking_history'].value_counts())
print("celkem: " ,X_train['smoking_history'].shape[0])

smoking_history
never          43735
former         11635
current        11580
not current     8116
ever            4934
Name: count, dtype: int64
celkem:  80000


In [7]:
#Jednodušší doplnění módusem
#X_train['smoking_history'] = X_train['smoking_history'].replace('No Info', str(X_train['smoking_history'].mode()[0]))
#X_train['smoking_history']

# Upsampling

In [8]:
#upsample the testing data to contain more samples with diabetes = 1
from sklearn.utils import resample
X_train['diabetes'] = y_train
#Separate the features and target variable
X,y= X_train.drop('diabetes', axis=1), X_train['diabetes']
#Resampling the minority class (diabetes = 1) by up-sampling
X_up, y_up = resample(X[y==1], y[y==1], replace=True, n_samples=X[y==0].shape[0], random_state=42)
#Combining the upsampled data with the original data of the majority class
X_resampled = pd.concat([X[y==0], X_up])
y_resampled = pd.Series(np.concatenate([y[y==0], y_up]))
print("Počet kde diabetes = 1 po upsamplingu: ", y_resampled.sum()) 

X_train = X_resampled
y_train = y_resampled

Počet kde diabetes = 1 po upsamplingu:  73208


# Odstranění dat

Odstranění záznamů, kde je pohlaví určeno jako Other. Dataset obsahuje těchto instancí jen 16.

In [9]:
print ("Počet pro pohláví 'Other'",X_train[X_train['gender'] == 'Other'].shape[0])
X_train = X_train.drop(X_train[X_train['gender'] == 'Other'].index)

Počet pro pohláví 'Other' 16


# Převedení kategoriálních hodnot na binární

## Gender na binarní reprezentaci

In [10]:
X_train = pd.get_dummies(X_train, columns=['gender'])

X_train

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,gender_Female,gender_Male
75220,73.0,0,0,former,24.77,3.5,80,False,True
44966,38.0,0,0,ever,24.33,4.0,158,True,False
13568,26.0,0,0,not current,18.87,5.0,100,True,False
92727,61.0,1,0,current,22.11,4.5,85,True,False
51349,34.0,0,0,never,19.46,5.7,126,True,False
...,...,...,...,...,...,...,...,...,...
92511,28.0,0,0,ever,40.10,7.5,200,True,False
45312,49.0,1,0,never,25.99,5.7,159,False,True
70088,65.0,0,1,former,28.72,6.6,300,False,True
38192,69.0,0,1,not current,31.32,9.0,160,False,True


## Smoking_history na binární reprezentaci

In [11]:
X_train = pd.get_dummies(X_train, columns=['smoking_history'])

X_train

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Female,gender_Male,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
75220,73.0,0,0,24.77,3.5,80,False,True,False,False,True,False,False
44966,38.0,0,0,24.33,4.0,158,True,False,False,True,False,False,False
13568,26.0,0,0,18.87,5.0,100,True,False,False,False,False,False,True
92727,61.0,1,0,22.11,4.5,85,True,False,True,False,False,False,False
51349,34.0,0,0,19.46,5.7,126,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92511,28.0,0,0,40.10,7.5,200,True,False,False,True,False,False,False
45312,49.0,1,0,25.99,5.7,159,False,True,False,False,False,True,False
70088,65.0,0,1,28.72,6.6,300,False,True,False,False,True,False,False
38192,69.0,0,1,31.32,9.0,160,False,True,False,False,False,False,True


# Normalizace dat

In [12]:
X_train.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level
count,146400.0,146400.0,146400.0,146400.0,146400.0,146400.0
mean,50.643077,0.152985,0.090806,29.432107,6.16871,163.584652
std,21.531668,0.359974,0.287334,7.432243,1.287692,57.088608
min,0.08,0.0,0.0,10.01,3.5,80.0
25%,36.0,0.0,0.0,25.79,5.7,130.0
50%,54.0,0.0,0.0,27.32,6.1,155.0
75%,68.0,0.0,0.0,32.81,6.6,200.0
max,80.0,1.0,1.0,91.82,9.0,300.0


In [13]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']] = scaler.fit_transform(X_train[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']])
X_train.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level
count,146400.0,146400.0,146400.0,146400.0,146400.0,146400.0
mean,0.632671,0.152985,0.090806,0.237405,0.48522,0.37993
std,0.269415,0.359974,0.287334,0.090848,0.234126,0.259494
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.449449,0.0,0.0,0.192886,0.4,0.227273
50%,0.674675,0.0,0.0,0.211588,0.472727,0.340909
75%,0.84985,0.0,0.0,0.278695,0.563636,0.545455
max,1.0,1.0,1.0,1.0,1.0,1.0


# Preprocesing test setu

In [14]:
value_counts = X_test[X_test['smoking_history'] != 'No Info']['smoking_history'].value_counts()
total_count = value_counts.sum()
distribution = (X_test['smoking_history'].value_counts() / total_count).drop('No Info')
random= np.random.choice(distribution.index,X_test[X_test['smoking_history'] == 'No Info'].shape[0] ,p=distribution.values)

X_test['smoking_history'].loc[X_test['smoking_history'] == 'No Info'] = random


X_test = pd.get_dummies(X_test, columns=['smoking_history'])
X_test = X_test.drop(X_test[X_test['gender'] == 'Other'].index)
X_test = pd.get_dummies(X_test, columns=['gender'])
X_test[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']] = scaler.fit_transform(X_test[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']])

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  X_test['smoking_history'].loc[X_test['smoking_history'] == 'No Info'] = random
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

In [15]:
X_test.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level
count,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0
mean,0.518259,0.071207,0.037954,0.200084,0.369493,0.264014
std,0.28267,0.257177,0.191089,0.078621,0.194602,0.184586
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.286787,0.0,0.0,0.155906,0.236364,0.090909
50%,0.524525,0.0,0.0,0.200351,0.418182,0.272727
75%,0.737237,0.0,0.0,0.226082,0.490909,0.359091
max,1.0,1.0,1.0,1.0,1.0,1.0


## Export datasetů pro pozdější práci

In [16]:
X_train.to_csv("X_trainSuper.csv", index=False)
X_test.to_csv("X_testSuper.csv", index=False)
y_train.to_csv("y_trainSuper.csv", index=False)
y_test.to_csv("y_testSuper.csv", index=False)