In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the dataset
data = pd.read_csv('imputed_data.csv')

# Explore the data
print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Columns: 1144 entries, PFS_I_EVENT to wavelet-LLL_gldm_SmallDependenceLowGrayLevelEmphasis
dtypes: float64(1141), int64(3)
memory usage: 1.9 MB
None


In [5]:
# Function to identify categorical variables with values 0 or 1
def find_binary_categorical_vars(df):
    binary_categorical_vars = []
    for col in df.columns:
        if df[col].dropna().isin([0, 1]).all():
            binary_categorical_vars.append(col)
    return binary_categorical_vars

# Identify binary categorical variables
binary_categorical_vars = find_binary_categorical_vars(data)

print("Categorical variables with values 0 or 1 are:", binary_categorical_vars)

Categorical variables with values 0 or 1 are: ['PFS_I_EVENT', 'OS_EVENT', 'SEX']


Diagnosi di mieloma:
CREATININA > 2 
Emoglobina < 10.5
Plt (piastrine) <130 
PC_TOT (plasma cellule) >5/10%, nei casi avanzati >60%
Calcio >11

In [7]:
data['CREATININE'] = pd.to_numeric(data['CREATININE'], errors='coerce')

# Count values higher than 50
higher_than_2 = (data['CREATININE'] > 2).sum()

# Count values lower than 50
lower_than_2 = (data['CREATININE'] < 2).sum()

# Count NaN values
nan_values = data['CREATININE'].isna().sum()

mean=data['CREATININE'].mean()
mass=data['CREATININE'].max()
min=data['CREATININE'].min()

print(f"Values higher than 2: {higher_than_2}")
print(f"Values lower than 2: {lower_than_2}")
print(f"NaN values: {nan_values}")
print(f"Mean values: {mean}")
print(f"Max values: {mass}")
print(f"Min values: {min}")

Values higher than 2: 24
Values lower than 2: 197
NaN values: 0
Mean values: 1.471899046863711
Max values: 17.2
Min values: 0.47


In [19]:
data['PLT'] = pd.to_numeric(data['PLT'], errors='coerce')

# Count values higher than 50
higher_than = (data['PLT'] > 130).sum()

# Count values lower than 50
lower_than = (data['PLT'] < 130).sum()

# Count NaN values
nan_values = data['PLT'].isna().sum()

mean=data['PLT'].mean()
mass=data['PLT'].max()
min=data['PLT'].min()

print(f"Values higher than 150: {higher_than}")
print(f"Values lower than 150: {lower_than}")
print(f"NaN values: {nan_values}")
print(f"Mean values: {mean}")
print(f"Max values: {mass}")
print(f"Min values: {min}")

Values higher than 150: 199
Values lower than 150: 20
NaN values: 0
Mean values: 228.11433887183045
Max values: 536.0
Min values: 35.0


In [14]:
data['HB'] = pd.to_numeric(data['HB'], errors='coerce')

# Count values higher than 50
higher_than = (data['HB'] > 10).sum()

# Count values lower than 50
lower_than = (data['HB'] < 10).sum()

# Count NaN values
nan_values = data['HB'].isna().sum()

mean=data['HB'].mean()
mass=data['HB'].max()
min=data['HB'].min()

print(f"Values higher than 10: {higher_than}")
print(f"Values lower than 10: {lower_than}")
print(f"NaN values: {nan_values}")
print(f"Mean values: {mean}")
print(f"Max values: {mass}")
print(f"Min values: {min}")

Values higher than 10: 148
Values lower than 10: 71
NaN values: 0
Mean values: 11.042403265402262
Max values: 15.7
Min values: 6.6


In [18]:
data['PC_TOT'] = pd.to_numeric(data['PC_TOT'], errors='coerce')

# Count values higher than 50
higher_than = (data['PC_TOT'] >=60).sum()

# Count values lower than 50
lower_than = (data['PC_TOT'] < 60).sum()

# Count NaN values
nan_values = data['PC_TOT'].isna().sum()

mean=data['PC_TOT'].mean()
mass=data['PC_TOT'].max()
min=data['PC_TOT'].min()

print(f"Values higher than 60: {higher_than}")
print(f"Values lower than 60: {lower_than}")
print(f"NaN values: {nan_values}")
print(f"Mean values: {mean}")
print(f"Max values: {mass}")
print(f"Min values: {min}")

Values higher than 60: 81
Values lower than 60: 140
NaN values: 0
Mean values: 49.737244412442166
Max values: 100.0
Min values: 5.0


In [16]:
data['CALCIUM'] = pd.to_numeric(data['CALCIUM'], errors='coerce')

# Count values higher than 50
higher_than = (data['CALCIUM'] > 11).sum()

# Count values lower than 50
lower_than = (data['CALCIUM'] < 11).sum()

# Count NaN values
nan_values = data['CALCIUM'].isna().sum()

mean=data['CALCIUM'].mean()
mass=data['CALCIUM'].max()
min=data['CALCIUM'].min()

print(f"Values higher than 11: {higher_than}")
print(f"Values lower than 11: {lower_than}")
print(f"NaN values: {nan_values}")
print(f"Mean values: {mean}")
print(f"Max values: {mass}")
print(f"Min values: {min}")

Values higher than 11: 16
Values lower than 11: 203
NaN values: 0
Mean values: 9.714010592138946
Max values: 16.1
Min values: 7.5


In [20]:
data['CREATININE'] = (data['CREATININE'] > 2).astype(int)
data['HB'] = (data['HB'] < 10.5).astype(int)
data['PLT'] = (data['PLT'] < 130).astype(int)
data['CALCIUM'] = (data['CALCIUM'] > 11).astype(int)
data['PC_TOT'] = (data['PC_TOT'] > 60).astype(int)

Tutti i valori maggiori di 2 diventino 1
Tutti i valori minori o uguali a 2 diventino 0
Quindi 1 diagnosi di mieloma, 0 non diagnosi 

In [21]:
print(data['CREATININE'])

0      1
1      0
2      0
3      0
4      0
      ..
216    0
217    1
218    0
219    0
220    0
Name: CREATININE, Length: 221, dtype: int32


In [24]:
data.to_csv('binarized_data.csv', index=False)

In [25]:
# Load the dataset
data_1 = pd.read_csv('binarized_data.csv')