In [None]:
import numpy as np
import pandas as pd

In [None]:
# Seed NumPy's global random generator so the random draws made in the
# following cells produce the same values on every fresh run.
np.random.seed(42)


In [None]:
# Synthetic cohort: one value per sample for every measurement.
# The arrays are drawn from NumPy's global RNG in a fixed sequence
# (age, weight, blood pressure, cholesterol, condition); keep that order so
# the seeded stream yields the same data.
n_samples = 1000
age = np.random.randint(low=18, high=90, size=n_samples)  # integers in [18, 90)
weight = np.random.uniform(low=50, high=120, size=n_samples)  # floats, so NaN can be stored later
blood_pressure = np.random.uniform(low=80, high=180, size=n_samples)  # floats
cholesterol = np.random.uniform(low=150, high=300, size=n_samples)  # floats, so NaN can be stored later
condition = np.random.choice([0, 1], size=n_samples)  # binary label

In [None]:
# Simulate missing measurements by blanking out randomly chosen, distinct
# positions: 50 weights first, then 30 cholesterol readings (same draw order
# as before, so the global RNG stream is consumed identically).
weight_gap_idx = np.random.choice(n_samples, size=50, replace=False)
weight[weight_gap_idx] = np.nan
cholesterol_gap_idx = np.random.choice(n_samples, size=30, replace=False)
cholesterol[cholesterol_gap_idx] = np.nan


In [None]:
# Assemble the generated arrays into a single table. The name list fixes the
# column order; each name is paired with the array at the same position.
column_names = ['Age', 'Weight', 'BloodPressure', 'Cholesterol', 'Condition']
column_values = [age, weight, blood_pressure, cholesterol, condition]
data = pd.DataFrame(dict(zip(column_names, column_values)))


In [None]:
# Quick look at the raw table: first rows, then the NaN count per column.
raw_preview = data.head()
print(raw_preview)
missing_per_column = data.isnull().sum()
print("Missing values in each column:\n", missing_per_column)


   Age      Weight  BloodPressure  Cholesterol  Condition
0   86  112.078863     120.738228   252.409095          0
1   75  112.219857      97.740947   255.113800          0
2   61  102.201361      90.963628   259.000570          1
3   73   52.460989     136.703278   291.390405          1
4   60   90.254929     172.833577   274.828673          1
Missing values in each column:
 Age               0
Weight           50
BloodPressure     0
Cholesterol      30
Condition         0
dtype: int64


In [None]:
# Mean-impute the two columns that contain missing values.
# The filled Series is assigned back to the frame rather than calling
# `data[col].fillna(..., inplace=True)`: that form mutates a temporary column
# selection (chained assignment), which emits a FutureWarning in pandas 2.x
# and leaves `data` unchanged once Copy-on-Write is the default (pandas 3.0).
for column in ('Weight', 'Cholesterol'):
    data[column] = data[column].fillna(data[column].mean())

In [None]:
# Re-count NaNs per column to confirm the imputation left none behind.
missing_after_imputation = data.isnull().sum()
print("Missing values after imputation:\n", missing_after_imputation)


Missing values after imputation:
 Age              0
Weight           0
BloodPressure    0
Cholesterol      0
Condition        0
dtype: int64


In [None]:
def remove_outliers(df, column, multiplier=1.5):
    """Drop rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``. Values equal to a fence are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    column : str
        Name of the numeric column used to decide which rows to keep.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        previously hard-coded 1.5 rule.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the rows inside the fences, with the
        original index labels preserved. Rows where ``column`` is NaN are
        dropped, because NaN fails both comparisons.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    within_fences = (df[column] >= lower_bound) & (df[column] <= upper_bound)
    # Return an independent copy so callers can assign to columns of the
    # result without pandas treating it as a slice of `df`.
    return df[within_fences].copy()

In [None]:
# Filter outliers one column at a time. The passes are sequential: the
# cholesterol fences are computed on the rows that survived the blood-pressure
# pass.
for outlier_column in ('BloodPressure', 'Cholesterol'):
    data = remove_outliers(data, outlier_column)

In [None]:
# Report (rows, columns) so any rows dropped by the outlier filter are visible.
shape_after_filter = data.shape
print("Data shape after outlier removal:", shape_after_filter)


Data shape after outlier removal: (1000, 5)


In [None]:
from sklearn.preprocessing import StandardScaler


In [None]:
# Label column, and the predictor columns used for scaling and as model input.
target = 'Condition'
features = [
    'Age',
    'Weight',
    'BloodPressure',
    'Cholesterol',
]

In [None]:
# Create the scaler with its default settings; it is fitted in a later cell.
scaler = StandardScaler()


In [None]:
# Fit the scaler on the feature columns and write the scaled values back into
# `data`, overwriting the raw measurements. 'Condition' is not in `features`,
# so the label column is left untouched.
# NOTE(review): the scaler is fitted on every row here, before the train/test
# split performed later in the notebook, so held-out rows contribute to the
# scaling statistics (data leakage). Consider splitting first and fitting the
# scaler on the training rows only.
data[features] = scaler.fit_transform(data[features])


In [None]:
# Show the first rows again to inspect the scaled feature values.
scaled_preview = data.head()
print(scaled_preview)


        Age    Weight  BloodPressure  Cholesterol  Condition
0  1.602997  1.339397      -0.346788     0.683754          0
1  1.069408  1.346518      -1.130588     0.748416          0
2  0.390296  0.840575      -1.361574     0.841336          1
3  0.972392 -1.671354       0.197338     1.615676          1
4  0.341788  0.237271       1.428741     1.219737          1


In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Separate the table into the feature matrix (X) and the label vector (y).
X, y = data[features], data[target]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Report the (rows, columns) of each feature split.
for split_label, split_frame in (
    ("Training set shape:", X_train),
    ("Testing set shape:", X_test),
):
    print(split_label, split_frame.shape)

Training set shape: (800, 4)
Testing set shape: (200, 4)
