In [5]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder





# Identify numerical and categorical columns
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns

# Handle missing values for numeric columns
num_imputer = SimpleImputer(strategy='mean')
X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

# Handle missing values for categorical columns (if any)
cat_imputer = SimpleImputer(strategy='most_frequent')
X_train[cat_cols] = cat_imputer.fit_transform(X_train[cat_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

# Label Encoding for categorical columns
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])  # Assumes test data has no unseen categories
    label_encoders[col] = le

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Final preprocessed outputs
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)

ValueError: at least one array or dtype is required

In [15]:
# Load the datasets
train_df = pd.read_csv('Paitients_Files_Train.csv')
test_df = pd.read_csv('Paitients_Files_Test.csv')





Test Data Column Types:

ID            object
PRG            int64
PL             int64
PR             int64
SK             int64
TS             int64
M11          float64
BD2          float64
Age            int64
Insurance      int64
dtype: object


In [9]:
# Drop ID column from both
train_df = train_df.drop(columns=['ID'])
test_df = test_df.drop(columns=['ID'])

# Separate features and target (assuming 'SepsisLabel' is the target)
X_train = train_df.drop(columns=['Sepssis'])
y_train = train_df['Sepssis']

# Save test features (assuming test set does not have labels)
X_test = test_df.copy()

In [19]:
print("Training Data Column Types:\n")
print(train_df.dtypes)

print("\n" + "="*50 + "\n")

# Display datatypes of all columns in the test dataset
print("Test Data Column Types:\n")
print(test_df.dtypes)

Training Data Column Types:

ID            object
PRG            int64
PL             int64
PR             int64
SK             int64
TS             int64
M11          float64
BD2          float64
Age            int64
Insurance      int64
Sepssis       object
dtype: object


Test Data Column Types:

ID            object
PRG            int64
PL             int64
PR             int64
SK             int64
TS             int64
M11          float64
BD2          float64
Age            int64
Insurance      int64
dtype: object


In [13]:
print("Missing values in Train Data:\n")
print(train_df.isnull().sum())

print("\n" + "="*50 + "\n")

# Count missing values in each column for test data
print("Missing values in Test Data:\n")
print(test_df.isnull().sum())


Missing values in Train Data:

PRG          0
PL           0
PR           0
SK           0
TS           0
M11          0
BD2          0
Age          0
Insurance    0
Sepssis      0
dtype: int64


Missing values in Test Data:

PRG          0
PL           0
PR           0
SK           0
TS           0
M11          0
BD2          0
Age          0
Insurance    0
dtype: int64


In [25]:
# Display unique values before mapping
print("Before Mapping:", train_df['Sepssis'].unique())

# Map 'positive' -> 1, 'negative' -> 0
train_df['Sepssis'] = train_df['Sepssis'].map({'Positive': 1, 'Negative': 0})

# Display unique values after mapping
print("After Mapping:", train_df['Sepssis'].unique())

Before Mapping: ['Positive' 'Negative']
After Mapping: ['Positive' 'Negative']
