In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Step 1: Read the raw data
# 'Training.csv' is resolved relative to the notebook's working directory;
# pass a full path instead if the file lives somewhere else.
df = pd.read_csv('Training.csv')

# First rows of the frame, for a quick visual sanity check.
print("--- Original DataFrame Head ---")
print(df.head())

# Column dtypes and non-null counts (DataFrame.info writes to stdout itself).
print("\n--- Original DataFrame Info ---")
df.info()

print("\n--- Missing Values Before Preprocessing ---")
# Per-column count of missing entries.
missing_values_count = df.isna().sum()
# Report only the columns that actually contain missing entries.
print(missing_values_count.loc[missing_values_count > 0])

--- Original DataFrame Head ---
   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  scurring  \
0       0           0             0        0                 0  ...         0   
1       0           0             0        0                 0  ...         0   
2       0           0             0        0                 0  ...         0   
3       0           0             0        0                 0  ...         0   
4       0           0             0        0                 0  ...         0   

   ski

In [2]:
# Step 2: Data Cleaning (Handling Missing Values and Unnecessary Columns)

# Find the columns in which every value is missing and drop them.
# The 'Unnamed: 133' column is often an artifact from CSV export and contains only NaN values.
#
# The list is derived from the *current* `df` rather than from counts computed
# in an earlier cell, so this cell is safe to re-run: after the first run the
# empty columns are gone and the cell simply reports that nothing is left to drop
# (a stale count Series would name already-removed columns and make drop() raise
# KeyError).
if len(df) > 0:
    cols_to_drop = df.columns[df.isna().all()].tolist()
else:
    # With zero rows every column is vacuously "all missing"; keep the schema.
    cols_to_drop = []

if cols_to_drop:
    print(f"\nDropping columns with all missing values: {cols_to_drop}")
    df = df.drop(columns=cols_to_drop)
else:
    print("\nNo columns with all missing values to drop.")


Dropping columns with all missing values: ['Unnamed: 133']


In [3]:
# Step 3: Separate Features (X) and Target (y)
# 'prognosis' is assumed to be the target variable (the outcome to predict).
# If the target column has a different name, replace 'prognosis' below.
if 'prognosis' not in df.columns:
    print("\nError: 'prognosis' column not found. Please verify the target column name.")
    print("Available columns:", df.columns.tolist())
    # Raise instead of calling exit(): inside a Jupyter kernel exit() shuts the
    # kernel down and throws away all state, and in a plain script it exits with
    # status 0. An exception stops this cell (and any "Run All") with a traceback.
    raise KeyError("prognosis")

X = df.drop(columns='prognosis')  # Features: every column except the target
y = df['prognosis']               # Target
print("\n'prognosis' column identified as target variable (y).")

print("\n--- Features (X) Info After Separating Target ---")
X.info()

# Identify numerical and categorical columns within features (X).
# Only int64/float64 columns count as numerical and only object/bool columns as
# categorical; columns of any other dtype fall into neither group.
numerical_cols_X = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_X = X.select_dtypes(include=['object', 'bool']).columns

print(f"\nFeatures (X) - Numerical columns: {list(numerical_cols_X)}")
print(f"Features (X) - Categorical columns: {list(categorical_cols_X)}")

# Impute any remaining missing values in X (if any exist after dropping empty columns).
# NOTE(review): both imputers are fit on all rows of X, i.e. before the train/test
# split performed later in the notebook, so the fill values are computed from rows
# that end up in the test set as well. Confirm this is acceptable for the analysis.

# Numerical columns: fill with the column mean.
numerical_cols_X_with_missing = X[numerical_cols_X].columns[X[numerical_cols_X].isnull().any()].tolist()
if numerical_cols_X_with_missing:
    numerical_imputer = SimpleImputer(strategy='mean')
    X[numerical_cols_X_with_missing] = numerical_imputer.fit_transform(X[numerical_cols_X_with_missing])
    print(f"Imputed numerical columns in X (mean strategy): {numerical_cols_X_with_missing}")
else:
    print("No missing values in numerical columns of X to impute.")

# Categorical columns: fill with the most frequent value (mode).
categorical_cols_X_with_missing = X[categorical_cols_X].columns[X[categorical_cols_X].isnull().any()].tolist()
if categorical_cols_X_with_missing:
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    X[categorical_cols_X_with_missing] = categorical_imputer.fit_transform(X[categorical_cols_X_with_missing])
    print(f"Imputed categorical columns in X (mode strategy): {categorical_cols_X_with_missing}")
else:
    print("No missing values in categorical columns of X to impute.")

print("\n--- Missing Values in X After Imputation ---")
# Tally the remaining nulls once; an empty Series here means everything was handled.
remaining_missing_X = X.isnull().sum()
print(remaining_missing_X[remaining_missing_X > 0])


'prognosis' column identified as target variable (y).

--- Features (X) Info After Separating Target ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Columns: 132 entries, itching to yellow_crust_ooze
dtypes: int64(132)
memory usage: 5.0 MB

Features (X) - Numerical columns: ['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing', 'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity', 'ulcers_on_tongue', 'muscle_wasting', 'vomiting', 'burning_micturition', 'spotting_ urination', 'fatigue', 'weight_gain', 'anxiety', 'cold_hands_and_feets', 'mood_swings', 'weight_loss', 'restlessness', 'lethargy', 'patches_in_throat', 'irregular_sugar_level', 'cough', 'high_fever', 'sunken_eyes', 'breathlessness', 'sweating', 'dehydration', 'indigestion', 'headache', 'yellowish_skin', 'dark_urine', 'nausea', 'loss_of_appetite', 'pain_behind_the_eyes', 'back_pain', 'constipation', 'abdominal_pain', 'diarrhoea', 'mild_fever', 'yellow_urine', 'yellowing_

In [4]:
# Step 4: Data Normalization/Scaling (for numerical features)
# StandardScaler rescales each column to zero mean and unit standard deviation,
# which puts features that live on different scales onto a common one.
# NOTE(review): the scaler is fit on all rows of X, before the train/test split
# done later in the notebook, so the test rows contribute to the mean/std used
# here. Confirm whether fitting on the training split only is wanted instead.
if numerical_cols_X.empty:
    print("\nNo numerical columns in X to scale.")
else:
    scaler = StandardScaler()
    # Fit and apply in one step; the scaled values replace the original columns.
    X[numerical_cols_X] = scaler.fit_transform(X[numerical_cols_X])
    print("\nNumerical columns in X scaled using StandardScaler.")


Numerical columns in X scaled using StandardScaler.


In [5]:
# Step 5: Encoding Categorical Features (for categorical features in X)
# One-hot encoding replaces each categorical column with one indicator column per
# category so the features can be consumed by ML algorithms.
# When X holds no object/bool columns this step is a no-op.
if categorical_cols_X.empty:
    print("\nNo categorical columns in X to encode.")
else:
    # sparse_output=False yields a dense array that can be wrapped in a DataFrame.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_features = encoder.fit_transform(X[categorical_cols_X])
    encoded_feature_names = encoder.get_feature_names_out(categorical_cols_X)
    # Reuse X's index so the encoded columns line up row-for-row with X.
    encoded_X_df = pd.DataFrame(encoded_features, index=X.index, columns=encoded_feature_names)

    # Swap the original categorical columns for their one-hot encoded counterparts.
    X = pd.concat([X.drop(columns=categorical_cols_X), encoded_X_df], axis=1)
    print("\nCategorical columns in X one-hot encoded.")

# Summary of the fully preprocessed feature matrix.
print("\n--- Features (X) Head After All Preprocessing Steps ---")
print(X.head())
print("\n--- Features (X) Info After All Preprocessing Steps ---")
X.info()

# Summary of the target: first values and per-class counts.
print("\n--- Target (y) Head ---")
print(y.head())
print("\n--- Target (y) Value Counts (Distribution of Classes) ---")
print(y.value_counts())


No categorical columns in X to encode.

--- Features (X) Head After All Preprocessing Steps ---
    itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0  2.501327   2.293369              6.674995             -0.21738  -0.149813   
1 -0.399788   2.293369              6.674995             -0.21738  -0.149813   
2  2.501327  -0.436040              6.674995             -0.21738  -0.149813   
3  2.501327   2.293369             -0.149813             -0.21738  -0.149813   
4  2.501327   2.293369              6.674995             -0.21738  -0.149813   

     chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  \
0 -0.439995   -0.401837      -0.21738 -0.21738         -0.149813  ...   
1 -0.439995   -0.401837      -0.21738 -0.21738         -0.149813  ...   
2 -0.439995   -0.401837      -0.21738 -0.21738         -0.149813  ...   
3 -0.439995   -0.401837      -0.21738 -0.21738         -0.149813  ...   
4 -0.439995   -0.401837      -0.21738 -0.21738         -0

In [6]:
# Step 6: Split the Data into Training and Testing Sets
# Holding out a test set allows the model to be evaluated on data it has not seen.
#   test_size=0.2   -> 20% of the rows go to the test set, 80% to the training set
#   random_state=42 -> the same split is produced on every run
#   stratify=y      -> class proportions of the target are kept the same in both
#                      sets, which matters for imbalanced classification data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("\n--- Data Split Shapes ---")
for split_label, split_part in (
    ("X_train (training features)", X_train),
    ("X_test (testing features)", X_test),
    ("y_train (training target)", y_train),
    ("y_test (testing target)", y_test),
):
    print(f"Shape of {split_label}: {split_part.shape}")

print("\n--- Data Preprocessing Completed Successfully! ---")
print("\nYou now have your preprocessed data ready for building machine learning models:")
for split_summary in (
    "X_train: Training features",
    "X_test: Testing features",
    "y_train: Training target labels",
    "y_test: Testing target labels",
):
    print(f"  - {split_summary}")


--- Data Split Shapes ---
Shape of X_train (training features): (3936, 132)
Shape of X_test (testing features): (984, 132)
Shape of y_train (training target): (3936,)
Shape of y_test (testing target): (984,)

--- Data Preprocessing Completed Successfully! ---

You now have your preprocessed data ready for building machine learning models:
  - X_train: Training features
  - X_test: Testing features
  - y_train: Training target labels
  - y_test: Testing target labels
