In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/multiclassificationtask/sample_submission.csv
/kaggle/input/multiclassificationtask/train.csv
/kaggle/input/multiclassificationtask/test.csv


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Dataset

In [4]:
sample_submission_path = "/kaggle/input/multiclassificationtask/sample_submission.csv"
test_path = "/kaggle/input/multiclassificationtask/test.csv"
train_path = "/kaggle/input/multiclassificationtask/train.csv"

sample_submission_df = pd.read_csv(sample_submission_path)
test_df = pd.read_csv(test_path)
train_df = pd.read_csv(train_path)


print("Train data shape:", train_df.shape)
print("Test data shape:", test_df.shape)
print("Sample Submission shape:", sample_submission_df.shape)

Train data shape: (15000, 20)
Test data shape: (10000, 19)
Sample Submission shape: (10000, 4)


# Data Analysis

In [5]:
# Display the first few rows of the training data
print("Train DataFrame Head:")
print(train_df.head())

# Display information about the DataFrame (data types, non-null counts)
print("\nTrain DataFrame Info:")
train_df.info()

# Display descriptive statistics of numerical columns
print("\nTrain DataFrame Description:")
print(train_df.describe())

# Check the distribution of the target variable
print("\nDistribution of 'Status' in Train DataFrame:")
print(train_df['Status'].value_counts())
print("\nProportion of 'Status' in Train DataFrame:")
print(train_df['Status'].value_counts(normalize=True))

Train DataFrame Head:
   id  N_Days             Drug      Age Sex Ascites Hepatomegaly Spiders  \
0   0  2178.0  D-penicillamine  16374.0   F       N            N       N   
1   1  2644.0  D-penicillamine  17774.0   F       N            N       N   
2   2  3069.0          Placebo  17844.0   F       N            N       N   
3   3  2216.0          Placebo  19221.0   F       N            Y       Y   
4   4  2256.0          Placebo  21600.0   F       N            N       N   

  Edema  Bilirubin  Cholesterol  Albumin  Copper  Alk_Phos    SGOT  \
0     N        0.5        263.0     3.20    43.0    1110.0  106.95   
1     N        0.8        280.0     3.60    22.0     678.0   62.00   
2     N        1.1        408.0     4.40    54.0    2108.0  142.60   
3     N        0.8        252.0     3.70    36.0     843.0   55.80   
4     N        4.7        348.0     3.06   464.0     961.0  120.90   

   Tryglicerides  Platelets  Prothrombin  Stage Status  
0           67.0      430.0          9.6   

In [6]:
# Drop the row where 'Status' is 'Y'
train_df = train_df[train_df['Status'] != 'Y'].copy()

# Drop the 'id' column from the training data
train_df = train_df.drop('id', axis=1)

# Confirm the new shape and status distribution
print("Train DataFrame Shape after dropping 'Y' status and 'id' column:", train_df.shape)
print("\nNew Distribution of 'Status' in Train DataFrame:")
print(train_df['Status'].value_counts())
print("\nNew Proportion of 'Status' in Train DataFrame:")
print(train_df['Status'].value_counts(normalize=True))

Train DataFrame Shape after dropping 'Y' status and 'id' column: (14999, 19)

New Distribution of 'Status' in Train DataFrame:
Status
C     10053
D      4565
CL      381
Name: count, dtype: int64

New Proportion of 'Status' in Train DataFrame:
Status
C     0.670245
D     0.304354
CL    0.025402
Name: proportion, dtype: float64


# Handling Missing Values

In [7]:
# Identify categorical and numerical columns with missing values (excluding 'id' and 'Status')
# Re-identifying all column types after dropping 'id' and 'Y' status
categorical_cols = train_df.select_dtypes(include='object').columns.tolist()
# Remove 'Status' from categorical_cols as it's the target
if 'Status' in categorical_cols:
    categorical_cols.remove('Status')

numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("Categorical columns to impute:", [col for col in categorical_cols if train_df[col].isnull().any()])
print("Numerical columns to impute:", [col for col in numerical_cols if train_df[col].isnull().any()])

# Impute categorical columns with 'Missing' category
for col in ['Drug', 'Ascites', 'Hepatomegaly', 'Spiders']: # These were identified with many missing values
    if col in categorical_cols:
        train_df[col].fillna('Missing', inplace=True)
        # Apply the same to test_df if it also has these columns (important for consistency)
        if col in test_df.columns:
            test_df[col].fillna('Missing', inplace=True)


# Impute numerical columns with the median
for col in ['Cholesterol', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']:
    if col in numerical_cols:
        median_val = train_df[col].median()
        train_df[col].fillna(median_val, inplace=True)
        # Apply the same to test_df using the median calculated from train_df
        if col in test_df.columns:
            test_df[col].fillna(median_val, inplace=True)

# Verify that all missing values have been handled
print("\nMissing values after imputation (Train DataFrame):")
print(train_df.isnull().sum())

# Also check for test_df
print("\nMissing values after imputation (Test DataFrame):")
print(test_df.isnull().sum())

Categorical columns to impute: ['Drug', 'Ascites', 'Hepatomegaly', 'Spiders']
Numerical columns to impute: ['Cholesterol', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']

Missing values after imputation (Train DataFrame):
N_Days           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
Status           0
dtype: int64

Missing values after imputation (Test DataFrame):
id               0
N_Days           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Pr

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna('Missing', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna('Missing', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

# Feature Engineering and Transformation

In [8]:
# Convert 'Age' from days to years in both train and test data
train_df['Age'] = train_df['Age'] / 365.25
test_df['Age'] = test_df['Age'] / 365.25

print("\nAge column head after conversion (Train DataFrame):")
print(train_df['Age'].head())
print("\nAge column head after conversion (Test DataFrame):")
print(test_df['Age'].head())

# Store 'id' column from test_df for submission
test_id = test_df['id']

# Drop 'id' column from test_df as it's not a feature for modeling
test_df = test_df.drop('id', axis=1)

print("\nTest DataFrame Shape after dropping 'id' column:", test_df.shape)


# Identify categorical columns for one-hot encoding (excluding 'Status')
# Ensure 'Status' is not in this list, as it's our target.
categorical_features = train_df.select_dtypes(include='object').columns.tolist()
if 'Status' in categorical_features:
    categorical_features.remove('Status')

print("\nCategorical features to encode:", categorical_features)

# Apply One-Hot Encoding to categorical features in both train and test data
# Use pd.get_dummies to convert categorical variables into dummy/indicator variables.
# handle_unknown='ignore' is important for test data to prevent errors if a category appears in test but not train.
train_df = pd.get_dummies(train_df, columns=categorical_features, drop_first=True) # drop_first avoids multicollinearity
test_df = pd.get_dummies(test_df, columns=categorical_features, drop_first=True)

# Align columns - crucial after one-hot encoding, especially if test_df has different categories
# or missing 'Missing' categories that were present in train_df
train_cols = set(train_df.columns)
test_cols = set(test_df.columns)

missing_in_test = list(train_cols - test_cols)
if 'Status' in missing_in_test:
    missing_in_test.remove('Status') # Don't add Status to test_df

for col in missing_in_test:
    if col != 'Status': # Ensure we don't add the target column to test_df
        test_df[col] = 0

missing_in_train = list(test_cols - train_cols)
for col in missing_in_train:
    train_df[col] = 0

# Ensure the order of columns is the same, except for the 'Status' column in train_df
common_cols = list(set(train_df.columns) & set(test_df.columns))
train_df = train_df[common_cols + ['Status']] # Keep Status at the end for clarity
test_df = test_df[common_cols]

print("\nTrain DataFrame head after One-Hot Encoding:")
print(train_df.head())
print("\nTest DataFrame head after One-Hot Encoding:")
print(test_df.head())
print("\nTrain DataFrame shape after One-Hot Encoding:", train_df.shape)
print("Test DataFrame shape after One-Hot Encoding:", test_df.shape)


Age column head after conversion (Train DataFrame):
0    44.829569
1    48.662560
2    48.854209
3    52.624230
4    59.137577
Name: Age, dtype: float64

Age column head after conversion (Test DataFrame):
0    62.001369
1    52.000000
2    43.942505
3    55.726215
4    64.000000
Name: Age, dtype: float64

Test DataFrame Shape after dropping 'id' column: (10000, 18)

Categorical features to encode: ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']

Train DataFrame head after One-Hot Encoding:
   Cholesterol  Stage  Edema_Y    SGOT  Spiders_Y  Tryglicerides  Spiders_N  \
0        263.0    3.0    False  106.95      False           67.0       True   
1        280.0    3.0    False   62.00      False           80.0       True   
2        408.0    3.0    False  142.60      False          137.0       True   
3        252.0    4.0    False   55.80       True           56.0      False   
4        348.0    2.0    False  120.90      False          146.0       True   

   Drug_Placeb

# Model Training and Evaluation 

In [9]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import log_loss
import lightgbm as lgb
import numpy as np

# Separate features (X) and target (y)
X = train_df.drop('Status', axis=1)
y = train_df['Status']

# Encode the target variable
# LightGBM can handle categorical targets, but it's often better to encode them numerically for consistency
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Print the mapping of original status to encoded integers
print("\nStatus Label Encoding Mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"{label}: {i}")

# Scale numerical features (important for many models, though LightGBM is less sensitive)
# Identify numerical columns in X
numerical_features_for_scaling = X.select_dtypes(include=['float64', 'int64', 'uint8']).columns.tolist() # uint8 for dummy variables

# Remove one-hot encoded columns (which are binary and don't need scaling in this context)
# We can identify them by checking if they contain only 0s and 1s
binary_cols = [col for col in numerical_features_for_scaling if X[col].nunique() == 2 and set(X[col].unique()).issubset({0, 1})]
numerical_features_to_scale = [col for col in numerical_features_for_scaling if col not in binary_cols]


scaler = StandardScaler()
X[numerical_features_to_scale] = scaler.fit_transform(X[numerical_features_to_scale])
test_df[numerical_features_to_scale] = scaler.transform(test_df[numerical_features_to_scale])

print("\nFeatures after scaling (Train DataFrame head):")
print(X.head())
print("\nFeatures after scaling (Test DataFrame head):")
print(test_df.head())


# Model Training with Stratified K-Fold Cross-Validation
# StratifiedKFold ensures that each fold has roughly the same proportion of target labels as the whole dataset.
NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

oof_preds = np.zeros((len(X), len(label_encoder.classes_)))
sub_preds = np.zeros((len(test_df), len(label_encoder.classes_)))

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y_encoded)):
    X_train, y_train = X.iloc[train_idx], y_encoded[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y_encoded[valid_idx]

    lgb_params = {
        'objective': 'multiclass',
        'num_class': len(label_encoder.classes_),
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'n_estimators': 1000, # Increased n_estimators, will use early stopping
        'learning_rate': 0.03,
        'num_leaves': 20,
        'max_depth': 5,
        'seed': 42,
        'n_jobs': -1,
        'verbose': -1,
        'colsample_bytree': 0.7,
        'subsample': 0.7,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'random_state': 42,
    }

    model = lgb.LGBMClassifier(**lgb_params)

    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              callbacks=[lgb.early_stopping(100, verbose=False)]) # Early stopping if validation metric doesn't improve for 100 rounds

    oof_preds[valid_idx] = model.predict_proba(X_valid)
    sub_preds += model.predict_proba(test_df) / folds.n_splits

print(f"\nOverall OOF LogLoss: {log_loss(y_encoded, oof_preds)}")

# Create the submission file
submission_df = pd.DataFrame({'id': test_id})
# Map encoded predictions back to original status labels for submission
submission_df['Status_C'] = sub_preds[:, label_encoder.transform(['C'])[0]]
submission_df['Status_CL'] = sub_preds[:, label_encoder.transform(['CL'])[0]]
submission_df['Status_D'] = sub_preds[:, label_encoder.transform(['D'])[0]]

# Ensure all columns are present even if a class wasn't predicted (though unlikely with proper training)
# It's good practice to ensure all required output columns are there
for col in ['Status_C', 'Status_CL', 'Status_D']:
    if col not in submission_df.columns:
        submission_df[col] = 0.0 # Fill with 0.0 if somehow missing

submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file created: submission.csv")
print(submission_df.head())


Status Label Encoding Mapping:
C: 0
CL: 1
D: 2

Features after scaling (Train DataFrame head):
   Cholesterol     Stage  Edema_Y      SGOT  Spiders_Y  Tryglicerides  \
0    -0.316911 -0.028390    False  0.099439      False      -1.008015   
1    -0.178669 -0.028390    False -0.939914      False      -0.658653   
2     0.862212 -0.028390    False  0.923753      False       0.873168   
3    -0.406362  1.119365    False -1.083273       True      -1.303630   
4     0.374299 -1.176144    False  0.421996      False       1.115034   

   Spiders_N  Drug_Placebo    Copper       Age  ...  Edema_S    N_Days  \
0       True         False -0.388831 -0.770118  ...    False  0.154784   
1       True         False -0.755908 -0.401461  ...    False  0.504209   
2       True          True -0.196553 -0.383028  ...    False  0.822891   
3      False          True -0.511190 -0.020427  ...    False  0.183278   
4       True          True  6.970182  0.606026  ...    False  0.213271   

   Bilirubin  Ascite