<a href="https://www.kaggle.com/code/aneeshgrover/thapar-summer-school?scriptVersionId=186571852" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import log_loss

from scipy.stats import randint

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Loading the data

In [None]:
# Read the competition CSVs into DataFrames.
train = pd.read_csv('/kaggle/input/thapar-summer-school-2024/train.csv')
test = pd.read_csv('/kaggle/input/thapar-summer-school-2024/test.csv')
sample_submission = pd.read_csv('/kaggle/input/thapar-summer-school-2024/sample_submission.csv')

# Preview the first rows of each frame under its own heading.
previews = (
    ("\nTrain Data : ", train),
    ("\nTest Data : ", test),
    ("\nSample Submission : ", sample_submission),
)
for heading, frame in previews:
    print(heading)
    print(frame.head())

# Exploring the Data

In [None]:
# Basic sanity checks: frame shapes, per-column missing-value counts, and
# summary statistics of the training data.
print("\nTrain Data Shape", train.shape)
print("Test Data Shape", test.shape)

print("\nMissing Values in Train Data: ")
print(train.isnull().sum())

# Report the test frame here; the training frame is already covered above.
print("\nMissing Values in Test data: ")
print(test.isnull().sum())

print("\nTrain Data Summary")
print(train.describe())

## Visualization

### 1. Histogram

In [None]:
# Histogram of the 'Age' column rescaled by 365.
# NOTE(review): presumably 'Age' is stored in days, hence the conversion to years — confirm.
age_in_years = train['Age'] / 365

plt.figure(figsize=(8, 6))
plt.hist(age_in_years, bins=30, edgecolor='black')
plt.xlabel('Age(Years)')
plt.ylabel('Frequency')
plt.title('Histogram of Age (Years)')
plt.show()

### 2. Count Plots

In [None]:
# Bar chart of how many training rows fall into each 'Sex' category.
plt.figure(figsize=(8, 6))
sns.countplot(x='Sex', data=train)
plt.xlabel('Sex')
plt.ylabel('Count')
plt.title('Count of Male and Female')
# Render explicitly, matching the histogram cell; this also keeps the Text
# object returned by plt.title() from being echoed as the cell output.
plt.show()

# Preprocessing the Data

In [None]:
# Show the column labels of the training frame before any preprocessing.
train_column_index = train.columns
print(train_column_index)

## Drop Unique Columns

In [None]:
# Remove the 'id' column from both frames.
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

## Separate Features and Target

In [None]:
# 'Status' is the prediction target; every other column is a feature.
Y_train = train['Status']
X_train = train.drop(columns=['Status'])

print(X_train.columns)
print(Y_train.head())

## Label Encoding Categorical Values

In [None]:
# Columns that are integer-encoded with a LabelEncoder fitted on the training data.
categorialColumns = ['Drug', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']

# Keep each fitted encoder, keyed by column name.
labelEncoders = {}
for col_name in categorialColumns:
    encoder = LabelEncoder().fit(X_train[col_name])
    X_train[col_name] = encoder.transform(X_train[col_name])
    # The test column reuses the mapping learned from the training column.
    test[col_name] = encoder.transform(test[col_name])
    labelEncoders[col_name] = encoder

print(X_train.head())
print(test.head())

# Encode the target labels as integers with a dedicated encoder.
label_encoder = LabelEncoder()
Y_train_encoded = label_encoder.fit_transform(Y_train)

In [None]:
# One-hot encode the remaining non-numeric columns.
X_train = pd.get_dummies(X_train)
# Dummify test separately, then force it onto the training column set and
# order: a category missing from test becomes an all-zero column, and a
# category seen only in test is dropped because the models never saw it.
test = pd.get_dummies(test).reindex(columns=X_train.columns, fill_value=0)

# Convert the 'Sex' indicator columns to integer codes, using an encoder
# fitted on the training column for both frames.
for column in ['Sex_F', 'Sex_M']:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    test[column] = le.transform(test[column])

## Scaling the data

In [None]:
# Standardise the numeric feature columns. The column list is taken from the
# training frame and reused for the test frame, so the scaler always
# transforms exactly the columns, in the order, it was fitted on. Selecting
# by dtype on each frame separately can produce different lists.
numericalColumnsTrain = X_train.select_dtypes(include=['float64', 'int64']).columns
numericalColumnsTest = numericalColumnsTrain

scaler = StandardScaler()

# Fit on training data only; apply the same statistics to the test data.
X_train[numericalColumnsTrain] = scaler.fit_transform(X_train[numericalColumnsTrain])
test[numericalColumnsTest] = scaler.transform(test[numericalColumnsTest])

print(X_train.head())
print(test.head())

In [None]:
# Hold out 20% of the training rows as a validation split (fixed seed).
X_train_split, X_valid, y_train_split, y_valid = train_test_split(
    X_train,
    Y_train_encoded,
    test_size=0.2,
    random_state=42,
)

# Model Training

## Hyperparameter Tuning

In [None]:
def objective(trial):
    """Optuna objective: validation log loss of an XGBoost classifier.

    Samples one hyperparameter set from ``trial``, fits an ``XGBClassifier``
    on ``X_train_split``/``y_train_split`` with early stopping monitored on
    ``(X_valid, y_valid)``, and scores the predicted class probabilities.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The log loss on the validation split (lower is better).
    """
    params = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': len(set(Y_train)),
        'booster': 'gbtree',
        'tree_method': 'hist',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        # suggest_float(..., log=True) is the replacement for the deprecated
        # suggest_loguniform and samples from the same log-uniform range.
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0, log=True),
    }

    # early_stopping_rounds is set on the estimator: passing it to fit() is
    # deprecated and no longer accepted by recent XGBoost releases. The
    # obsolete use_label_encoder flag is dropped; the labels are already
    # integer-encoded.
    model = XGBClassifier(**params, random_state=42, early_stopping_rounds=10)
    model.fit(X_train_split, y_train_split, eval_set=[(X_valid, y_valid)], verbose=False)

    validation_probabilities = model.predict_proba(X_valid)
    return log_loss(y_valid, validation_probabilities)

# Search the XGBoost hyperparameters: minimise the validation log loss over
# at most 50 trials or 3600 seconds, whichever limit is reached first.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, timeout=3600)

print("Best Parameters Found : ", study.best_params, sep="\n")

In [None]:
def rf_objective(trial):
    """Optuna objective: validation log loss of a random forest classifier.

    Samples one hyperparameter set from ``trial``, fits a
    ``RandomForestClassifier`` on ``X_train_split``/``y_train_split`` and
    scores its predicted class probabilities on ``(X_valid, y_valid)``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The log loss on the validation split (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        # 'auto' is omitted: for classifiers it was an alias of 'sqrt', and
        # scikit-learn 1.3 removed it, so sampling it would raise an error.
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }

    rf_model = RandomForestClassifier(**params, random_state=42)
    rf_model.fit(X_train_split, y_train_split)

    validation_probabilities = rf_model.predict_proba(X_valid)
    return log_loss(y_valid, validation_probabilities)

# Search the random forest hyperparameters: minimise the validation log loss
# over at most 50 trials or 3600 seconds, whichever limit is reached first.
rf_study = optuna.create_study(direction='minimize')
rf_study.optimize(rf_objective, n_trials=50, timeout=3600)

print("Best Parameters Found for RandomForest:", rf_study.best_params, sep="\n")


In [None]:
# Rebuild both models from their tuned hyperparameters and combine them in a
# soft-voting ensemble.
best_params = study.best_params
best_params_rf = rf_study.best_params

# study.best_params holds only the values Optuna sampled, so the fixed
# booster settings used inside `objective` are repeated here. Without them
# the final model could train with library defaults that differ from the
# configuration that was tuned. The obsolete use_label_encoder flag is
# dropped; the labels are already integer-encoded.
xgb_model = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    booster='gbtree',
    tree_method='hist',
    **best_params,
    random_state=42,
)

rf_model = RandomForestClassifier(**best_params_rf, random_state=42)

ensemble_model = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model),
    ],
    voting='soft',
)

# Fit on the full training set (not just the tuning split), then predict
# class probabilities for the test rows.
ensemble_model.fit(X_train, Y_train_encoded)

ensemblePredictions = ensemble_model.predict_proba(test)


# Submission File

In [None]:
# Name the probability columns after the classes the target encoder learned,
# in encoder order. That is the order of the integer labels the ensemble was
# trained on, so each column stays matched to its class.
status_columns = [f"Status_{label}" for label in label_encoder.classes_]
Submission = pd.DataFrame(ensemblePredictions, columns=status_columns)

# 'id' was dropped from `test` during preprocessing; re-read just that column.
test_ids = pd.read_csv('/kaggle/input/thapar-summer-school-2024/test.csv', usecols=['id'])
Submission['id'] = test_ids['id']
Submission = Submission[['id', *status_columns]]

In [None]:
# Write the submission frame to disk without the DataFrame index column.
submission_path = "submission.csv"
Submission.to_csv(submission_path, index=False)
print("Submission file created successfully.")