In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Step 1: Load dataset
# The location is a hard-coded absolute path; adjust it for your environment.
data = pd.read_csv('/content/Cleaned_Covid_Data.csv')  # Replace with your file path

# Step 2: Take a reproducible sample of up to 20k rows
# DataFrame.sample draws without replacement by default and raises ValueError
# when n is larger than the number of rows, so cap n at the dataset size.
sample_size = min(20000, len(data))
if sample_size < 20000:
    print(f"Dataset has only {len(data)} rows; sampling all of them instead of 20000.")
data_sample = data.sample(n=sample_size, random_state=42)
# Persist the raw sample (before the target is binarised) for later reuse.
data_sample.to_csv('covid_20k_sample.csv', index=False)

# Step 3: Prepare target
# Binarise the label: codes 1, 2 and 3 map to 1, every other value maps to 0.
# (Presumably 1-3 are the positive-diagnosis codes - confirm against the data dictionary.)
# Series.isin is vectorised, so no Python lambda is invoked once per row.
data_sample['CLASIFFICATION_FINAL'] = (
    data_sample['CLASIFFICATION_FINAL'].isin([1, 2, 3]).astype(int)
)

# Feature matrix = every column except the target; y = the binarised target.
X = data_sample.drop('CLASIFFICATION_FINAL', axis=1)
y = data_sample['CLASIFFICATION_FINAL']

# Step 4: Identify categorical and numeric columns
# Columns named here are handled as categorical features; every other column
# of X is treated as numeric.
categorical_cols = [
    'USMER', 'MEDICAL_UNIT', 'SEX', 'PATIENT_TYPE', 'INTUBED',
    'PNEUMONIA', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR',
    'HIPERTENSION', 'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY',
    'RENAL_CHRONIC', 'TOBACCO', 'ICU', 'DECEASED',
]
# Set lookup for membership tests; the original column order of X is preserved.
categorical_set = set(categorical_cols)
numeric_cols = [name for name in X.columns if name not in categorical_set]

# Step 5: Train-test split
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions. The seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Step 6: Preprocessing
# Both transformers are fitted on the training partition only and then applied
# to train and test, so no statistics from the held-out rows reach the features.

# Standardise the numeric columns
scaler = StandardScaler().fit(X_train[numeric_cols])
X_train_numeric_scaled = scaler.transform(X_train[numeric_cols])
X_test_numeric_scaled = scaler.transform(X_test[numeric_cols])

# One-hot encode the categorical columns (dense output); categories that were
# not seen while fitting are ignored instead of raising an error.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(
    X_train[categorical_cols]
)
X_train_cat_encoded = encoder.transform(X_train[categorical_cols])
X_test_cat_encoded = encoder.transform(X_test[categorical_cols])

# Final design matrices: scaled numeric block first, one-hot block second
X_train_final = np.concatenate([X_train_numeric_scaled, X_train_cat_encoded], axis=1)
X_test_final = np.concatenate([X_test_numeric_scaled, X_test_cat_encoded], axis=1)

# Step 7: Apply SMOTE to balance classes
# Only the training matrix is resampled; the test partition stays untouched.
# NOTE(review): SMOTE interpolates between neighbouring samples, so synthetic
# rows may carry fractional values in the one-hot columns. imbalanced-learn's
# SMOTENC targets mixed numeric/categorical data - consider it if that matters.
sm = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = sm.fit_resample(X_train_final, y_train)

# Step 8: Train XGBoost
# Gradient-boosted trees fitted on the SMOTE-balanced training matrix.
# `use_label_encoder` is deliberately not passed: the installed XGBoost does not
# recognise it and only warns 'Parameters: { "use_label_encoder" } are not used.'
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,         # fraction of rows sampled per tree
    colsample_bytree=0.8,  # fraction of columns sampled per tree
    random_state=42,
    eval_metric='logloss',
)
xgb_model.fit(X_train_balanced, y_train_balanced)

# Step 9: Evaluate
# Score the model on the held-out test matrix (never resampled by SMOTE).
y_pred = xgb_model.predict(X_test_final)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Confusion Matrix:")
print(test_confusion)
print("\nClassification Report:")
print(test_report)

# Step 10: Save scaler+encoder and model separately
# The fitted transformers go into one pickle (a dict with keys 'scaler' and
# 'encoder'); the trained classifier goes into a second pickle.
artifacts = (
    ({'scaler': scaler, 'encoder': encoder}, 'covid_preprocessor.pkl'),
    (xgb_model, 'covid_best_model.pkl'),
)
for artifact, target_path in artifacts:
    joblib.dump(artifact, target_path)

print("Scaler and encoder saved as covid_preprocessor.pkl")
print("XGBoost model saved as covid_best_model.pkl")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Confusion Matrix:
[[1456  596]
 [ 871 1077]]

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.71      0.66      2052
           1       0.64      0.55      0.59      1948

    accuracy                           0.63      4000
   macro avg       0.63      0.63      0.63      4000
weighted avg       0.63      0.63      0.63      4000

Scaler and encoder saved as covid_preprocessor.pkl
XGBoost model saved as covid_best_model.pkl
