In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('train.csv')
print(df.head())
print(df.info())
print(df.isnull().sum())

   tumor_type        size   location  edema  necrosis enhancement      shape  \
0   pituitary  khlat_3lik    frontal      1         0        none  irregular   
1      glioma  normal_brk    frontal      0         0        none  irregular   
2  metastatic  normal_brk  occipital      1         0        mild  irregular   
3  meningioma  normal_brk    frontal      1         1        none  irregular   
4  meningioma  normal_brk  brainstem      0         1        ring  irregular   

          margins  calcification  cystic_components  hemorrhage  ki67_index  \
0  poorly_defined              1                  0           0       100.0   
1    well_defined              0                  1           0        40.0   
2    well_defined              1                  0           0        95.0   
3  poorly_defined              1                  0           0       100.0   
4    well_defined              0                  0           0        25.0   

   mitotic_count  age  gender  symptoms_dura

In [None]:
# --- 1. Encode target labels ---
le = LabelEncoder()
df['cancer_stage'] = le.fit_transform(df['cancer_stage'])  # I‚Üí0, II‚Üí1, III‚Üí2, IV‚Üí3

# --- 2. One-hot encode categorical features ---
categorical_cols = ['tumor_type','size','location','enhancement','shape','margins','gender']
X = pd.get_dummies(df.drop(['cancer_stage','id'], axis=1), columns=categorical_cols)
y = df['cancer_stage']

# --- 3. Train-test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# --- 4. Scale numeric features (important for some models) ---
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# --- 5. Import models ---
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# --- 6. Define models ---
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000),
    "RidgeClassifier": RidgeClassifier(),
    "SGDClassifier": SGDClassifier(max_iter=1000, tol=1e-3),
    "GaussianNB": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "MLPClassifier": MLPClassifier(max_iter=1000)
}

# --- 7. Train and evaluate ---
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results[name] = f1
    print(f"{name} F1 Score: {f1:.4f}")

# --- 8. Display sorted results ---
print("\nüèÜ Model F1-score ranking:")
sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
for name, score in sorted_results:
    print(f"{name}: {score:.4f}")


Training LogisticRegression...
LogisticRegression F1 Score: 0.7753
Training RidgeClassifier...
RidgeClassifier F1 Score: 0.7341
Training SGDClassifier...
SGDClassifier F1 Score: 0.7418
Training GaussianNB...
GaussianNB F1 Score: 0.7320
Training KNN...
KNN F1 Score: 0.6617
Training SVC...
SVC F1 Score: 0.7655
Training DecisionTree...
DecisionTree F1 Score: 0.6440
Training RandomForest...
RandomForest F1 Score: 0.7176
Training ExtraTrees...
ExtraTrees F1 Score: 0.7056
Training GradientBoosting...
GradientBoosting F1 Score: 0.7528
Training XGBoost...
XGBoost F1 Score: 0.7718
Training MLPClassifier...
MLPClassifier F1 Score: 0.7158

üèÜ Model F1-score ranking:
LogisticRegression: 0.7753
XGBoost: 0.7718
SVC: 0.7655
GradientBoosting: 0.7528
SGDClassifier: 0.7418
RidgeClassifier: 0.7341
GaussianNB: 0.7320
RandomForest: 0.7176
MLPClassifier: 0.7158
ExtraTrees: 0.7056
KNN: 0.6617
DecisionTree: 0.6440


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Scale data (LogisticRegression performs better this way)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

# Get feature importance
importance = abs(model.coef_).mean(axis=0)  # average across classes if multiclass
feat_imp = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
feat_imp = feat_imp.sort_values('Importance', ascending=False)

print("\nTop important features:")
print(feat_imp.head(10))


F1 Score: 0.7745630699696595

Top important features:
                  Feature  Importance
6           mitotic_count    0.670615
5              ki67_index    0.322488
1                necrosis    0.237406
21     location_brainstem    0.186028
29       enhancement_none    0.184944
4              hemorrhage    0.182035
0                   edema    0.179337
30       enhancement_ring    0.177844
13  tumor_type_metastatic    0.170893
19        size_sghir_bzef    0.154748


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score

# 1Ô∏è‚É£ Load your dataset
df = pd.read_csv('train.csv')  # replace with your path

# 2Ô∏è‚É£ Define categorical columns (excluding gender)
categorical_cols = ['tumor_type', 'size', 'location', 'enhancement', 'shape', 'margins']

# 3Ô∏è‚É£ One-hot encode categorical columns and drop unnecessary columns
X = pd.get_dummies(df.drop(['cancer_stage', 'id', 'gender'], axis=1), columns=categorical_cols)
y = df['cancer_stage']

# 4Ô∏è‚É£ Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5Ô∏è‚É£ Split train/validation
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# 6Ô∏è‚É£ Initialize Logistic Regression
model = LogisticRegression(solver='saga', max_iter=5000)

# 7Ô∏è‚É£ Test RFE for different number of features
results = []
best_f1 = 0
best_n_features = 0
best_features_idx = None

for n_features in range(5, X.shape[1]+1, 1):  # you can adjust step
    rfe = RFE(estimator=model, n_features_to_select=n_features, step=1)
    rfe.fit(X_train, y_train)

    # Evaluate F1 on validation set
    y_val_pred = rfe.predict(X_val)
    f1 = f1_score(y_val, y_val_pred, average='weighted')
    results.append((n_features, f1))

    if f1 > best_f1:
        best_f1 = f1
        best_n_features = n_features
        best_features_idx = rfe.get_support(indices=True)

    print(f"Top {n_features} features ‚Üí F1 Score: {f1:.4f}")

# 8Ô∏è‚É£ Show best result
best_features = X.columns[best_features_idx].tolist()
print(f"\nüèÜ Best F1 Score: {best_f1:.4f} with top {best_n_features} features")
print("‚úÖ Features to use:", best_features)


Top 5 features ‚Üí F1 Score: 0.6782
Top 6 features ‚Üí F1 Score: 0.6897
Top 7 features ‚Üí F1 Score: 0.6992
Top 8 features ‚Üí F1 Score: 0.7056
Top 9 features ‚Üí F1 Score: 0.7188
Top 10 features ‚Üí F1 Score: 0.7247
Top 11 features ‚Üí F1 Score: 0.7223
Top 12 features ‚Üí F1 Score: 0.7285
Top 13 features ‚Üí F1 Score: 0.7319
Top 14 features ‚Üí F1 Score: 0.7374
Top 15 features ‚Üí F1 Score: 0.7507
Top 16 features ‚Üí F1 Score: 0.7504
Top 17 features ‚Üí F1 Score: 0.7539
Top 18 features ‚Üí F1 Score: 0.7557
Top 19 features ‚Üí F1 Score: 0.7566
Top 20 features ‚Üí F1 Score: 0.7606
Top 21 features ‚Üí F1 Score: 0.7635
Top 22 features ‚Üí F1 Score: 0.7635
Top 23 features ‚Üí F1 Score: 0.7687
Top 24 features ‚Üí F1 Score: 0.7718
Top 25 features ‚Üí F1 Score: 0.7711
Top 26 features ‚Üí F1 Score: 0.7736
Top 27 features ‚Üí F1 Score: 0.7807
Top 28 features ‚Üí F1 Score: 0.7807
Top 29 features ‚Üí F1 Score: 0.7836
Top 30 features ‚Üí F1 Score: 0.7821
Top 31 features ‚Üí F1 Score: 0.7821
Top 32

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# --- Assuming you already have feat_imp (feature importance) and X, y from before ---

results = []  # to store (num_features, f1_score)

for n in range(1, len(feat_imp) + 1):  # test using top 1 to all features
    top_features = feat_imp['Feature'].head(n).tolist()
    X_top = X[top_features]

    # Split data
    X_train, X_val, y_train, y_val = train_test_split(X_top, y, test_size=0.2, random_state=42, stratify=y)

    # Train model
    model = LogisticRegression(max_iter=5000, solver='saga')
    model.fit(X_train, y_train)

    # Predict and calculate F1
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='weighted')
    results.append((n, f1))

    print(f"Top {n} features ‚Üí F1 Score: {f1:.4f}")

# Convert to DataFrame for plotting
f1_results = pd.DataFrame(results, columns=['NumFeatures', 'F1Score'])

# Plot F1 score vs number of top features
plt.figure(figsize=(8, 5))
plt.plot(f1_results['NumFeatures'], f1_results['F1Score'], marker='o')
plt.xlabel('Number of Top Features Used')
plt.ylabel('Weighted F1 Score')
plt.title('F1 Score vs. Number of Top Features (Logistic Regression)')
plt.grid(True)
plt.show()

# Find the best number of features
best_n = f1_results.loc[f1_results['F1Score'].idxmax()]
best_features = feat_imp['Feature'].head(int(best_n['NumFeatures'])).tolist()
print(f"\nüèÜ Best result ‚Üí Top {int(best_n['NumFeatures'])} features with F1 Score = {best_n['F1Score']:.4f}")
print(f"‚úÖ Best features to use: {best_features}")

# --- Train final model using best features ---
X_best = X[best_features]
X_train, X_val, y_train, y_val = train_test_split(X_best, y, test_size=0.2, random_state=42, stratify=y)

final_model = LogisticRegression(max_iter=5000, solver='saga', multi_class='multinomial')
final_model.fit(X_train, y_train)

# Evaluate final model
y_val_pred = final_model.predict(X_val)
final_f1 = f1_score(y_val, y_val_pred, average='weighted')
print(f"\nüéØ Final model F1 Score on validation set: {final_f1:.4f}")


Top 1 features ‚Üí F1 Score: 0.6374
Top 2 features ‚Üí F1 Score: 0.6600
Top 3 features ‚Üí F1 Score: 0.6597
Top 4 features ‚Üí F1 Score: 0.6633
Top 5 features ‚Üí F1 Score: 0.6775
Top 6 features ‚Üí F1 Score: 0.6784
Top 7 features ‚Üí F1 Score: 0.6877
Top 8 features ‚Üí F1 Score: 0.6876
Top 9 features ‚Üí F1 Score: 0.7005
Top 10 features ‚Üí F1 Score: 0.7177
Top 11 features ‚Üí F1 Score: 0.7231
Top 12 features ‚Üí F1 Score: 0.7217
Top 13 features ‚Üí F1 Score: 0.7319
Top 14 features ‚Üí F1 Score: 0.7346
Top 15 features ‚Üí F1 Score: 0.7313
Top 16 features ‚Üí F1 Score: 0.7385
Top 17 features ‚Üí F1 Score: 0.7336
Top 18 features ‚Üí F1 Score: 0.7431
Top 19 features ‚Üí F1 Score: 0.7426
Top 20 features ‚Üí F1 Score: 0.7489
Top 21 features ‚Üí F1 Score: 0.7561
Top 22 features ‚Üí F1 Score: 0.7644
Top 23 features ‚Üí F1 Score: 0.7650
Top 24 features ‚Üí F1 Score: 0.7672
Top 25 features ‚Üí F1 Score: 0.7688
Top 26 features ‚Üí F1 Score: 0.7749
Top 27 features ‚Üí F1 Score: 0.7581
Top 28 fea

KeyError: "['gender_wa7ch'] not in index"

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# 1Ô∏è‚É£ Load train data
train_df = pd.read_csv('train.csv')  # Replace with your path

# 2Ô∏è‚É£ Define categorical columns (excluding gender)
categorical_cols = ['tumor_type', 'size', 'location', 'enhancement', 'shape', 'margins']

# 3Ô∏è‚É£ One-hot encode categorical features
X_full = pd.get_dummies(train_df.drop(['cancer_stage', 'id', 'gender'], axis=1),
                        columns=categorical_cols)
y = train_df['cancer_stage']

# 4Ô∏è‚É£ Select top 26 features from your previous selection
top_features = ['mitotic_count', 'ki67_index', 'necrosis', 'location_brainstem',
                'enhancement_none', 'hemorrhage', 'edema', 'enhancement_ring',
                'tumor_type_metastatic', 'size_sghir_bzef', 'size_khlat_3lik',
                'location_occipital', 'age', 'tumor_type_glioma', 'location_temporal',
                'enhancement_mild', 'enhancement_strong', 'size_sghira',
                'tumor_type_meningioma', 'location_cerebellum', 'cystic_components',
                'margins_well_defined', 'margins_poorly_defined', 'size_kbira',
                'location_frontal', 'kps_score']

X_top = X_full[top_features]

# 5Ô∏è‚É£ Split into train/validation
X_train, X_val, y_train, y_val = train_test_split(
    X_top, y, test_size=0.2, random_state=42, stratify=y
)

# 6Ô∏è‚É£ Define scalers to test
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}

results = []

for name, scaler in scalers.items():
    # Scale features
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Train Logistic Regression
    model = LogisticRegression(C=10, solver='saga', max_iter=5000, multi_class='multinomial')
    model.fit(X_train_scaled, y_train)

    # Predict & compute F1 score
    y_val_pred = model.predict(X_val_scaled)
    f1 = f1_score(y_val, y_val_pred, average='weighted')

    results.append((name, f1))
    print(f"{name} ‚Üí Validation F1 Score: {f1:.4f}")

# 7Ô∏è‚É£ Summary
results_df = pd.DataFrame(results, columns=['Scaler', 'F1_Score']).sort_values(by='F1_Score', ascending=False)
print("\nüèÜ Best Scaler:")
print(results_df.head(1))




StandardScaler ‚Üí Validation F1 Score: 0.7765




MinMaxScaler ‚Üí Validation F1 Score: 0.7765
RobustScaler ‚Üí Validation F1 Score: 0.7765

üèÜ Best Scaler:
           Scaler  F1_Score
0  StandardScaler  0.776488




In [None]:
import pandas as pd

# Load your dataset
df = pd.read_csv('train.csv')  # replace with your path

# Select only numeric columns for outlier detection
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

outlier_summary = {}

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    outlier_summary[col] = len(outliers)

# Convert to DataFrame for better view
outlier_df = pd.DataFrame(list(outlier_summary.items()), columns=['Feature', 'Num_Outliers'])
outlier_df = outlier_df.sort_values(by='Num_Outliers', ascending=False)

print("Number of outliers per feature:")
print(outlier_df)


Number of outliers per feature:
                 Feature  Num_Outliers
0                  edema             0
1               necrosis             0
2          calcification             0
3      cystic_components             0
4             hemorrhage             0
5             ki67_index             0
6          mitotic_count             0
7                    age             0
8      symptoms_duration             0
9   neurological_deficit             0
10             kps_score             0
11                    id             0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# --- Define categorical columns (excluding gender) ---
categorical_cols = ['tumor_type', 'size', 'location', 'enhancement', 'shape', 'margins']

# --- One-hot encode categorical columns and drop unnecessary columns ---
X_full = pd.get_dummies(df.drop(['cancer_stage', 'id', 'gender'], axis=1),
                        columns=categorical_cols)
y = df['cancer_stage']  # target

# --- Split dataset ---
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y, test_size=0.2, random_state=42, stratify=y
)

# --- Standardize features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Use the 26 best features ---
best_features = [
    'mitotic_count', 'ki67_index', 'necrosis', 'location_brainstem', 'enhancement_none',
    'hemorrhage', 'edema', 'enhancement_ring', 'tumor_type_metastatic', 'size_sghir_bzef',
    'size_khlat_3lik', 'location_occipital', 'age', 'tumor_type_glioma', 'location_temporal',
    'enhancement_mild', 'enhancement_strong', 'size_sghira', 'tumor_type_meningioma',
    'location_cerebellum', 'cystic_components', 'margins_well_defined', 'margins_poorly_defined',
    'size_kbira', 'location_frontal', 'kps_score'
]
top_indices = [X_full.columns.get_loc(f) for f in best_features]

X_train_top = X_train_scaled[:, top_indices]
X_test_top = X_test_scaled[:, top_indices]

# --- Logistic Regression with GridSearchCV ---
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'newton-cg', 'saga'],  # solvers supporting multinomial
    'max_iter': [500, 1000, 2000]
}

grid = GridSearchCV(
    LogisticRegression(multi_class='multinomial'),
    param_grid,
    scoring='f1_weighted',
    cv=5
)

grid.fit(X_train_top, y_train)

best_model = grid.best_estimator_

# --- Evaluate F1 score ---
y_pred = best_model.predict(X_test_top)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Best Logistic Regression params: {grid.best_params_}")
print(f"F1 Score with top 26 features: {f1:.4f}")




Best Logistic Regression params: {'C': 10, 'max_iter': 500, 'solver': 'lbfgs'}
F1 Score with top 26 features: 0.7765


In [None]:
# ========================================
# üì¶ Imports & Setup
# ========================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier
import random, os, warnings
warnings.filterwarnings("ignore")

# ========================================
# üéØ Reproducibility
# ========================================
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

# ========================================
# 1Ô∏è‚É£ Load Data
# ========================================
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Preserve test IDs
test_ids = test_df['id'].copy()

# Define categorical columns
categorical_cols = ['tumor_type', 'size', 'location', 'enhancement', 'shape', 'margins']

# ========================================
# 2Ô∏è‚É£ Preprocess Train Data
# ========================================
X_full = pd.get_dummies(train_df.drop(['cancer_stage', 'id', 'gender'], axis=1), columns=categorical_cols)
y = train_df['cancer_stage']

# Encode target if not numeric
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_full)

# Select top 29 RFE features
top_features = [
    'edema', 'necrosis', 'calcification', 'cystic_components', 'hemorrhage', 'ki67_index',
    'mitotic_count', 'age', 'kps_score', 'tumor_type_glioma', 'tumor_type_meningioma',
    'tumor_type_metastatic', 'tumor_type_pituitary', 'size_kbira', 'size_khlat_3lik',
    'size_sghir_bzef', 'size_sghira', 'location_brainstem', 'location_cerebellum',
    'location_frontal', 'location_occipital', 'location_temporal', 'enhancement_mild',
    'enhancement_none', 'enhancement_ring', 'enhancement_strong', 'shape_regular',
    'margins_poorly_defined', 'margins_well_defined'
]
top_indices = [X_full.columns.get_loc(f) for f in top_features]
X_top = X_scaled[:, top_indices]

# Split train/validation
X_train, X_val, y_train, y_val = train_test_split(
    X_top, y, test_size=0.2, random_state=SEED, stratify=y
)

# ========================================
# 3Ô∏è‚É£ Define Base Models
# ========================================
log_clf = LogisticRegression(
    C=10, solver='saga', max_iter=5000, multi_class='multinomial', random_state=SEED
)
rf_clf = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=SEED, n_jobs=-1)
xgb_clf = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    num_class=len(np.unique(y_train)),
    random_state=SEED,
    eval_metric='mlogloss'
)

# ========================================
# 4Ô∏è‚É£ Define Upgraded Meta-Model
# ========================================
meta_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    num_class=len(np.unique(y_train)),
    random_state=SEED,
    eval_metric='mlogloss'
)

# ========================================
# 5Ô∏è‚É£ Build Stacking Classifier
# ========================================
stack_model = StackingClassifier(
    estimators=[
        ('log_reg', log_clf),
        ('rf', rf_clf),
        ('xgb', xgb_clf)
    ],
    final_estimator=meta_model,
    n_jobs=-1,
    passthrough=True
)

# Train stacking model
stack_model.fit(X_train, y_train)

# ========================================
# 6Ô∏è‚É£ Evaluate Validation
# ========================================
y_val_pred = stack_model.predict(X_val)
val_f1 = f1_score(y_val, y_val_pred, average='weighted')
print(f"üéØ Stacking Model F1 Score on validation set: {val_f1:.4f}")
print("\nüìä Validation Report:\n", classification_report(y_val, y_val_pred))

# ========================================
# 7Ô∏è‚É£ Preprocess Test Data
# ========================================
X_test_full = pd.get_dummies(test_df.drop(['gender', 'id'], axis=1), columns=categorical_cols)
X_test_full = X_test_full.reindex(columns=X_full.columns, fill_value=0)
X_test_scaled = scaler.transform(X_test_full)
X_test_top = X_test_scaled[:, top_indices]

# Predict on test set
y_test_pred = stack_model.predict(X_test_top)

# Decode labels if LabelEncoder was used
if 'le' in locals():
    y_test_pred = le.inverse_transform(y_test_pred)

# ========================================
# 8Ô∏è‚É£ Create Submission
# ========================================
submission = pd.DataFrame({
    'id': test_ids,
    'cancer_stage': y_test_pred
})
submission.to_csv('submission.csv', index=False)
print("‚úÖ submission.csv created successfully!")


üéØ Stacking Model F1 Score on validation set: 0.8478

üìä Validation Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        50
           1       0.84      0.48      0.61        96
           2       0.83      0.80      0.82       307
           3       0.88      0.98      0.93       947

    accuracy                           0.87      1400
   macro avg       0.64      0.56      0.59      1400
weighted avg       0.84      0.87      0.85      1400

‚úÖ submission.csv created successfully!
