# Import Libraries

In [1]:
# %load_ext autoreload
# %reload_ext autoreload # This line is causing the error and can be removed.
# %autoreload 2 # This line is also causing an error and can be removed

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from IPython import display

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc, accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Data Load

In [2]:
# Load the raw train/test CSVs once at module level. run_feature_engineering
# reads these two globals directly rather than re-loading from its config.
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')



# Preprocessing and Feature Engineering

## Tạo cột FamilySize = SibSp + Parch + 1

In [3]:
def create_family_size(df):
    """Compute the family size of each passenger.

    Args:
        df: DataFrame with numeric ``SibSp`` and ``Parch`` columns.

    Returns:
        pandas.Series holding ``SibSp + Parch + 1`` for every row (the ``+ 1``
        counts the passenger themselves).
    """
    relatives_aboard = df["SibSp"] + df["Parch"]
    return relatives_aboard + 1

## Mã hóa giới tính: female=0, male=1

In [4]:
def encode_sex(df):
    """Map the ``Sex`` column to integer codes.

    Args:
        df: DataFrame with a ``Sex`` column.

    Returns:
        pandas.Series where ``'female'`` becomes 0 and ``'male'`` becomes 1.
        Any other value (including missing ones) maps to NaN.
    """
    sex_codes = dict(female=0, male=1)
    sex_column = df["Sex"]
    return sex_column.map(sex_codes)

## Điền giá trị thiếu của cột Age bằng median

In [6]:
def fill_age(df):
    """Fill missing ``Age`` values with the column median.

    The median is computed from the ``Age`` column of the frame passed in, so
    train and test frames are each imputed with their own median.

    Args:
        df: DataFrame with a numeric ``Age`` column.

    Returns:
        pandas.Series of ages with NaN replaced by the median age.
    """
    ages = df["Age"]
    median_age = ages.median()
    return ages.fillna(median_age)

## Sao chép các cột cơ bản giữ nguyên

In [7]:
def copy_basic_features(df, feature_list):
    """Return an independent copy of the selected columns.

    Args:
        df: Source DataFrame.
        feature_list: Column labels to keep, in the desired order.

    Returns:
        A new DataFrame containing only ``feature_list`` columns; modifying it
        does not affect ``df``.
    """
    selected = df[feature_list]
    return selected.copy()

## Mã hóa Cabin: lấy ký tự đầu tiên và gán giá trị số

In [8]:
def encode_cabin(df):
    """Encode the ``Cabin`` column by its first character (the deck letter).

    Args:
        df: DataFrame with a ``Cabin`` column of strings or NaN.

    Returns:
        pandas.Series of integer codes: A=1 ... G=7, T=8. Missing cabins and
        cabins whose first character is not a known letter get 0 (the 'Z' code).
    """
    deck_codes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8, 'Z': 0}
    unknown_code = deck_codes['Z']

    def _deck_code(cabin):
        # Missing cabin -> placeholder deck 'Z'.
        if pd.isna(cabin):
            return unknown_code
        return deck_codes.get(cabin[0], unknown_code)

    return df['Cabin'].apply(_deck_code)


## Mã hóa Embarked: C=1, Q=2, S=3, NaN=0

In [9]:
def encode_embarked(df):
    """Encode the ``Embarked`` port column as integers.

    Args:
        df: DataFrame with an ``Embarked`` column.

    Returns:
        pandas.Series with C=1, Q=2, S=3. Missing values and any unrecognised
        port map to 0.
    """
    port_codes = {'0': 0, 'C': 1, 'Q': 2, 'S': 3}

    def _port_code(port):
        if pd.isna(port):
            return port_codes['0']
        return port_codes.get(port, 0)

    return df['Embarked'].apply(_port_code)

## Trích xuất danh xưng (title) từ cột Name

In [10]:
def extract_surname(df):
    """Encode each passenger's title (honorific) taken from ``Name``.

    Despite the function name, the value extracted is the title token, i.e.
    the second space-separated piece after the first comma
    (``"Braund, Mr. Owen"`` -> ``"Mr."``), not the surname.

    Args:
        df: DataFrame with a ``Name`` column.

    Returns:
        pandas.Series of integer codes: the position of the title in the known
        title list, or -1 when the title is unknown or the name cannot be
        parsed (no comma, no token after the comma, non-string value).
    """
    known_titles = (
        'Capt.', 'Col.', 'Don.', 'Dr.', 'Jonkheer.', 'Lady.', 'Major.',
        'Master.', 'Miss.', 'Mlle.', 'Mme.', 'Mr.', 'Mrs.', 'Ms.',
        'Rev.', 'Sir.', 'the', 'Dona.',
    )
    title_codes = {title: code for code, title in enumerate(known_titles)}

    def _title_code(full_name):
        try:
            after_comma = full_name.split(',')[1]
            token = after_comma.split(' ')[1]
        except Exception:
            # Any parsing failure is treated as "unknown title".
            return -1
        return title_codes.get(token, -1)

    return df['Name'].apply(_title_code)

## Hiển thị thông tin thống kê để kiểm tra dữ liệu

In [11]:
def debug_info(df):
    """Print and display sanity-check summaries of a raw passenger frame.

    Shows, in order: the first five rows, per-column missing counts, the Age
    missing count and median plus ``Fare.describe()``, and the unique values
    with counts for ``Cabin`` and ``Embarked`` (missing values are replaced by
    a placeholder label before counting).

    Args:
        df: Raw DataFrame containing at least ``Age``, ``Fare``, ``Cabin`` and
            ``Embarked`` columns.

    Returns:
        None. Output goes to stdout and the IPython display.
    """

    def _unique_with_counts(column, placeholder):
        # Replace NaN with a label so np.unique can sort and count the values.
        labelled = df[column].apply(lambda value: placeholder if pd.isna(value) else value)
        return np.unique(labelled, return_counts=True)

    print("=== DEBUG MODE ===")
    print("üîπ D·ªØ li·ªáu m·∫´u (head):")
    preview = df.head(5)
    display.display(preview)

    print("üîπ D·ªØ li·ªáu thi·∫øu:")
    missing_per_column = df.isna().sum()
    display.display(missing_per_column)

    print("üîπ Th·ªëng k√™ Age & Fare:")
    print(f"  Age missing: {df['Age'].isna().sum()}")
    print(f"  Age median: {df['Age'].median()}")
    fare_summary = df["Fare"].describe()
    display.display(fare_summary)

    print("üîπ Th·ªëng k√™ Cabin:")
    display.display(_unique_with_counts('Cabin', 'Z0'))

    print("üîπ Th·ªëng k√™ Embarked:")
    display.display(_unique_with_counts('Embarked', '0'))

## Tiền xử lý dữ liệu đầu vào và lựa chọn đặc trưng cho mô hình học máy.

In [12]:
def preprocessing_feature_01(df_data, is_train = True, is_debug = True, **kwargs):
    """Build the model-ready feature table from a raw passenger frame.

    Output columns, in order: ``FamilySize``, ``Sex``, ``Age``, ``Fare``,
    ``Pclass``, ``Cabin``, ``Embarked``, ``Surname`` and, for training data,
    ``Output``.

    Args:
        df_data: Raw DataFrame with SibSp, Parch, Sex, Age, Fare, Pclass,
            Cabin, Embarked and Name columns (plus Survived when
            ``is_train`` is True).
        is_train: When True, append the ``Survived`` label as ``Output``.
        is_debug: When True, call ``debug_info`` on the raw frame.
        **kwargs: Accepted but not used.

    Returns:
        Tuple ``(df_output, None)``; the second element is always None.
    """
    # Engineered columns first, in the order the model will see them.
    engineered = pd.DataFrame({
        "FamilySize": create_family_size(df_data),
        "Sex": encode_sex(df_data),
        "Age": fill_age(df_data),
    })

    # Fare and Pclass are carried over unchanged (Fare is not imputed here).
    passthrough = copy_basic_features(df_data, ['Fare', 'Pclass'])
    df_output = pd.concat([engineered, passthrough], axis=1)

    df_output["Cabin"] = encode_cabin(df_data)
    df_output["Embarked"] = encode_embarked(df_data)
    df_output["Surname"] = extract_surname(df_data)

    # Training data also carries the target label.
    if is_train:
        df_output["Output"] = df_data["Survived"]

    # Optional diagnostics on the raw input.
    if is_debug:
        debug_info(df_data)

    return df_output, None

# df_train = pd.read_csv(f'{data_dir}/train.csv')

Hàm này tổng hợp các bước làm sạch và chuyển đổi dữ liệu,
giúp dữ liệu sẵn sàng cho việc huấn luyện mô hình.

----------------------------------------------------------
🔹 Các bước xử lý:
1. Tạo đặc trưng mới `FamilySize` = SibSp + Parch + 1
2. Mã hóa giới tính (`Sex`) thành số: female=0, male=1
3. Điền giá trị thiếu cho `Age` bằng median (trung vị)
4. Giữ lại các cột cơ bản: Fare, Pclass (SibSp và Parch chỉ được dùng để tính FamilySize)
5. Mã hóa Cabin → ký tự đầu tiên (A–T) → số; NaN → Z=0
6. Mã hóa cổng lên tàu (`Embarked`): C=1, Q=2, S=3, NaN=0
7. Trích xuất danh xưng (`Surname`) từ cột Name
8. Nếu là tập huấn luyện → thêm cột `Output` (Survived)
9. Nếu bật debug → in ra thống kê dữ liệu

----------------------------------------------------------
Tham số:
- df_data : pandas.DataFrame  
    Dữ liệu gốc chứa các cột Sex, Age, Fare, Pclass, Cabin, Embarked, Name, ...
- is_train : bool, mặc định = True  
    Nếu True, thêm cột "Output" = Survived.
- is_debug : bool, mặc định = True  
    Nếu True, hiển thị thông tin kiểm tra dữ liệu.
- **kwargs : dict  
    Tham số mở rộng cho tương lai (hiện chưa sử dụng).

----------------------------------------------------------
Giá trị trả về:
- df_output : pandas.DataFrame  
    Dữ liệu đã tiền xử lý, sẵn sàng cho mô hình.
- None : placeholder để tương thích pipeline.

----------------------------------------------------------

# Train

Tải dữ liệu đặc trưng và huấn luyện nhiều mô hình, hiển thị thêm các metrics.

In [13]:
def load_features(feat_path):
    """Load the saved feature arrays from an ``.npz`` archive.

    Args:
        feat_path: Path to an archive containing the keys ``X_train``,
            ``y_train``, ``X_test`` and ``X_cols``.

    Returns:
        Tuple ``(X_train, y_train, X_test, X_cols)`` of numpy arrays on
        success, or ``None`` if the file cannot be read or a key is missing
        (the error is printed). Callers must check for ``None`` before
        unpacking.
    """
    print(f" Loading features from: {feat_path}")
    try:
        # np.load on an .npz returns an NpzFile that holds the file open.
        # The context manager closes it once the arrays are read into memory,
        # so the handle is not leaked (and the file is not left locked).
        with np.load(feat_path) as data:
            X_train = data['X_train']
            y_train = data['y_train']
            X_test = data['X_test']
            X_cols = data['X_cols']
        print(f" Features loaded: {list(X_cols)}")
        return X_train, y_train, X_test, X_cols
    except Exception as e:
        print(f" Error loading {feat_path}: {e}")
        return None


In [14]:
# Registry of candidate classifiers keyed by display name. Both training
# functions iterate over this dict and fit these shared instances in place.
models = {
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'KNeighbors': KNeighborsClassifier(),
    # probability=True enables SVC.predict_proba, which the training loops call.
    'SVC': SVC(probability=True, random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42)
}

In [15]:
def train_and_evaluate_model_with_kfold(feat_path, seed=42):
    """Cross-validate every classifier in ``models`` with stratified 5-fold CV.

    For each model, every fold is fitted on the training split and scored on
    the validation split. Per-fold AUCs are printed, and the pooled
    out-of-fold predictions are used to compute overall metrics.

    Args:
        feat_path: Path to the ``.npz`` feature archive read by
            ``load_features``.
        seed: ``random_state`` for the shuffled ``StratifiedKFold``.

    Returns:
        Dict keyed by model name. Each value holds ``mean_auc``, ``std_auc``,
        ``overall_auc``, ``overall_accuracy``,
        ``overall_classification_report``, ``overall_confusion_matrix``,
        ``fpr``, ``tpr`` and ``thresholds``.

    Raises:
        RuntimeError: If the features could not be loaded.
    """
    loaded = load_features(feat_path)
    if loaded is None:
        # load_features reports failure by returning None; unpacking that
        # directly would fail with an opaque "cannot unpack NoneType" error.
        raise RuntimeError(f"Could not load features from '{feat_path}'")
    X_train, y_train, X_test, X_cols = loaded

    # Stratified K-Fold setup. With a fixed integer seed, every call to
    # skf.split yields the same folds, so all models see identical splits.
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    results = {}
    for model_name, model in models.items():
        print(f"\n--- Training {model_name} ---")
        fold_aucs = []
        # Out-of-fold truths and predictions pooled across folds.
        all_y_val = []
        all_y_pred_proba = []
        all_y_pred = []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
            # Split the data for this fold.
            X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
            y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

            # Fit on the training split.
            model.fit(X_train_fold, y_train_fold)

            # Score on the held-out split (column 1 = positive class).
            y_pred_proba = model.predict_proba(X_val_fold)[:, 1]
            y_pred = model.predict(X_val_fold)

            fold_auc = roc_auc_score(y_val_fold, y_pred_proba)
            fold_aucs.append(fold_auc)

            all_y_val.extend(y_val_fold)
            all_y_pred_proba.extend(y_pred_proba)
            all_y_pred.extend(y_pred)

            print(f"  Fold {fold+1} AUC: {fold_auc:.4f}")

        mean_auc = np.mean(fold_aucs)
        std_auc = np.std(fold_aucs)
        print(f"-> Mean AUC for {model_name}: {mean_auc:.4f} +/- {std_auc:.4f}")

        # Overall metrics computed on the pooled out-of-fold predictions.
        overall_auc = roc_auc_score(all_y_val, all_y_pred_proba)
        overall_accuracy = accuracy_score(all_y_val, all_y_pred)
        overall_classification_report = classification_report(all_y_val, all_y_pred)
        overall_confusion_matrix = confusion_matrix(all_y_val, all_y_pred)
        fpr, tpr, thresholds = roc_curve(all_y_val, all_y_pred_proba)

        print(f"\nOverall Metrics for {model_name}:")
        print(f"  Overall AUC: {overall_auc:.4f}")
        print(f"  Overall Accuracy: {overall_accuracy:.4f}")
        print(f"  Overall Classification Report:\n{overall_classification_report}")
        print(f"  Overall Confusion Matrix:\n{overall_confusion_matrix}")

        results[model_name] = {
            "mean_auc": mean_auc,
            "std_auc": std_auc,
            "overall_auc": overall_auc,
            "overall_accuracy": overall_accuracy,
            "overall_classification_report": overall_classification_report,
            "overall_confusion_matrix": overall_confusion_matrix,
            "fpr": fpr,
            "tpr": tpr,
            "thresholds": thresholds
        }
    print("\n" + "="*20, "Training Complete", "="*20)
    return results


In [16]:
def train_and_evaluate_no_kfold(feat_path, seed):
    """Fit every classifier in ``models`` on the full training set and score it.

    The metrics are computed on the same data the model was fitted on, so
    they are in-sample scores, not an estimate of generalisation.

    Args:
        feat_path: Path to the ``.npz`` feature archive read by
            ``load_features``.
        seed: Accepted for signature parity with the k-fold variant; not used
            here.

    Returns:
        Dict keyed by model name. Each value holds ``auc``, ``accuracy``,
        ``classification_report``, ``confusion_matrix``, ``fpr``, ``tpr`` and
        ``thresholds``.

    Raises:
        RuntimeError: If the features could not be loaded.
    """
    loaded = load_features(feat_path)
    if loaded is None:
        # load_features reports failure by returning None; unpacking that
        # directly would fail with an opaque "cannot unpack NoneType" error.
        raise RuntimeError(f"Could not load features from '{feat_path}'")
    X_train, y_train, X_test, X_cols = loaded

    results = {}
    for model_name, model in models.items():
        print(f"\n--- Training {model_name} ---")

        # Fit the model on all training rows.
        model.fit(X_train, y_train)

        # Predict on the training rows (column 1 = positive class).
        y_pred_proba = model.predict_proba(X_train)[:, 1]
        y_pred = model.predict(X_train)

        # Evaluation metrics. The local is named auc_value so it does not
        # shadow the `auc` function imported from sklearn.metrics.
        auc_value = roc_auc_score(y_train, y_pred_proba)
        accuracy = accuracy_score(y_train, y_pred)
        report = classification_report(y_train, y_pred)
        conf_matrix = confusion_matrix(y_train, y_pred)
        fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba)

        # Report the results.
        print(f"{model_name} Results:")
        print(f"AUC: {auc_value:.4f}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Classification Report:\n{report}")
        print(f"Confusion Matrix:\n{conf_matrix}")

        # Store the results.
        results[model_name] = {
            "auc": auc_value,
            "accuracy": accuracy,
            "classification_report": report,
            "confusion_matrix": conf_matrix,
            "fpr": fpr,
            "tpr": tpr,
            "thresholds": thresholds
        }

    print("\n" + "="*20, "Training Complete", "="*20)
    return results

# Main

In thông tin cơ bản về các cột trong train/test.

In [17]:
def print_data_info(df_train, df_test):
    """Print the column sets of the train and test frames and how they relate.

    Column names are printed in sorted order so the output is stable across
    runs (iteration order of a set of strings is not).

    Args:
        df_train: Training DataFrame.
        df_test: Test DataFrame.

    Returns:
        None. Output goes to stdout.
    """
    train_cols = set(df_train.columns)
    test_cols = set(df_test.columns)

    def _ordered(columns):
        # key=str keeps sorting safe if column labels have mixed types.
        return sorted(columns, key=str)

    print("-" * 10, "Dataset Information", "-" * 10)
    print(f"Train columns: {_ordered(train_cols)}")
    print(f"Test columns:  {_ordered(test_cols)}")
    # Columns present in both frames. This line was previously labelled
    # "Union" although it printed the intersection.
    print("Intersection:", _ordered(train_cols & test_cols))
    # Columns only the training frame has (e.g. the label).
    print("Difference (train - test):", _ordered(train_cols - test_cols))


Tiền xử lý dữ liệu train/test bằng hàm preprocessing_feature_01.

In [18]:
def process_features(df_train, df_test, verbose=True):
    """Run ``preprocessing_feature_01`` on both frames and return numpy arrays.

    Args:
        df_train: Raw training DataFrame (must include ``Survived``).
        df_test: Raw test DataFrame.
        verbose: Forwarded as ``is_debug`` to the preprocessing function.

    Returns:
        Tuple ``(X_train, y_train, X_test, X_cols)``: training feature matrix,
        training labels (the ``Output`` column), test feature matrix, and the
        list of feature column names taken from the training features.
    """
    print("\n Processing training data...")
    train_features, _ = preprocessing_feature_01(df_train, is_train=True, is_debug=verbose)

    print(" Processing test data...")
    test_features, _ = preprocessing_feature_01(df_test, is_train=False, is_debug=verbose)

    # Separate the label from the predictors once and reuse the result.
    labels = train_features['Output'].values
    predictors = train_features.drop('Output', axis=1)

    return predictors.values, labels, test_features.values, predictors.columns.tolist()


Lưu dữ liệu đặc trưng đã xử lý vào file .npz.

In [19]:
def save_features(X_train, y_train, X_test, X_cols, save_dir):
    """Write the feature arrays to ``<save_dir>/data.npz``.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        X_cols: Feature column names.
        save_dir: Target directory; created if it does not exist.

    Returns:
        None. The archive stores the keys ``X_train``, ``y_train``,
        ``X_test`` and ``X_cols``.
    """
    os.makedirs(save_dir, exist_ok=True)
    feat_save_path = os.path.join(save_dir, 'data.npz')

    arrays = dict(X_train=X_train, y_train=y_train, X_test=X_test, X_cols=X_cols)
    np.savez(feat_save_path, **arrays)

    print(f" Features saved successfully to: {feat_save_path}")


Hàm chính điều phối toàn bộ pipeline Feature Engineering.

In [20]:
def run_feature_engineering(params_cfg):
    """Run the feature-engineering pipeline and save the result to disk.

    NOTE(review): the input frames are the module-level ``df_train`` and
    ``df_test``; ``params_cfg["data_dir"]`` is read (so the key is required)
    but nothing is loaded from it. Confirm whether loading from ``data_dir``
    was intended.

    Args:
        params_cfg: Config dict with required keys ``data_dir`` and
            ``save_dir`` and optional ``verbose`` (default True).

    Returns:
        None. Features are written via ``save_features``.
    """
    data_dir = params_cfg["data_dir"]  # required key; value currently unused
    save_dir = params_cfg["save_dir"]
    verbose = params_cfg.get("verbose", True)

    if verbose:
        print_data_info(df_train, df_test)

    # Build the feature arrays, then persist them in the experiment directory.
    feature_arrays = process_features(df_train, df_test, verbose=verbose)
    save_features(*feature_arrays, save_dir)


Chạy huấn luyện mô hình sau khi đã có file .npz.

In [24]:
def run_training(params_cfg):
    """Run k-fold training on the saved features and print a summary table.

    Args:
        params_cfg: Config dict with required keys ``feat_path`` (path to the
            ``.npz`` feature archive) and ``seed``.

    Returns:
        None. If the feature file does not exist, an error is printed and the
        function returns without training.
    """
    print("\n[ACTION]: Running Model Training (train)")

    feat_path = params_cfg["feat_path"]
    seed = params_cfg["seed"]

    # Training needs the archive produced by the feature-engineering action.
    feature_file_missing = not os.path.exists(feat_path)
    if feature_file_missing:
        print(f"‚ùå Error: Feature file not found at {feat_path}")
        print("üëâ Please run the 'main_feat01' action first to generate features.")
        return

    # Cross-validated training of all registered models.
    training_results_kfold = train_and_evaluate_model_with_kfold(feat_path=feat_path, seed=seed)

    # Summarise the results, one row per model.
    print("\n" + "=" * 20, "üìä Final Training Summary", "=" * 20)

    summary_data = []
    for name, m in training_results_kfold.items():
        row = {
            "Model": name,
            "Overall Accuracy": m['overall_accuracy'],
            "Overall AUC": m['overall_auc'],
            "Mean Fold AUC": m.get('mean_auc', np.nan),
            "Std Fold AUC": m.get('std_auc', np.nan)
        }
        summary_data.append(row)

    # Best overall AUC first.
    summary_df = pd.DataFrame(summary_data)
    summary_df = summary_df.sort_values(by="Overall AUC", ascending=False)
    print(summary_df.to_string(index=False, float_format="%.4f"))


main - khối thực thi

In [23]:
if __name__ == "__main__":

    # Experiment configuration. "action" selects the pipeline stage:
    # "main_feat01" builds and saves the features, "train" fits the models.
    params_cfg = dict(
        action="train",
        seed=42,
        exp_dir=os.path.abspath('./exps'),
        exp_name="featbase_251028",
        data_dir=os.path.abspath("./"),  # expected location of train.csv/test.csv
        verbose=True,
    )

    # Derived paths: everything for this experiment lives in exp_dir/exp_name.
    params_cfg["save_dir"] = os.path.join(params_cfg["exp_dir"], params_cfg["exp_name"])
    params_cfg["feat_path"] = os.path.join(params_cfg["exp_dir"], params_cfg["exp_name"], "data.npz")

    os.makedirs(params_cfg["save_dir"], exist_ok=True)

    # Echo the effective configuration.
    for k, v in params_cfg.items():
        print(f"+ {k}: {v}")

    # Dispatch on the requested action.
    if params_cfg["action"] == "train":
        run_training(params_cfg)
    elif params_cfg["action"] == "main_feat01":
        run_feature_engineering(params_cfg)
    else:
        print(f"‚ùå Unknown action '{params_cfg['action']}'")


+ action: train
+ seed: 42
+ exp_dir: d:\Basic-of-Machine-learning\train\exps
+ exp_name: featbase_251028
+ data_dir: d:\Basic-of-Machine-learning\train
+ verbose: True
+ save_dir: d:\Basic-of-Machine-learning\train\exps\featbase_251028
+ feat_path: d:\Basic-of-Machine-learning\train\exps\featbase_251028\data.npz

[ACTION]: Running Model Training (train)
 Loading features from: d:\Basic-of-Machine-learning\train\exps\featbase_251028\data.npz
 Features loaded: [np.str_('FamilySize'), np.str_('Sex'), np.str_('Age'), np.str_('Fare'), np.str_('Pclass'), np.str_('Cabin'), np.str_('Embarked'), np.str_('Surname')]

--- Training LogisticRegression ---
  Fold 1 AUC: 0.8882
  Fold 2 AUC: 0.8500
  Fold 3 AUC: 0.8338
  Fold 4 AUC: 0.8294
  Fold 5 AUC: 0.8736
-> Mean AUC for LogisticRegression: 0.8550 +/- 0.0227

Overall Metrics for LogisticRegression:
  Overall AUC: 0.8539
  Overall Accuracy: 0.8002
  Overall Classification Report:
              precision    recall  f1-score   support

           

# End