In [1]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [2]:
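# Optional one-time setup: uncomment the two lines below to install CatBoost
# into the environment used by this kernel.
# import sys
# !{sys.executable} -m pip install catboost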

In [3]:
# Load the boy and girl datasets from CSV files; the paths are relative to the
# current working directory of the kernel.
df_boy = pd.read_csv('casp5_boy_processed.csv')
df_girl = pd.read_csv('casp5_girl_processed.csv')

<h1>BOYS</h1>

<h3>Find highly correlated features</h3>

In [4]:
def split_X_y(df_org):
    """
    Separates a dataset into model inputs and the classification target.

    Parameters:
        df_org (pd.DataFrame): Frame containing at least the columns
            'height_percentile_label' and 'height'. It is left unmodified.

    Returns:
        tuple: (X, y) where X holds every column except the two named above
        and y is the 'height_percentile_label' column.
    """
    # Work on a private copy so neither result aliases the caller's frame.
    working = df_org.copy()
    target = working['height_percentile_label']
    # 'height' is dropped together with the label derived from it.
    features = working.drop(columns=['height_percentile_label', 'height'])
    return features, target

In [5]:
def find_highly_correlated_features(X, threshold=0.8, plot=False):
    """
    Finds and optionally visualizes highly correlated features in a DataFrame.

    Parameters:
        X (pd.DataFrame): The input feature DataFrame. Only numeric and boolean
            columns take part in the correlation; other columns are ignored.
        threshold (float): The absolute-correlation threshold for identifying
            high correlations (default is 0.8). The comparison is strict (>).
        plot (bool): Whether to plot a heatmap of the correlation matrix
            (default is False).

    Returns:
        list of tuples: (feature_a, feature_b, correlation) for every unordered
        pair of distinct columns whose absolute correlation exceeds the
        threshold. The two names are sorted within each tuple, and the list is
        ordered by descending absolute correlation with ties broken by name.
    """
    # Restrict to columns a correlation can be computed for, so that a stray
    # text column does not make DataFrame.corr() fail.
    numeric_X = X.select_dtypes(include=["number", "bool"])
    corr_matrix = numeric_X.corr()

    # Walk the upper triangle only: each unordered pair is visited exactly
    # once, so no set-based de-duplication is needed afterwards.
    columns = list(corr_matrix.columns)
    values = corr_matrix.to_numpy()
    high_corr_pairs = []
    for row in range(len(columns)):
        for col in range(row + 1, len(columns)):
            value = values[row, col]
            # NaN correlations (e.g. constant columns) compare False and are skipped.
            if abs(value) > threshold:
                first, second = sorted((columns[row], columns[col]))
                high_corr_pairs.append((first, second, float(value)))

    # Strongest correlations first; names break ties so the order is reproducible.
    high_corr_pairs.sort(key=lambda pair: (-abs(pair[2]), pair[0], pair[1]))

    # Optional: Visualize the correlation matrix
    if plot:
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
        plt.title("Correlation Matrix")
        plt.show()

    return high_corr_pairs
# Build the boys' feature matrix and target. X and y stay at notebook level
# and are read by main() further down.
X,y = split_X_y(df_boy)
# Last expression of the cell: displays the feature pairs whose absolute
# correlation exceeds the default threshold of 0.8.
find_highly_correlated_features(X)

[('University ', 'id2           ', 0.9999950542471897),
 ('age  ', 'catage ', 0.9414744929864061),
 ('hp_102              ', 'hp_104              ', 0.9208794269034223),
 ('hp_103             ', 'hp_105             ', 0.8611226437958259),
 ('bmi1               ', 'weight              ', 0.844684792132679),
 ('cluster ', 'region ', 0.8398711765611132),
 ('s_106 ', 's_115 ', 0.8301259475862331),
 ('age  ', 'cross ', 0.825288413822319),
 ('weight              ', 'wrist              ', 0.8089198569244078),
 ('waist               ', 'weight              ', 0.8079443599610233)]

<h2>Tree-based models</h2>

Models like Random Forests, Gradient Boosting (e.g., XGBoost, CatBoost), and Decision Trees are robust to correlated features.

CatBoost

In [6]:
def select_features(X, y, k=50, Active=False, random_state=42):
    """
    Returns the names of the columns to train on.

    Parameters:
        X (pd.DataFrame): Candidate feature columns.
        y: Target labels aligned with the rows of X.
        k (int or "all"): Number of features to keep when selection is active.
            Values larger than the number of columns are capped.
        Active (bool): When False (the default) no selection is performed and
            every column name is returned; k is then ignored.
        random_state (int): Seed forwarded to mutual_info_classif so that the
            selection is reproducible between runs.

    Returns:
        list: Column names, in their original order.
    """
    if not Active:
        return X.columns.tolist()

    # Cap k at the number of columns; depending on the scikit-learn version a
    # larger k either raises an error or emits a warning.
    if k != "all":
        k = min(k, X.shape[1])

    def _seeded_mutual_info(features, target):
        # The mutual-information estimate uses random numbers internally, so
        # it is seeded here to keep the chosen feature set stable.
        return mutual_info_classif(features, target, random_state=random_state)

    selector = SelectKBest(_seeded_mutual_info, k=k)
    selector.fit(X, y)
    return X.columns[selector.get_support()].tolist()

In [28]:
def train_catboost(X_train, y_train, iterations=100, random_state=42):
    """
    Fits a CatBoost classifier on the training data.

    Parameters:
        X_train: Training features.
        y_train: Training labels.
        iterations (int): Value passed as the classifier's `iterations`.
        random_state (int): Value passed as the classifier's `random_seed`.

    Returns:
        tuple: (fitted classifier, iterations). The iteration count is echoed
        back so the caller can log it with the model.
    """
    classifier = CatBoostClassifier(
        iterations=iterations,
        random_seed=random_state,
        verbose=0,  # silence per-iteration training output
    )
    classifier.fit(X_train, y_train)
    return classifier, iterations

def train_xgboost(X_train, y_train,n_estimators = 400,learning_rate = 0.1,max_depth = 10, random_state=42):
    """
    Fits an XGBoost classifier on the training data.

    The deprecated `use_label_encoder` flag is no longer passed: recent
    XGBoost releases have dropped the built-in label encoder and reject or
    ignore the flag.

    Parameters:
        X_train: Training features.
        y_train: Training labels.
            NOTE(review): recent XGBoost versions presumably require
            integer-encoded class labels (0..n_classes-1); encode string
            labels before calling - confirm against the installed version.
        n_estimators (int): Number of boosting rounds.
        learning_rate (float): Boosting learning rate.
        max_depth (int): Maximum tree depth.
        random_state (int): Random seed.

    Returns:
        The fitted XGBClassifier. Unlike train_catboost, only the model is
        returned, not a tuple.
    """
    # NOTE(review): "logloss" is kept from the original; for a multi-class
    # target "mlogloss" may be the intended metric - TODO confirm.
    model = XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        eval_metric="logloss",
        random_state=random_state,
    )
    model.fit(X_train, y_train)
    return model


def evaluate_model(model, X_test, y_test):
    """
    Scores a fitted model on held-out data.

    Parameters:
        model: Any object exposing `predict(X)`.
        X_test: Test features.
        y_test: True labels for X_test.

    Returns:
        dict: {"accuracy": accuracy score of the predictions,
               "report": classification report in dictionary form}.
    """
    predictions = model.predict(X_test)
    return {
        "accuracy": accuracy_score(y_test, predictions),
        "report": classification_report(y_test, predictions, output_dict=True),
    }


# Log MLflow experiment
def log_mlflow_experiment(model_name, model,iterations, selected_features, metrics, report=None):
    """
    Records one training run (parameters, metrics and the model) in MLflow.

    Parameters:
        model_name (str): Label logged as a parameter and used to name the
            logged model ("<model_name>_model").
        model: Fitted estimator, logged through mlflow.sklearn.log_model.
        iterations: Logged as the "num_iterations" parameter.
        selected_features (list): Logged as the "selected_features" parameter.
        metrics (dict): Must contain "accuracy"; "report" is used when the
            `report` argument is omitted.
        report (dict, optional): Classification report in dictionary form.
            Defaults to metrics["report"].

    Logged metrics:
        "accuracy" plus "precision_<label>" and "recall_<label>" for every
        class present in the report.
    """
    if report is None:
        report = metrics["report"]

    # Collect per-class precision/recall before opening the run, so that a
    # malformed report fails early instead of leaving a half-logged run.
    # Non-dict entries (the scalar "accuracy") and the aggregate "... avg"
    # rows are skipped; only real class labels are logged.
    class_metrics = {}
    for label, scores in report.items():
        if not isinstance(scores, dict) or str(label).endswith(" avg"):
            continue
        class_metrics[f"precision_{label}"] = scores["precision"]
        class_metrics[f"recall_{label}"] = scores["recall"]

    with mlflow.start_run():
        # Log parameters
        mlflow.log_param("num_iterations", iterations)
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("selected_features", selected_features)

        # Log metrics
        mlflow.log_metric("accuracy", metrics["accuracy"])
        if class_metrics:
            mlflow.log_metrics(class_metrics)

        # Log the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")
        print(f"Run ID ({model_name}): {mlflow.active_run().info.run_id}")

In [29]:
def main(features=None, target=None, experiment_name="XGBoost1"):
    """
    Trains, evaluates and logs a CatBoost classifier.

    Parameters:
        features: Feature table to split into train/test. Defaults to the
            notebook-level `X`.
        target: Labels aligned with `features`. Defaults to the notebook-level `y`.
        experiment_name (str): MLflow experiment the run is recorded under.
            NOTE(review): the original name began with a stray, invisible
            non-ASCII combining character (it looks like U+0653); it has been
            removed here. Runs logged earlier remain under the old name. The
            name also says "XGBoost" although this pipeline trains CatBoost -
            confirm the intended experiment.
    """
    # Fall back to the notebook-level X / y so that a bare main() behaves as
    # before, while still allowing the data to be passed in explicitly.
    features = X if features is None else features
    target = y if target is None else target

    # Step 1: Select the MLflow experiment
    mlflow.set_experiment(experiment_name)

    # Step 2: Hold out 20% of the rows for evaluation (fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

    # Step 3: Select features. select_features is called with its default
    # Active=False, so every column is returned and k has no effect.
    selected_features = select_features(X_train, y_train, k=2)
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    # Step 4: Train CatBoost, evaluate on the hold-out set and log the run
    catboost_model, iterations = train_catboost(X_train_selected, y_train)
    catboost_metrics = evaluate_model(catboost_model, X_test_selected, y_test)
    log_mlflow_experiment("CatBoost", catboost_model, iterations, selected_features, catboost_metrics, catboost_metrics["report"])
    print(f"CatBoost Accuracy: {catboost_metrics['accuracy']:.2f}")


# Run the pipeline. The guard is true when this cell is executed in the
# notebook kernel (or the code is run as a script) and false when the code is
# imported as a module.
if __name__ == "__main__":
    main()



Run ID (CatBoost): 217a5ce0e881410286d41797d8e44ae3
CatBoost Accuracy: 0.87
