In [2]:
# FeatureForge Titanic Example

import pandas as pd
import numpy as np

from feature_forge import FeatureSmith

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42


In [3]:
# Load dataset
df = pd.read_csv("data/train.csv")

print(df.shape)
df.head()


(418, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
target = "Survived"

features = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Ticket",
    "Fare",
    "Cabin",
    "Embarked",
    "Name"
]

X = df[features]
y = df[target]

print(f"Samples: {len(X)}")
print(f"Features: {len(X.columns)}")
print(f"Survival rate: {y.mean():.2%}")


Samples: 418
Features: 10
Survival rate: 36.36%


In [5]:
# Handle missing values
X["Age"] = X["Age"].fillna(X["Age"].median())
X["Embarked"] = X["Embarked"].fillna(X["Embarked"].mode()[0])
X["Cabin"] = X["Cabin"].fillna("Missing")
X["Ticket"] = X["Ticket"].fillna("Unknown")
X["Name"] = X["Name"].fillna("Unknown")


## Baseline model (before FeatureForge)

In [7]:
# Baseline: numeric features only
X_numeric = X.select_dtypes(include=[np.number])

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

baseline_scores = cross_val_score(
    rf, X_numeric, y,
    cv=5,
    scoring="accuracy"
)

print(f"Baseline Accuracy: {baseline_scores.mean():.4f} "
      f"(± {baseline_scores.std():.4f})")


Baseline Accuracy: 0.6101 (± 0.0556)


## Feature engineering with FeatureForge

In [8]:
smith = FeatureSmith(
    X=X,
    y=y,
    task="classification",
    verbose=True
)

X_augmented = smith.forge(
    strategies=[
        "encoding",       # Categorical encoding
        "interactions",   # Feature interactions
        "polynomial"      # Numeric transformations
    ],
    max_features=80,
    validate=True
)

print(f"Original features: {X.shape[1]}")
print(f"Total features after forge: {X_augmented.shape[1]}")


FeatureSmith initialized
  - Task: classification
  - Samples: 418
  - Original features: 10

Forging features with strategies: ['encoding', 'interactions', 'polynomial']

  Generating encoding features...
    ⚠ Removed 4 potentially leaky features
    Generated: 8 features
    Total new features: 8

  Generating interactions features...
    Generated: 46 features
    Total new features: 54

  Generating polynomial features...
PolynomialFeatures does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modu

In [12]:
# Impute missing numeric values (required for polynomial features)
numeric_cols = X.select_dtypes(include=[np.number]).columns

X[numeric_cols] = X[numeric_cols].fillna(
    X[numeric_cols].median()
)

In [13]:
smith = FeatureSmith(
    X=X,
    y=y,
    task="classification",
    verbose=True
)

X_augmented = smith.forge(
    strategies=[
        "encoding",       # Categorical encoding
        "interactions",   # Feature interactions
        "polynomial"      # Numeric transformations
    ],
    max_features=80,
    validate=True
)

print(f"Original features: {X.shape[1]}")
print(f"Total features after forge: {X_augmented.shape[1]}")


FeatureSmith initialized
  - Task: classification
  - Samples: 418
  - Original features: 10

Forging features with strategies: ['encoding', 'interactions', 'polynomial']

  Generating encoding features...
    ⚠ Removed 4 potentially leaky features
    Generated: 8 features
    Total new features: 8

  Generating interactions features...
    Generated: 46 features
    Total new features: 54

  Generating polynomial features...
    Generated: 15 features
    Total new features: 69

✓ Feature generation complete!
  - Original features: 10
  - New features: 69
  - Total features: 79
Original features: 10
Total features after forge: 79


In [14]:
ranked_features = smith.rank_features(
    model_type="rf",
    method="importance"
)

ranked_features.head(15)



Ranking 79 features using rf...
✓ Feature ranking complete!

Top 5 features:
  1. Sex: 0.4579
  2. Embarked_target_enc: 0.0219
  3. Name: 0.0211
  4. Ticket: 0.0204
  5. Fare_minus_Age: 0.0169


Unnamed: 0,feature,score
0,Sex,0.457864
1,Embarked_target_enc,0.021854
2,Name,0.021106
3,Ticket,0.020421
4,Fare_minus_Age,0.01693
5,poly_Fare_x_Pclass,0.015106
6,Cabin_target_enc,0.013957
7,Fare_div_Age,0.013686
8,Fare_plus_Pclass,0.013373
9,Fare_times_Age,0.012998


Build a numeric-only model matrix

In [17]:
# Numeric-only matrix for sklearn models
X_model = X_augmented.select_dtypes(include=[np.number])

In [18]:
TOP_K = 30
top_features = ranked_features.head(TOP_K)["feature"].tolist()


Filter your top features to numeric only

In [19]:
top_features_numeric = [
    f for f in top_features if f in X_model.columns
]

print(f"Using {len(top_features_numeric)} numeric features "
      f"out of {len(top_features)} selected")


Using 27 numeric features out of 30 selected


Model evaluation with engineered features

In [21]:
engineered_scores = cross_val_score(
    rf,
    X_model[top_features_numeric],
    y,
    cv=5,
    scoring="accuracy"
)

print(f"Engineered Accuracy: {engineered_scores.mean():.4f} "
      f"(± {engineered_scores.std():.4f})")

# Calculate improvement over baseline

improvement = (
    engineered_scores.mean() - baseline_scores.mean()
) / baseline_scores.mean() * 100

print(f"Improvement over baseline: {improvement:+.2f}%")


Engineered Accuracy: 0.6222 (± 0.0594)
Improvement over baseline: +1.98%


Redundancy removal

In [23]:
optimal_features = smith.remove_redundancy(
    X=X_model[top_features_numeric],
    threshold=0.95
)

print(f"Features after redundancy removal: {len(optimal_features)}")

final_scores = cross_val_score(
    rf,
    X_model[optimal_features],
    y,
    cv=5,
    scoring="accuracy"
)

print(f"Final Accuracy: {final_scores.mean():.4f}")



Removing redundant features (threshold=0.95)...
    Correlation threshold: 0.95
    Removed 15 redundant features
✓ Redundancy removal complete!
  - Original: 27 features
  - Removed: 15 redundant features
  - Remaining: 12 features
Features after redundancy removal: 12
Final Accuracy: 0.6293


## Generate FeatureForge report

In [24]:
summary = smith.get_summary()
summary

{'task': 'classification',
 'original_features': 10,
 'samples': 418,
 'target_type': 'int64',
 'strategies_used': ['encoding', 'interactions', 'polynomial'],
 'total_features': 79,
 'generated_features': 69,
 'features_scored': 79}

In [25]:
smith.generate_report("titanic_feature_report.html")


Generating feature report...
✓ Report saved to: titanic_feature_report.html
