# 07 — Feature Engineering & Selection
**Data Analysis Portfolio**

**Engineering:** encoding, scaling, polynomial terms, binning, date features, interactions, transforms
**Selection:** variance threshold, correlation, SelectKBest, RFE, Random Forest importance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Seed NumPy's global RNG so every np.random draw in the cells below is
# reproducible from run to run (as long as the cells execute in order).
np.random.seed(42)
print("All libraries loaded.")

## 1. Build Dataset

In [None]:
# Synthetic HR dataset: numeric, categorical and date columns, plus two
# pure-noise columns and one constant column.
n = 500

# Columns are drawn one at a time, in the order they appear in the frame,
# so the sequence of np.random calls (and thus the data for a given seed)
# is fixed.
columns = {}
columns['age'] = np.random.randint(22, 60, n)
raw_salary = np.random.lognormal(10.8, 0.4, n)
columns['salary'] = raw_salary.round(0).clip(25000, 200000)
columns['experience'] = np.random.randint(0, 35, n)
columns['num_projects'] = np.random.randint(1, 20, n)
columns['department'] = np.random.choice(['IT', 'HR', 'Finance', 'Marketing'], n)
columns['education'] = np.random.choice(["Bachelor's", "Master's", "PhD"], n)
columns['gender'] = np.random.choice(['Male', 'Female'], n)
columns['join_date'] = pd.date_range('2015-01-01', periods=n, freq='3D')
columns['noise_1'] = np.random.randn(n)
columns['noise_2'] = np.random.randn(n)
columns['constant_col'] = 1
df = pd.DataFrame(columns)

# Target: Bernoulli draw whose log-odds are a linear function of four
# numeric columns; the noise and constant columns play no part in it.
log_odds = (
    0.03 * df['age']
    + 0.00001 * df['salary']
    + 0.05 * df['experience']
    + 0.1 * df['num_projects']
    - 3
)
uniform_draw = np.random.rand(n)
promotion_prob = 1 / (1 + np.exp(-log_odds))
df['promoted'] = (uniform_draw < promotion_prob).astype(int)

promoted_pct = round(df['promoted'].mean() * 100, 1)
print("Shape:", df.shape, "| Promoted%:", promoted_pct)

## 2. Feature Engineering

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
fe = df.copy()

# --- Numeric transforms -------------------------------------------------
# log1p compresses the right tail of salary; the square adds a polynomial term.
fe['salary_log'] = np.log1p(fe['salary'])
fe['experience_squared'] = fe['experience'].pow(2)

# --- Interaction features (ratios / products of existing columns) ------
exp_age_ratio = fe['experience'].div(fe['age'])
fe['exp_per_age'] = exp_age_ratio.round(4)

salary_project_ratio = fe['salary'].div(fe['num_projects'])
fe['salary_per_project'] = salary_project_ratio.round(2)

scaled_output = fe['num_projects'].mul(fe['salary']).div(1e6)
fe['productivity'] = scaled_output.round(4)

# --- Binning ------------------------------------------------------------
# pd.cut uses fixed edges; pd.qcut uses quartiles of the observed salaries.
age_edges = [18, 30, 40, 50, 65]
age_names = ['Junior', 'Mid', 'Senior', 'Expert']
fe['age_group'] = pd.cut(fe['age'], bins=age_edges, labels=age_names)
fe['salary_tier'] = pd.qcut(fe['salary'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

# --- Date features ------------------------------------------------------
joined = fe['join_date'].dt
fe['join_year'] = joined.year
fe['join_month'] = joined.month
fe['join_quarter'] = joined.quarter

# Tenure is measured against a fixed reference date, not "today".
reference_date = pd.Timestamp('2024-01-01')
fe['tenure_days'] = (reference_date - fe['join_date']).dt.days
# NOTE(review): the synthetic join dates end in early 2019, so every tenure
# exceeds 1000 days and this flag looks constant — confirm the threshold.
fe['is_long_tenure'] = fe['tenure_days'].gt(1000).astype(int)

n_before, n_after = df.shape[1], fe.shape[1]
print(f"Features: {n_before} → {n_after}  (+{n_after - n_before} new)")
preview_cols = ['salary_log', 'exp_per_age', 'tenure_days', 'productivity']
print(fe[preview_cols].head(3))

## 3. Encoding

In [None]:
# Turn the categorical columns into numeric ones on a copy of `fe`.
enc = fe.copy()

# Label encode — binary
le = LabelEncoder()
enc['gender_label']      = le.fit_transform(enc['gender'])

# Ordinal encode — ordered categories (explicit map keeps the order under
# our control; any level missing from the map becomes NaN)
enc['education_ordinal'] = enc['education'].map({"Bachelor's":0,"Master's":1,"PhD":2})

# One-Hot encode — nominal
# drop_first=True drops one level so the dummies are not perfectly collinear.
# dtype=int is required: pandas >= 2.0 returns bool dummies by default, and
# the feature matrix is later built with select_dtypes(include=[np.number]),
# which excludes bool columns — the dept_* features would silently vanish.
dept_ohe = pd.get_dummies(enc['department'], prefix='dept', drop_first=True, dtype=int)
enc = pd.concat([enc, dept_ohe], axis=1)

# Drop originals (raw categoricals, the raw date, the binned categoricals)
# NOTE(review): noise_1 / noise_2 / constant_col are removed here as well, so
# the selection methods further down never see them — confirm this is intended.
enc = enc.drop(columns=['gender','department','education','join_date','age_group','salary_tier',
                          'noise_1','noise_2','constant_col'])

print("Shape after encoding:", enc.shape)
print("OHE columns:", [c for c in enc.columns if c.startswith('dept')])

## 4. Scaling Comparison

In [None]:
# Grid of histograms: one row per numeric column, one column per scaler,
# to show how each scaler changes the range/shape of the same data.
scale_cols = ['age', 'salary', 'experience', 'num_projects', 'tenure_days']
X_s = enc[scale_cols].dropna()

# (panel title, transformer) pairs; None means "plot the raw values".
scalers = [
    ('Original', None),
    ('MinMax', MinMaxScaler()),
    ('Standard', StandardScaler()),
    ('Robust', RobustScaler()),
]
# One colour per scaler column of the grid.
palette = ['salmon', 'steelblue', 'mediumseagreen', 'mediumpurple']

fig, axes = plt.subplots(len(scale_cols), 4, figsize=(16, 12))
fig.suptitle('Scaling Comparison', fontsize=13, fontweight='bold')

for j, col in enumerate(scale_cols):
    # Scalers expect 2-D input, hence the single-column DataFrame.
    data = X_s[[col]]
    for k, (name, scaler) in enumerate(scalers):
        ax = axes[j, k]
        if scaler is None:
            vals = data[col].values
        else:
            vals = scaler.fit_transform(data).flatten()
        ax.hist(vals, bins=20, edgecolor='white', color=palette[k])
        # Only the left-most panel of each row carries the column name.
        ax.set_ylabel('' if k != 0 else col, fontsize=8)
        if j == 0:
            ax.set_title(name)

plt.tight_layout()
plt.savefig('/home/claude/data_analysis_portfolio/notebooks/07_scaling.png', dpi=100)
plt.show()

## 5. Feature Selection

In [None]:
# Feature matrix: every column except the target, restricted to numeric
# dtypes, with any missing values replaced by 0. Target: the promoted flag.
predictors = enc.drop(columns=['promoted'])
numeric_predictors = predictors.select_dtypes(include=[np.number])
X = numeric_predictors.fillna(0)
y = enc['promoted']

print("Features:", X.shape[1])
print(X.columns.tolist())

In [None]:
# METHOD 1 — Variance Threshold
# Flags features whose variance is at or below 0.01 (near-constant columns).
vt = VarianceThreshold(threshold=0.01).fit(X)
kept_mask = vt.get_support()
dropped = X.columns[~kept_mask].tolist()
print(f"Variance Threshold — dropped {len(dropped)}: {dropped}")

In [None]:
# METHOD 2 — Correlation Filter
# Absolute pairwise correlations; only the strict upper triangle is inspected
# so each pair is counted once and the diagonal is ignored.
corr = X.corr().abs()
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(upper_mask)

# A column is flagged when it correlates above 0.90 with any earlier column.
too_correlated = (upper > 0.90).any(axis=0)
hi = upper.columns[too_correlated].tolist()
print(f"High correlation (>0.90): {hi}")

plt.figure(figsize=(12, 9))
sns.heatmap(
    corr,
    cmap='coolwarm',
    linewidths=0.3,
    xticklabels=True,
    yticklabels=True,
)
plt.title('Feature Correlation Matrix')
plt.xticks(fontsize=7, rotation=45, ha='right')
plt.yticks(fontsize=7)
plt.tight_layout()
plt.savefig('/home/claude/data_analysis_portfolio/notebooks/07_corr.png', dpi=100)
plt.show()

In [None]:
# METHOD 3 — SelectKBest
# Univariate filter: score each feature against the target with the ANOVA
# F-test and keep the 8 highest-scoring ones.
skb = SelectKBest(score_func=f_classif, k=8).fit(X, y)
chosen_mask = skb.get_support()
selected = X.columns[chosen_mask].tolist()

# All F-scores, best first, for inspection.
scores = pd.Series(skb.scores_, index=X.columns).sort_values(ascending=False)

print("SelectKBest top 8:", selected)
print(scores.head(10).round(2))

In [None]:
# METHOD 4 — Random Forest Feature Importance
# Embedded method: fit a forest and rank features by its importances.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)
imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

# Bar chart of the 12 most important features.
plt.figure(figsize=(10, 6))
top_for_plot = imp.head(12)
top_for_plot.plot(kind='bar', color='steelblue', edgecolor='white')
plt.title('Random Forest — Feature Importance', fontsize=13, fontweight='bold')
plt.xticks(rotation=35, ha='right')
plt.tight_layout()
plt.savefig('/home/claude/data_analysis_portfolio/notebooks/07_importance.png', dpi=100)
plt.show()

top5 = imp.head(5).round(4).to_dict()
print("Top 5:", top5)

In [None]:
# METHOD 5 — RFE
# Wrapper method: repeatedly fit the estimator and discard the weakest
# feature until 8 remain. With LogisticRegression the "weakest" feature is the
# one with the smallest coefficient magnitude, and coefficients are only
# comparable when all features share a scale. The raw matrix mixes salary
# (tens of thousands) with 0/1 dummies, so it is standardised first; this also
# helps the solver converge within max_iter.
X_scaled = pd.DataFrame(
    StandardScaler().fit_transform(X),
    columns=X.columns,
    index=X.index,
)
rfe = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=8)
rfe.fit(X_scaled, y)
rfe_feats = X.columns[rfe.support_].tolist()
print("RFE selected:", rfe_feats)

## 6. Compare All Methods

In [None]:
# Line up the three top-8 lists (SelectKBest, RF importance, RFE) and report
# the features that all of them agree on.
rf_top8 = imp.head(8).index.tolist()
all3 = set(selected).intersection(rf_top8, rfe_feats)

banner = "=" * 55
print(banner)
print("FEATURE SELECTION SUMMARY")
print(banner)
print(f"Total features:        {X.shape[1]}")
print(f"SelectKBest:           {selected}")
print(f"RF Importance (top 8): {rf_top8}")
print(f"RFE:                   {rfe_feats}")
consensus = sorted(all3)
print(f"\nConsensus features (all 3 methods): {consensus}")
print("\n→ These are your MOST RELIABLE features for ML models.")

---
## ✅ Full Summary
| Category | Technique | Tool |
|----------|-----------|------|
| **Engineering** | Log / polynomial transform | `np.log1p()`, `** 2` |
| | Interaction | `col1/col2`, `col1*col2` |
| | Binning | `pd.cut()`, `pd.qcut()` |
| | Date features | `.dt.year`, tenure |
| **Encoding** | Binary | `LabelEncoder` |
| | Ordinal | manual `map()` |
| | Nominal | `pd.get_dummies()` |
| **Scaling** | Normalize | `MinMaxScaler` |
| | Standardize | `StandardScaler` |
| | Robust | `RobustScaler` |
| **Selection** | Variance | `VarianceThreshold` |
| | Correlation | manual filter |
| | Univariate | `SelectKBest` |
| | Wrapper | `RFE` |
| | Embedded | `RandomForestClassifier` |