In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
# Marker output: reaching this line means every import in this cell resolved.
print("Setup complete")

In [None]:
# Load data
# Take the iris dataset, keep only its first two feature columns, and build a
# binary target that is 1 where the original class label equals 0, else 0.
iris = load_iris()
X = iris.data[:, 0:2]
y = np.equal(iris.target, 0).astype(int)
print(f"Original shape: {X.shape}")

In [None]:
# Polynomial features
# Expand X with every term up to degree 2 (the original columns, their squares
# and their pairwise product); include_bias=False leaves out the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X)
X_poly = poly.transform(X)
print(f"Polynomial features shape: {X_poly.shape}")
print(f"Feature names: {poly.get_feature_names_out()}")

In [None]:
# Feature interactions
# Wrap the two base columns in a DataFrame and derive hand-crafted features
# from them in a single chained `assign` (keyword order = column order).
# NOTE(review): 'Product' is computed exactly like 'Interaction', so the two
# columns hold identical values.
df = pd.DataFrame(X, columns=['Feature1', 'Feature2']).assign(
    Interaction=lambda frame: frame['Feature1'] * frame['Feature2'],
    # The +1 in the denominator presumably guards against division by zero.
    Ratio=lambda frame: frame['Feature1'] / (frame['Feature2'] + 1),
    Sum=lambda frame: frame['Feature1'] + frame['Feature2'],
    Product=lambda frame: frame['Feature1'] * frame['Feature2'],
)
print(df.head())

In [None]:
# Binning
# Cut Feature1 into 5 bins; labels=False stores each row's integer bin index
# rather than an interval label.
feature1_bin_index = pd.cut(df['Feature1'], bins=5, labels=False)
df['Binned'] = feature1_bin_index
print(df.loc[:, ['Feature1', 'Binned']].head())

In [None]:
# Log transformation
# Store log(1 + Feature1) as a new column, then draw the raw and transformed
# distributions as two side-by-side histograms on explicit Axes objects.
df['Log_F1'] = np.log1p(df['Feature1'])

fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(10, 4))

ax_raw.hist(df['Feature1'], bins=20, alpha=0.7)
ax_raw.set_title('Original')

ax_log.hist(df['Log_F1'], bins=20, alpha=0.7)
ax_log.set_title('Log Transformed')

fig.tight_layout()
plt.show()

In [None]:
# Compare models
# Score a linear model on the two raw features versus their polynomial
# expansion using 5-fold cross-validation. LinearRegression's default scorer is
# R², which is what the printed means report.
#
# The iris rows are stored sorted by class, and an integer `cv=5` means plain
# unshuffled KFold for a regressor. That produces test folds containing a single
# target value, where R² is degenerate. Shuffled folds with a fixed seed avoid
# this while keeping the run reproducible.
cv_folds = KFold(n_splits=5, shuffle=True, random_state=42)

# Scaling lives inside the pipeline so that, within every fold, the scaler is
# fitted on the training rows only and never sees the rows being scored.
original_model = make_pipeline(StandardScaler(), LinearRegression())
poly_model = make_pipeline(StandardScaler(), LinearRegression())

scores_original = cross_val_score(original_model, X, y, cv=cv_folds)
scores_poly = cross_val_score(poly_model, X_poly, y, cv=cv_folds)

# Fully scaled copies stay available under their original names for any other
# cell that reads them; they are deliberately not used for cross-validation.
X_scaled = StandardScaler().fit_transform(X)
X_poly_scaled = StandardScaler().fit_transform(X_poly)

print(f"Original features R²: {scores_original.mean():.3f}")
print(f"Polynomial features R²: {scores_poly.mean():.3f}")

In [None]:
# Tests
# Lightweight sanity checks over the objects built in the cells above. Each
# entry in `test_results` is a (label, passed, details) tuple; the report at
# the end prints one line per entry plus a pass count. Nothing raises on a
# failed check: failures are only reported in the printed summary.
test_results = []

# The polynomial expansion must yield more columns than the raw feature matrix.
test1 = X_poly.shape[1] > X.shape[1]
test_results.append(("Test 1: Polynomial increases features", test1, f"From {X.shape[1]} to {X_poly.shape[1]}"))

# The hand-crafted interaction column must exist on the frame.
test2 = 'Interaction' in df.columns
test_results.append(("Test 2: Feature interaction created", test2, "Interaction column present"))

# The log-transformed column must not contain NaN.
test3 = not np.isnan(df['Log_F1']).any()
test_results.append(("Test 3: Log transformation valid", test3, "No NaN values"))

# Both cross-validation runs must return one score per fold (5 folds).
test4 = len(scores_original) == 5 and len(scores_poly) == 5
test_results.append(("Test 4: Cross-validation works", test4, f"Scores: {len(scores_original)}"))

# "Improves" means the polynomial mean score beats the baseline mean score.
# Comparing against 0 would pass even when the polynomial features score worse
# than the original ones.
test5 = scores_poly.mean() > scores_original.mean()
test_results.append(("Test 5: Polynomial improves score", test5, f"Improvement: {scores_poly.mean() - scores_original.mean():.3f}"))

print("\n" + "="*60)
print("PRACTICAL 9: FEATURE ENGINEERING - TEST RESULTS")
print("="*60)
passed = sum(1 for _, result, _ in test_results if result)
for test_name, result, details in test_results:
    status = "✅ PASS" if result else "❌ FAIL"
    print(f"{status} | {test_name} | {details}")
print(f"\nTotal: {passed}/{len(test_results)} tests passed")
print("="*60)