In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Synthetic regression data: two uniform(0, 1) predictors, 100 rows each.
# The global NumPy seed is fixed so every run draws the same numbers.
np.random.seed(42)
n_samples = 100
x1_values = np.random.rand(n_samples)
x2_values = np.random.rand(n_samples)
X = pd.DataFrame({'X1': x1_values, 'X2': x2_values})

# Target = linear part + X1*X2 interaction + Gaussian noise (std 0.1).
# The interaction term is what a model on the raw columns alone cannot express.
noise = np.random.normal(0, 0.1, n_samples)
y = 3 * X['X1'] + 2 * X['X2'] + 4 * X['X1'] * X['X2'] + noise

# Hold out 20% of the rows for evaluation; the split is seeded so it is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: plain linear regression fitted on the untransformed columns
model1 = LinearRegression().fit(X_train, y_train)
y_pred1 = model1.predict(X_test)

# Score the baseline on the held-out rows
baseline_r2 = r2_score(y_test, y_pred1)
baseline_rmse = np.sqrt(mean_squared_error(y_test, y_pred1))

print("Model with original features:")
print(f"R² Score: {baseline_r2:.4f}")
print(f"RMSE: {baseline_rmse:.4f}")

# Feature engineering: expand the inputs into all degree-2 terms
# (squares plus the pairwise product); no constant/bias column is added.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly_train = poly.fit_transform(X_train)  # transformer is fitted on the training rows
X_poly_test = poly.transform(X_test)        # test rows reuse the fitted transformer

# Linear regression trained on the expanded feature matrix
model2 = LinearRegression().fit(X_poly_train, y_train)
y_pred2 = model2.predict(X_poly_test)

# Score the engineered-feature model on the held-out rows
poly_r2 = r2_score(y_test, y_pred2)
poly_rmse = np.sqrt(mean_squared_error(y_test, y_pred2))

print("\nModel with polynomial + interaction features:")
print(f"R² Score: {poly_r2:.4f}")
print(f"RMSE: {poly_rmse:.4f}")

# List the generated column names, labelled with the original column names
feature_names = poly.get_feature_names_out(X.columns)
print("\nEngineered Features:")
print(feature_names)


Model with original features:
R² Score: 0.9698
RMSE: 0.3425

Model with polynomial + interaction features:
R² Score: 0.9981
RMSE: 0.0855

Engineered Features:
['X1' 'X2' 'X1^2' 'X1 X2' 'X2^2']


In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Iris dataset; features come from `.data`, class labels from `.target`
iris = load_iris()
X = iris.data
y = iris.target

# Keep 20% of the samples aside as a test set (seeded for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------- Shared tuning / reporting helper -------------------
def _tune_and_report(label, estimator, param_grid,
                     X_fit, y_fit, X_eval, y_eval, params_prefix=""):
    """Grid-search an estimator, then print its held-out performance.

    Both classifiers below go through the exact same steps, so the steps live
    here once instead of being copy-pasted per model.

    Parameters
    ----------
    label : str
        Human-readable model name used in the printed headings.
    estimator : estimator object
        Unfitted classifier handed to ``GridSearchCV``.
    param_grid : dict
        Hyper-parameter grid to search.
    X_fit, y_fit
        Data the grid search is fitted on (5-fold CV, scored by accuracy).
    X_eval, y_eval
        Held-out data used for the accuracy figure and classification report.
    params_prefix : str, optional
        Text emitted immediately before the "Best ... Params:" heading
        (e.g. ``"\\n"`` to separate this model's output from the previous one).

    Returns
    -------
    tuple
        ``(grid, best_estimator, predictions)`` where ``grid`` is the fitted
        ``GridSearchCV``, ``best_estimator`` is ``grid.best_estimator_`` and
        ``predictions`` are that estimator's predictions for ``X_eval``.
    """
    grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
    grid.fit(X_fit, y_fit)

    print(f"{params_prefix}Best {label} Params:", grid.best_params_)
    best = grid.best_estimator_
    preds = best.predict(X_eval)

    print(f"\n{label} Performance:")
    print("Accuracy:", accuracy_score(y_eval, preds))
    print(classification_report(y_eval, preds))
    return grid, best, preds


# ------------------- Decision Tree -------------------
dt_params = {
    'max_depth': [2, 4, 6, None],
    'criterion': ['gini', 'entropy']
}
dt_model = DecisionTreeClassifier(random_state=42)
dt_grid, dt_best, dt_preds = _tune_and_report(
    "Decision Tree", dt_model, dt_params,
    X_train, y_train, X_test, y_test,
)

# ------------------- Random Forest -------------------
rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [2, 4, 6, None],
    'criterion': ['gini', 'entropy']
}
rf_model = RandomForestClassifier(random_state=42)
# Leading newline keeps a blank line between the two models' printed sections
rf_grid, rf_best, rf_preds = _tune_and_report(
    "Random Forest", rf_model, rf_params,
    X_train, y_train, X_test, y_test,
    params_prefix="\n",
)


Best Decision Tree Params: {'criterion': 'gini', 'max_depth': 4}

Decision Tree Performance:
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


Best Random Forest Params: {'criterion': 'gini', 'max_depth': 4, 'n_estimators': 50}

Random Forest Performance:
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

