In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Generate the synthetic dataset
# 1000 samples with 20 features (2 informative, 10 redundant), one cluster per
# class, no label flipping, and a class weight of 0.99 so the classes are
# heavily imbalanced. random_state pins the result.
dataset_params = dict(
    n_samples=1000,
    n_features=20,
    n_informative=2,
    n_redundant=10,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)
X, y = make_classification(**dataset_params)

# Convert the dataset into a DataFrame: one named column per feature plus
# the labels in a trailing 'Target' column.
feature_columns = [f'Feature_{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_columns)
df['Target'] = y

# Step 2: Create polynomial and interaction features
# Degree-2 expansion of every original feature column (no bias column).
feature_frame = df.drop(columns=['Target'])
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(feature_frame)
poly_feature_names = poly.get_feature_names_out(feature_frame.columns)
df_poly = pd.DataFrame(poly.transform(feature_frame), columns=poly_feature_names)
df_poly['Target'] = y

# Separate features and target
y_poly = df_poly['Target']
X_poly = df_poly.drop(columns=['Target'])

# Step 3: Scale the features to the range [0, 1]
# NOTE(review): the scaler and both selectors below are fitted on every row,
# including rows that end up in the test split later in this file; consider
# fitting them on the training rows only to avoid information leakage.
scaler = MinMaxScaler()
scaler.fit(X_poly)
X_poly_scaled = scaler.transform(X_poly)

# Step 4: Feature Selection
# SelectKBest with ANOVA F-value: keep the 20 highest-scoring columns.
select_k_best = SelectKBest(score_func=f_classif, k=20)
select_k_best.fit(X_poly_scaled, y_poly)
X_kbest = select_k_best.transform(X_poly_scaled)
kbest_mask = select_k_best.get_support()
selected_kbest_features = X_poly.columns[kbest_mask]
print(f"Selected features using SelectKBest: {selected_kbest_features}")

# Recursive Feature Elimination (RFE) with Logistic Regression: prune
# features until 20 remain.
model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=20)
rfe.fit(X_poly_scaled, y_poly)
X_rfe = rfe.transform(X_poly_scaled)
rfe_mask = rfe.get_support()
selected_rfe_features = X_poly.columns[rfe_mask]
print(f"Selected features using RFE: {selected_rfe_features}")

# Step 5: Model Building and Evaluation
def build_and_evaluate_model(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training split and print its test-set metrics.

    Args:
        X_train: Training feature matrix passed to ``model.fit``.
        X_test: Test feature matrix passed to ``model.predict``.
        y_train: Training labels passed to ``model.fit``.
        y_test: True test labels the predictions are compared against.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.

    Returns:
        None. The accuracy, classification report and confusion matrix are
        printed to stdout, in that order.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    print(f"Accuracy: {accuracy}")

    # Both remaining reports share the (y_true, y_pred) call signature.
    for report in (classification_report, confusion_matrix):
        print(report(y_test, predicted))

# Split the data into training and test sets.
# All three splits share the same options so each feature set is evaluated on
# the same rows. stratify keeps the class ratio equal in train and test; with
# the strong class imbalance of this dataset an unstratified split leaves the
# number of minority samples in the test set to chance.
split_options = dict(test_size=0.3, random_state=1, stratify=y_poly)

# The selected-feature splits use the scaled, already-transformed matrices
# (X_kbest / X_rfe) so they are preprocessed exactly like the baseline; they
# are wrapped in DataFrames to keep the selected column names.
X_kbest_frame = pd.DataFrame(X_kbest, columns=selected_kbest_features)
X_rfe_frame = pd.DataFrame(X_rfe, columns=selected_rfe_features)

X_train, X_test, y_train, y_test = train_test_split(
    X_poly_scaled, y_poly, **split_options
)
X_train_kbest, X_test_kbest, y_train_kbest, y_test_kbest = train_test_split(
    X_kbest_frame, y, **split_options
)
X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = train_test_split(
    X_rfe_frame, y, **split_options
)

# One identically configured decision tree per feature set, so differences in
# the printed metrics come from the features rather than the estimator.
baseline_model = DecisionTreeClassifier(random_state=1)
model_kbest = DecisionTreeClassifier(random_state=1)
model_rfe = DecisionTreeClassifier(random_state=1)

# (heading, (X_train, X_test, y_train, y_test), estimator) for each run:
# baseline with all features, then SelectKBest features, then RFE features.
evaluation_runs = [
    (
        "Baseline Model Performance with All Features:",
        (X_train, X_test, y_train, y_test),
        baseline_model,
    ),
    (
        "\nModel Performance with SelectKBest Features:",
        (X_train_kbest, X_test_kbest, y_train_kbest, y_test_kbest),
        model_kbest,
    ),
    (
        "\nModel Performance with RFE Features:",
        (X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe),
        model_rfe,
    ),
]

for heading, split, classifier in evaluation_runs:
    print(heading)
    build_and_evaluate_model(*split, classifier)

# Visualize the correlation matrix of the features
# Pairwise correlations are computed over the scaled polynomial features.
scaled_feature_frame = pd.DataFrame(X_poly_scaled, columns=poly_feature_names)
corr_matrix = scaled_feature_frame.corr()

heatmap_options = dict(annot=False, cmap='coolwarm', linewidths=0.5)
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()

