In [None]:


# Function to create new features and evaluate
def evaluate_combinations(features, n):
    """Score every n-way product interaction of ``features`` against ``Price``.

    For each combination, the selected columns of the module-level
    ``improve_data`` frame are multiplied row-wise into a single interaction
    column. The module-level ``best_model`` is then refit with that column
    appended to the predictors, and the column's mean absolute SHAP value on
    the held-out split is recorded together with its correlation with
    ``Price``.

    Args:
        features: Column labels of ``improve_data`` to combine. Labels must be
            strings because they are joined with ``"_"`` to name the new
            column.
        n: Number of features in each combination.

    Returns:
        pandas.DataFrame with the columns ``'Combination'``,
        ``'Correlation with Price'`` and ``'SHAP Importance'``, sorted by
        descending SHAP importance. When no combination can be formed
        (fewer than ``n`` features) the frame is empty but still carries
        those columns.

    Side effects:
        ``best_model`` is refit on every iteration, so after the call it is
        fitted on the last combination's training split. ``improve_data``
        itself is never modified.
    """
    result_columns = ['Combination', 'Correlation with Price', 'SHAP Importance']
    results = []

    # Iterate lazily: the full list of combinations is never needed at once
    for comb in combinations(features, n):
        # Combine selected features (row-wise product)
        combined_feature_name = "_".join(comb) + "_interaction"
        combined_feature = improve_data[list(comb)].prod(axis=1)

        # Correlation with Price
        correlation = combined_feature.corr(improve_data['Price'])

        # Build the predictors on a derived frame instead of adding and later
        # dropping a column on the shared data: an exception mid-iteration can
        # no longer leave a stray column behind, and an existing column with
        # the same name is not deleted.
        X_new = improve_data.drop(columns=['Price']).assign(
            **{combined_feature_name: combined_feature}
        )
        y_new = improve_data['Price']
        X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
            X_new, y_new, test_size=0.2, random_state=42
        )

        # Retrain the model with the new feature
        best_model.fit(X_train_new, y_train_new)

        # SHAP analysis: mean |SHAP| of the new column over the test rows
        explainer = shap.Explainer(best_model, X_train_new)
        shap_values = explainer(X_test_new)
        new_feature_index = list(X_train_new.columns).index(combined_feature_name)
        shap_importance = np.abs(shap_values[:, new_feature_index].values).mean()

        # Store results
        results.append({
            'Combination': comb,
            'Correlation with Price': correlation,
            'SHAP Importance': shap_importance
        })

    # Passing the columns explicitly keeps the sort valid for an empty result
    results_df = pd.DataFrame(results, columns=result_columns)
    return results_df.sort_values(by='SHAP Importance', ascending=False)

# Configure the pandas display options so wide result tables print in full
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)

# Candidate inputs: every column of the data except the target
features = improve_data.drop(columns=['Price']).columns

# --- 2-feature combinations ---
# evaluate_combinations already returns rows ordered by SHAP importance
results_2_comb = evaluate_combinations(features, 2)
print("Top 2-Feature Combinations by SHAP Importance:", results_2_comb.head(10), sep="\n")

# Same results re-ranked by (signed) correlation with Price, top 10 shown
results_2_comb_sorted_by_corr = results_2_comb.sort_values(
    by='Correlation with Price',
    ascending=False,
)
print(
    "\nTop 2-Feature Combinations by Correlation with Price:",
    results_2_comb_sorted_by_corr.head(10),
    sep="\n",
)


# --- 3-feature combinations ---
results_3_comb = evaluate_combinations(features, 3)
print("\nTop 3-Feature Combinations by SHAP Importance:", results_3_comb.head(10), sep="\n")

# Same results re-ranked by (signed) correlation with Price, top 10 shown
results_3_comb_sorted_by_corr = results_3_comb.sort_values(
    by='Correlation with Price',
    ascending=False,
)
print(
    "\nTop 3-Feature Combinations by Correlation with Price:",
    results_3_comb_sorted_by_corr.head(10),
    sep="\n",
)

In [None]:
from itertools import combinations
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import xgboost as xgb
import pandas as pd
import numpy as np

# Function to evaluate combinations of newly engineered features
def evaluate_engineered_combinations(
    data,
    base_features,
    target_column,
    columns_to_drop=('Company', 'Inches', 'Weight', 'TypeName'),
):
    """Search single engineered features for the one giving the lowest test MAE.

    Candidate features are built from ``base_features``: the product and the
    quotient of every pair, and the product of every triple. Each candidate
    is added, on its own, to a copy of ``data``; ``columns_to_drop`` are
    removed; an XGBoost regressor is tuned with a 3-fold grid search on an
    80/20 train/test split; and the tuned model is scored on the test part.

    Args:
        data: DataFrame holding the base features and the target. It is
            copied for every candidate and never modified.
        base_features: Column labels to engineer from. The target label is
            ignored if present, so the target never feeds a predictor.
        target_column: Label of the column to predict.
        columns_to_drop: Columns removed after the engineered feature has
            been created. Defaults to the columns the original analysis
            dropped; every listed column must exist in ``data``.

    Returns:
        ``(best_combination, best_metrics)``. ``best_combination`` is the
        source feature labels followed by the operation name (``'multiply'``
        or ``'divide'``), e.g. ``('a', 'b', 'divide')``. ``best_metrics`` is a
        dict with the keys ``'MAE'``, ``'R2'`` and ``'RMSE'``. When no
        candidate can be formed the result is ``(None, {})``.

    NOTE(review): the winner is chosen by its score on the test split, so the
    reported metrics are optimistic for that winner; confirm on separate data.
    NOTE(review): the arithmetic assumes the base features are numeric —
    confirm against the caller's data.
    """
    # Never engineer a predictor from the target itself
    candidates = [name for name in base_features if name != target_column]

    # Each spec is (source feature labels, operation); pairs first, then triples
    specs = []
    for pair in combinations(candidates, 2):
        specs.append((pair, 'multiply'))
        specs.append((pair, 'divide'))
    specs.extend((triple, 'multiply') for triple in combinations(candidates, 3))

    # Loop-invariant search space
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    best_combination = None
    best_mae = float('inf')
    best_metrics = {}

    for sources, operation in specs:
        # Work on a copy so the caller's frame is left untouched
        temp_data = data.copy()

        if operation == 'divide':
            numerator, denominator = sources
            feature_name = f"{numerator}_{denominator}_div"
            # Small offset guards against division by exactly zero
            temp_data[feature_name] = temp_data[numerator] / (temp_data[denominator] + 1e-6)
        else:
            feature_name = "_".join(str(name) for name in sources) + "_mult"
            product = temp_data[sources[0]]
            for name in sources[1:]:
                product = product * temp_data[name]
            temp_data[feature_name] = product

        # Drop specific features to maintain consistency
        temp_data = temp_data.drop(columns=list(columns_to_drop))

        # Train-test split
        X = temp_data.drop(columns=[target_column])
        y = temp_data[target_column]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        X_train, X_test = X_train.copy(), X_test.copy()

        # Standardise numeric predictors. The scaler is fitted on the training
        # rows only so that no test-set statistics leak into training. The
        # former MinMax pass is gone: standardising afterwards cancelled it.
        numeric_features = X.select_dtypes(include=[np.number]).columns
        if len(numeric_features) > 0:
            scaler = StandardScaler()
            X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
            X_test[numeric_features] = scaler.transform(X_test[numeric_features])

        # Train the model
        xgb_model = xgb.XGBRegressor(random_state=42, enable_categorical=True)
        grid_search = GridSearchCV(
            estimator=xgb_model,
            param_grid=param_grid,
            scoring='neg_mean_absolute_error',
            cv=3,
            verbose=0,
            n_jobs=-1,
        )
        grid_search.fit(X_train, y_train)

        # Evaluate the tuned model on the held-out rows
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        # Update the best combination if this is better
        if mae < best_mae:
            best_mae = mae
            best_combination = (*sources, operation)
            best_metrics = {
                'MAE': mae,
                'R2': r2,
                'RMSE': rmse
            }

    return best_combination, best_metrics

# Every column except the target is a candidate for feature engineering
base_features = improve_data.columns.tolist()
base_features.remove('Price')

# Run the search over all engineered pair/triple features
best_combination, best_metrics = evaluate_engineered_combinations(
    data=improve_data,
    base_features=base_features,
    target_column='Price',
)

# Report the winning engineered feature and its held-out metrics
print(f"Best Combination of Engineered Features: {best_combination}")
print(f"Performance Metrics: {best_metrics}")
