<a href="https://colab.research.google.com/github/Eezzeldin/candy/blob/main/candy_interdependence_cv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score

# Load your dataset
df = pd.read_csv('/content/candy-data.csv')  # Replace with your dataset path

# Define binary and continuous feature columns
binary_columns = df.select_dtypes(include='int64').columns.drop('bar')
continuous_columns = ['sugarpercent', 'pricepercent', 'winpercent']
binary_and_continuous_columns = binary_columns.append(pd.Index(continuous_columns))

# Hyperparameter grid for RandomForest
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Function to compute tuned feature importances and balanced accuracy
def compute_tuned_accuracy_with_continuous(target_name):
    predictors = binary_and_continuous_columns.drop(target_name)
    X = df[predictors]
    y = df[target_name]

    # Set up RandomForest and cross-validation
    rf = RandomForestClassifier(random_state=42)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # RandomizedSearchCV for hyperparameter tuning
    random_search = RandomizedSearchCV(rf, param_grid, n_iter=30, cv=cv, scoring='balanced_accuracy', n_jobs=-1, random_state=42)
    random_search.fit(X, y)

    # Best tuned model
    best_rf = random_search.best_estimator_

    # Cross-validated predictions using the tuned model
    y_pred_cv = cross_val_predict(best_rf, X, y, cv=cv)

    # Compute balanced accuracy
    balanced_acc = balanced_accuracy_score(y, y_pred_cv)

    return dict(zip(predictors, best_rf.feature_importances_)), balanced_acc

# Compute tuned balanced accuracy with continuous variables
tuned_results_with_continuous = {target: compute_tuned_accuracy_with_continuous(target) for target in binary_and_continuous_columns if target in binary_columns}

# Separate the feature importances and balanced accuracies into two DataFrames
tuned_importance_df_with_continuous = pd.DataFrame({target: tuned_results_with_continuous[target][0] for target in binary_columns}).T
tuned_balanced_accuracies_with_continuous = {target: tuned_results_with_continuous[target][1] for target in binary_columns}
tuned_balanced_accuracy_df_with_continuous = pd.DataFrame.from_dict(tuned_balanced_accuracies_with_continuous, orient='index', columns=['Adjusted Balanced Accuracy'])

tuned_importance_df_sorted_with_continuous = tuned_importance_df_with_continuous.apply(lambda row: row.sort_values(ascending=False), axis=1)

# Output the final DataFrames
print("Tuned Feature Importances:\n", tuned_importance_df_sorted_with_continuous)
print("\nTuned Balanced Accuracy:\n", tuned_balanced_accuracy_df_with_continuous)


Tuned Feature Importances:
                    caramel  chocolate  crispedricewafer    fruity      hard  \
chocolate         0.010642        NaN          0.011811  0.366652  0.019740   
fruity            0.046265   0.547767          0.000000       NaN  0.051303   
caramel                NaN   0.010334          0.039703  0.033515  0.024455   
peanutyalmondy    0.010226   0.006266          0.020801  0.008178  0.000000   
nougat            0.204385   0.026460          0.008357  0.015392  0.001401   
crispedricewafer  0.036639   0.003008               NaN  0.000000  0.000000   
hard              0.010838   0.075100          0.000951  0.168695       NaN   
pluribus          0.081530   0.091013          0.024926  0.059408  0.026977   

                    nougat  peanutyalmondy  pluribus  pricepercent  \
chocolate         0.011966        0.026648  0.033424      0.118534   
fruity            0.011144        0.039166  0.005296      0.047640   
caramel           0.105702        0.021846  0.0384

In [22]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict, StratifiedKFold , LeaveOneOut
from sklearn.metrics import balanced_accuracy_score , make_scorer
from sklearn.linear_model import LogisticRegression


def load_dataset(file_path):
    """Load dataset from a CSV file."""
    return pd.read_csv(file_path)


def get_features(df, exclude_feature='bar', continuous_features=None):
    """Prepare feature columns, excluding the specified feature."""
    binary_columns = df.select_dtypes(include='int64').columns.drop(exclude_feature)
    return binary_columns.append(pd.Index(continuous_features))


def get_hyperparameter_grid():
    """Define the hyperparameter grid for RandomForestClassifier."""

    params = {
        'n_estimators': [50, 100, 200, 500],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    }
    params_overfitting = {
        'n_estimators': [50, 100, 200, 500],  # Lower upper limit of estimators
        'max_depth': [10, 15, 20],  # Limit maximum depth of trees
        'min_samples_split': [5, 10, 15],  # Require more samples to split a node
        'min_samples_leaf': [4, 6, 8],  # Require more samples per leaf
        'max_features': ['sqrt', 'log2']  # Limit number of features considered at each split
    }

    param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],  # Only solvers that support l1 and elasticnet
    'max_iter': [100, 200, 500]
    }

    return param_grid



def compute_tuned_feature_importance_and_accuracy(df, predictors, target_name, param_grid):
    """Compute tuned feature importances and balanced accuracy for a specific target."""
    X = df[predictors]
    y = df[target_name]

    rf = RandomForestClassifier(random_state=42)
    log_reg = LogisticRegression(random_state=42)
    cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)



    # Custom scorer for balanced accuracy with adjustment
    adjusted_balanced_accuracy_scorer = make_scorer(balanced_accuracy_score, adjusted=True)
    random_search = RandomizedSearchCV(log_reg, param_grid, n_iter=100, cv=cv,
                                       scoring= adjusted_balanced_accuracy_scorer, n_jobs=-1,
                                       random_state=42)
    random_search.fit(X, y)

    best_rf = random_search.best_estimator_

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    #cv = LeaveOneOut ()
    y_pred_cv = cross_val_predict(best_rf, X, y, cv=cv)

    return dict(zip(predictors, best_rf.feature_importances_)), \
           balanced_accuracy_score(y, y_pred_cv,adjusted=True)


def run_analysis(file_path, exclude_feature='bar', continuous_features=None):
    """Run analysis to compute tuned feature importances and balanced accuracy."""
    df = load_dataset(file_path)
    all_features = get_features(df, exclude_feature=exclude_feature,
                                continuous_features=continuous_features)
    param_grid = get_hyperparameter_grid()

    results = {target: compute_tuned_feature_importance_and_accuracy(
        df, all_features.drop(target), target, param_grid)
        for target in all_features if target in df.select_dtypes(include='int64').columns}

    tuned_importance_df = pd.DataFrame({target: results[target][0] for target in results}).T
    tuned_balanced_accuracy_df = pd.DataFrame.from_dict(
        {target: results[target][1] for target in results}, orient='index',
        columns=['Adjusted Balanced Accuracy'])

    tuned_importance_df_sorted = tuned_importance_df.apply(lambda row: row.sort_values(ascending=False), axis=1)

    return tuned_importance_df_sorted, tuned_balanced_accuracy_df

'''
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="RandomForest Hyperparameter Tuning")
    parser.add_argument('file_path', type=str, help='Path to the dataset CSV file')
    parser.add_argument('--exclude_feature', type=str, default='bar', help='Feature to exclude')
    parser.add_argument('--continuous_features', nargs='+', default=['sugarpercent', 'pricepercent', 'winpercent'], help='List of continuous features')

    args = parser.parse_args()
    importance_df, accuracy_df = run_analysis(args.file_path, args.exclude_feature, args.continuous_features)

    print("Tuned Feature Importances:\n", importance_df)
    print("\nTuned Balanced Accuracy:\n", accuracy_df)
'''


'\nif __name__ == "__main__":\n    import argparse\n\n    parser = argparse.ArgumentParser(description="RandomForest Hyperparameter Tuning")\n    parser.add_argument(\'file_path\', type=str, help=\'Path to the dataset CSV file\')\n    parser.add_argument(\'--exclude_feature\', type=str, default=\'bar\', help=\'Feature to exclude\')\n    parser.add_argument(\'--continuous_features\', nargs=\'+\', default=[\'sugarpercent\', \'pricepercent\', \'winpercent\'], help=\'List of continuous features\')\n\n    args = parser.parse_args()\n    importance_df, accuracy_df = run_analysis(args.file_path, args.exclude_feature, args.continuous_features)\n\n    print("Tuned Feature Importances:\n", importance_df)\n    print("\nTuned Balanced Accuracy:\n", accuracy_df)\n'

In [42]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, make_scorer
import numpy as np


def load_dataset(file_path):
    """Load dataset from a CSV file."""
    return pd.read_csv(file_path)


def get_features(df, exclude_feature='bar', continuous_features=None):
    """Prepare feature columns, excluding the specified feature."""
    binary_columns = df.select_dtypes(include='int64').columns.drop(exclude_feature)
    return binary_columns.append(pd.Index(continuous_features))


def get_hyperparameter_grid():
    """Define the hyperparameter grid for LogisticRegression."""
    return {
        'penalty': ['l2', 'none','l1'],
        'C': [0.01, 0.1, 1, 10, 100,1000],
        'solver': ['lbfgs', 'liblinear'],
        'max_iter': [100, 200, 500,1000]
    }


def compute_tuned_feature_importance_and_accuracy(df, predictors, target_name, param_grid):
    """Compute tuned feature importances and balanced accuracy for a specific target."""
    X = df[predictors]
    y = df[target_name]

    log_reg = LogisticRegression(random_state=42)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    adjusted_balanced_accuracy_scorer = make_scorer(balanced_accuracy_score, adjusted=True)

    random_search = RandomizedSearchCV(
        log_reg, param_grid, n_iter=100, cv=cv,
        scoring=adjusted_balanced_accuracy_scorer, n_jobs=-1, random_state=42)
    random_search.fit(X, y)

    best_log_reg = random_search.best_estimator_
    y_pred_cv = cross_val_predict(best_log_reg, X, y, cv=cv)

    feature_importance = np.abs(best_log_reg.coef_).flatten()
    feature_importance_dict = dict(zip(predictors, feature_importance))
    balanced_accuracy = balanced_accuracy_score(y, y_pred_cv , adjusted=True)

    print ("best_log_reg" , best_log_reg)
    return feature_importance_dict, balanced_accuracy


def run_analysis(file_path, exclude_feature='bar', continuous_features=None):
    """Run analysis to compute tuned feature importances and balanced accuracy."""
    df = load_dataset(file_path)
    df ["winpercent"] = df ["winpercent"] / 100
    c1  = df ["chocolate"] == 0
    c2  = df ["fruity"]   == 0
    c   = c1 & c2
    df ["other"] = np.where (c , 1, 0)
    all_features = get_features(df, exclude_feature=exclude_feature,
                                continuous_features=continuous_features)
    param_grid = get_hyperparameter_grid()

    results = {target: compute_tuned_feature_importance_and_accuracy(
        df, all_features.drop(target), target, param_grid)
        for target in all_features if target in df.select_dtypes(include='int64').columns}

    tuned_importance_df = pd.DataFrame({target: results[target][0] for target in results}).T
    tuned_balanced_accuracy_df = pd.DataFrame.from_dict(
        {target: results[target][1] for target in results}, orient='index',
        columns=['Adjusted Balanced Accuracy'])

    tuned_importance_df_sorted = tuned_importance_df.apply(lambda row: row.sort_values(ascending=False), axis=1)

    return tuned_importance_df_sorted, tuned_balanced_accuracy_df


'''
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Logistic Regression Hyperparameter Tuning")
    parser.add_argument('file_path', type=str, help='Path to the dataset CSV file')
    parser.add_argument('--exclude_feature', type=str, default='bar', help='Feature to exclude')
    parser.add_argument('--continuous_features', nargs='+', default=['sugarpercent', 'pricepercent', 'winpercent'], help='List of continuous features')

    args = parser.parse_args()
    importance_df, accuracy_df = run_analysis(args.file_path, args.exclude_feature, args.continuous_features)

    print("Tuned Feature Importances:\n", importance_df)
    print("\nTuned Balanced Accuracy:\n", accuracy_df)
'''


'\nif __name__ == "__main__":\n    import argparse\n\n    parser = argparse.ArgumentParser(description="Logistic Regression Hyperparameter Tuning")\n    parser.add_argument(\'file_path\', type=str, help=\'Path to the dataset CSV file\')\n    parser.add_argument(\'--exclude_feature\', type=str, default=\'bar\', help=\'Feature to exclude\')\n    parser.add_argument(\'--continuous_features\', nargs=\'+\', default=[\'sugarpercent\', \'pricepercent\', \'winpercent\'], help=\'List of continuous features\')\n\n    args = parser.parse_args()\n    importance_df, accuracy_df = run_analysis(args.file_path, args.exclude_feature, args.continuous_features)\n\n    print("Tuned Feature Importances:\n", importance_df)\n    print("\nTuned Balanced Accuracy:\n", accuracy_df)\n'

In [53]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
import numpy as np


def load_dataset(file_path):
    """Load dataset from a CSV file."""
    return pd.read_csv(file_path)


def get_features(df, exclude_feature='bar', continuous_features=None):
    """Prepare feature columns, excluding the specified feature."""
    binary_columns = df.select_dtypes(include='int64').columns.drop(exclude_feature)
    return binary_columns.append(pd.Index(continuous_features))


def get_hyperparameter_grid():
    """Define the hyperparameter grid for LogisticRegression."""
    return {
        'poly__degree': [1, 2, 3],  # Polynomial degree
        'log_reg__penalty': ['l2', 'none','l1'],
        'log_reg__C': [0.01, 0.1, 1, 10, 100,1000],
        'log_reg__solver': ['lbfgs', 'liblinear'],
        'log_reg__max_iter': [100, 200, 500,1000]
    }


def compute_tuned_feature_importance_and_accuracy(df, predictors, target_name, param_grid):
    """Compute tuned feature importances and balanced accuracy for a specific target."""
    X = df[predictors]
    y = df[target_name]

    pipeline = Pipeline([
        ('poly', PolynomialFeatures()),  # Polynomial features transformation
        ('log_reg', LogisticRegression(random_state=42))
    ])

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    adjusted_balanced_accuracy_scorer = make_scorer(balanced_accuracy_score, adjusted=True)

    random_search = RandomizedSearchCV(
        pipeline, param_grid, n_iter=20, cv=cv,
        scoring=adjusted_balanced_accuracy_scorer, n_jobs=-1, random_state=42)
    random_search.fit(X, y)

    best_pipeline = random_search.best_estimator_
    y_pred_cv = cross_val_predict(best_pipeline, X, y, cv=cv)

    log_reg = best_pipeline.named_steps['log_reg']
    poly = best_pipeline.named_steps['poly']
    feature_importance = np.abs(log_reg.coef_).flatten()
    feature_names = poly.get_feature_names_out(predictors)
    feature_importance_dict = dict(zip(feature_names, feature_importance))
    balanced_accuracy = balanced_accuracy_score(y, y_pred_cv,adjusted=True)

    return feature_importance_dict, balanced_accuracy


def run_analysis(file_path, exclude_feature='bar', continuous_features=None):
    """Run analysis to compute tuned feature importances and balanced accuracy."""
    df = load_dataset(file_path)
    df ["winpercent"] = df ["winpercent"] / 100
    c1  = df ["chocolate"] == 0
    c2  = df ["fruity"]   == 0
    c   = c1 & c2
    df ["other"] = np.where (c , 1, 0)
    all_features = get_features(df, exclude_feature=exclude_feature,
                                continuous_features=continuous_features)
    param_grid = get_hyperparameter_grid()

    results = {target: compute_tuned_feature_importance_and_accuracy(
        df, all_features.drop(target), target, param_grid)
        for target in all_features if target in df.select_dtypes(include='int64').columns}

    tuned_importance_df = pd.DataFrame({target: results[target][0] for target in results}).T
    tuned_balanced_accuracy_df = pd.DataFrame.from_dict(
        {target: results[target][1] for target in results}, orient='index',
        columns=['Adjusted Balanced Accuracy'])

    tuned_importance_df_sorted = tuned_importance_df.apply(lambda row: row.sort_values(ascending=False), axis=1)

    return tuned_importance_df_sorted, tuned_balanced_accuracy_df

'''
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Logistic Regression Hyperparameter Tuning with Polynomial Features")
    parser.add_argument('file_path', type=str, help='Path to the dataset CSV file')
    parser.add_argument('--exclude_feature', type=str, default='bar', help='Feature to exclude')
    parser.add_argument('--continuous_features', nargs='+', default=['sugarpercent', 'pricepercent', 'winpercent'], help='List of continuous features')

    args = parser.parse_args()
    importance_df, accuracy_df = run_analysis(args.file_path, args.exclude_feature, args.continuous_features)

    print("Tuned Feature Importances:\n", importance_df)
    print("\nTuned Balanced Accuracy:\n", accuracy_df)
'''


'\nif __name__ == "__main__":\n    import argparse\n\n    parser = argparse.ArgumentParser(description="Logistic Regression Hyperparameter Tuning with Polynomial Features")\n    parser.add_argument(\'file_path\', type=str, help=\'Path to the dataset CSV file\')\n    parser.add_argument(\'--exclude_feature\', type=str, default=\'bar\', help=\'Feature to exclude\')\n    parser.add_argument(\'--continuous_features\', nargs=\'+\', default=[\'sugarpercent\', \'pricepercent\', \'winpercent\'], help=\'List of continuous features\')\n\n    args = parser.parse_args()\n    importance_df, accuracy_df = run_analysis(args.file_path, args.exclude_feature, args.continuous_features)\n\n    print("Tuned Feature Importances:\n", importance_df)\n    print("\nTuned Balanced Accuracy:\n", accuracy_df)\n'

In [6]:
from cat_interdependent import run_analysis


dataset_path = 'candy-data.csv'  # Adjust the path to your dataset

continuous_features = ['sugarpercent', 'pricepercent', 'winpercent']
continuous_features = []

importance_df, accuracy_df = run_analysis(
    dataset_path, exclude_feature='bar', continuous_features=continuous_features)

print("Tuned Feature Importances:\n", importance_df)
print("\nTuned Balanced Accuracy:\n", accuracy_df)


Tuned Feature Importances:
                    caramel  chocolate  crispedricewafer    fruity      hard  \
chocolate         0.035575        NaN          0.072477  0.611252  0.081340   
fruity            0.087206   0.697198          0.000000       NaN  0.105499   
caramel                NaN   0.130167          0.126342  0.172397  0.072941   
peanutyalmondy    0.146773   0.247790          0.107190  0.181951  0.038995   
nougat            0.220773   0.222022          0.166490  0.037028  0.018673   
crispedricewafer  0.193589   0.230000               NaN  0.026449  0.016760   
hard              0.111925   0.273881          0.000000  0.435349       NaN   
pluribus          0.145502   0.295507          0.077730  0.146063  0.066086   

                    nougat  peanutyalmondy  pluribus  
chocolate         0.040939        0.091770  0.066647  
fruity            0.012491        0.073652  0.023953  
caramel           0.213021        0.129181  0.155951  
peanutyalmondy    0.178522             N

In [8]:
get_features(df, exclude_feature='bar', continuous_features=['sugarpercent', 'pricepercent', 'winpercent'])

Index(['chocolate', 'fruity', 'caramel', 'peanutyalmondy', 'nougat',
       'crispedricewafer', 'hard', 'pluribus', 'sugarpercent', 'pricepercent',
       'winpercent'],
      dtype='object')

In [9]:
from cat_interdependent import run_analysis


dataset_path = 'candy-data.csv'  # Adjust the path to your dataset

continuous_features = ['sugarpercent', 'pricepercent', 'winpercent']
#continuous_features = []

importance_df, accuracy_df = run_analysis(
    dataset_path, exclude_feature='bar', continuous_features=continuous_features)

print("Tuned Feature Importances:\n", importance_df)
print("\nTuned Balanced Accuracy:\n", accuracy_df)


Tuned Feature Importances:
                    caramel  chocolate  crispedricewafer    fruity      hard  \
chocolate         0.010642        NaN          0.011811  0.366652  0.019740   
fruity            0.046265   0.547767          0.000000       NaN  0.051303   
caramel                NaN   0.010334          0.039703  0.033515  0.024455   
peanutyalmondy    0.010226   0.006266          0.020801  0.008178  0.000000   
nougat            0.204385   0.026460          0.008357  0.015392  0.001401   
crispedricewafer  0.036639   0.003008               NaN  0.000000  0.000000   
hard              0.010838   0.075100          0.000951  0.168695       NaN   
pluribus          0.081530   0.091013          0.024926  0.059408  0.026977   

                    nougat  peanutyalmondy  pluribus  pricepercent  \
chocolate         0.011966        0.026648  0.033424      0.118534   
fruity            0.011144        0.039166  0.005296      0.047640   
caramel           0.105702        0.021846  0.0384

In [54]:
dataset_path = '/content/candy-data.csv'  # Adjust the path to your dataset

continuous_features = ['sugarpercent', 'pricepercent', 'winpercent']
continuous_features = []

importance_df, accuracy_df  = run_analysis(
    dataset_path, exclude_feature=[], continuous_features=continuous_features)

print("Tuned Feature Importances:\n", importance_df)
print("\nTuned Balanced Accuracy:\n", accuracy_df)


45 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 71, in _check_

Tuned Feature Importances:
                          1       bar  bar other  bar other^2  bar pluribus  \
chocolate         0.005188  0.003054        NaN          NaN           NaN   
fruity            1.519387  0.456415   0.013749     0.013749           0.0   
caramel           0.496446  0.055549   0.352581     0.352581           0.0   
peanutyalmondy    0.661497  0.115362   0.653396     0.653396           0.0   
nougat            1.360241  0.789387   0.596136     0.596136           0.0   
crispedricewafer  1.338748  0.390238   0.002751     0.002751           0.0   
hard              0.542782  0.434670   0.035955          NaN           0.0   
bar               0.000000       NaN        NaN          NaN           NaN   
pluribus          0.339768  3.625760        NaN          NaN           NaN   
other             1.058268  0.214208        NaN          NaN           0.0   

                  bar pluribus other  bar pluribus^2     bar^2  bar^2 other  \
chocolate                        N

45 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 71, in _check_

In [55]:
dataset_path = '/content/candy-data.csv'  # Adjust the path to your dataset

continuous_features = ['sugarpercent', 'pricepercent', 'winpercent']
#continuous_features = []

importance_df, accuracy_df = run_analysis(
    dataset_path, exclude_feature=[], continuous_features=continuous_features)

print("Tuned Feature Importances:\n", importance_df)
print("\nTuned Balanced Accuracy:\n", accuracy_df)


45 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 71, in _check_

Tuned Feature Importances:
                          1       bar  bar other  bar other pricepercent  \
chocolate         0.511609  0.661454        NaN                     NaN   
fruity            1.298833  0.272864   0.005178                0.003972   
caramel           2.223207  8.407119   5.008732                     NaN   
peanutyalmondy    4.727834  4.803084   4.884373                     NaN   
nougat            1.223848  0.581072   0.476208                0.365252   
crispedricewafer  5.744561  0.324956   0.000387                     NaN   
hard              0.029046  2.111739        NaN                     NaN   
bar               0.000000       NaN        NaN                     NaN   
pluribus          0.000017  2.283980        NaN                     NaN   
other             1.067434  0.113134        NaN                     NaN   

                  bar other sugarpercent  bar other winpercent  bar other^2  \
chocolate                            NaN                   NaN     

45 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 71, in _check_

In [12]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict
from sklearn.metrics import balanced_accuracy_score

class IterativeFeatureImportance(BaseEstimator, ClassifierMixin):
    def __init__(self, param_grid=None, n_iter=30):
        self.param_grid = param_grid or {
            'n_estimators': [50, 100, 200, 500],
            'max_depth': [None, 10, 20, 30, 40, 50],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None]
        }
        self.n_iter = n_iter
        self.results_ = {}

    def fit(self, X, y=None):
        for target in X.columns:
            X_subset = X.drop(columns=[target])
            y_subset = X[target]

            rf = RandomForestClassifier(random_state=42)
            random_search = RandomizedSearchCV(rf, self.param_grid, n_iter=self.n_iter, cv=5, scoring='balanced_accuracy', n_jobs=-1, random_state=42)
            random_search.fit(X_subset, y_subset)
            best_rf = random_search.best_estimator_

            y_pred_cv = cross_val_predict(best_rf, X_subset, y_subset, cv=5)
            balanced_acc = balanced_accuracy_score(y_subset, y_pred_cv)

            self.results_[target] = {
                'model': best_rf,
                'feature_importances': best_rf.feature_importances_,
                'balanced_accuracy': balanced_acc
            }

        return self

    def get_results(self):
        return self.results_


In [None]:
import pandas as pd

# Load dataset and specify continuous features
df = pd.read_csv('/content/candy-data.csv')  # Adjust path to dataset
continuous_features = ['sugarpercent', 'pricepercent', 'winpercent']

# Drop the feature that you don't want to include, e.g., 'bar'
features_to_include = df.select_dtypes(include='int64').columns.drop('bar').tolist()
all_features = features_to_include + continuous_features

# Filter only the columns to include in the analysis
X = df[all_features]

# Initialize the iterative feature importance estimator
iterative_importance = IterativeFeatureImportance()

# Fit the estimator to the dataset
iterative_importance.fit(X)

# Get the results
results = iterative_importance.get_results()

# Display results
for target, result in results.items():
    print(f"Target: {target}")
    print(f"Balanced Accuracy: {result['balanced_accuracy']}")
    print(f"Feature Importances: {result['feature_importances']}\n")
