In [1]:
# Import  libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, f_classif

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- RFE Example ---
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Use RFE to recursively eliminate features
rfe = RFE(estimator=rf_classifier, n_features_to_select=2)
X_train_rfe = rfe.fit_transform(X_train, y_train)

# Print the selected features after RFE
selected_features_rfe = np.array(iris.feature_names)[rfe.support_]
print("Selected Features after RFE:", selected_features_rfe)

# --- SelectKBest Example ---
# Use SelectKBest to select the top 2 features based on F-statistic
kbest = SelectKBest(score_func=f_classif, k=2)
X_train_kbest = kbest.fit_transform(X_train, y_train)

# Print the selected features after SelectKBest
selected_features_kbest = np.array(iris.feature_names)[kbest.get_support()]
print("Selected Features after SelectKBest:", selected_features_kbest)


Selected Features after RFE: ['petal length (cm)' 'petal width (cm)']
Selected Features after SelectKBest: ['petal length (cm)' 'petal width (cm)']


In [2]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd




In [8]:
def rfeFeature(indep_X, dep_Y, n):
    rfelist = []

    log_model = LogisticRegression(solver='lbfgs')
    RF = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    DT = DecisionTreeClassifier(criterion='gini', max_features='sqrt', splitter='best', random_state=0)
    svc_model = SVC(kernel='linear', random_state=0)

    rfemodellist = [log_model, svc_model, RF, DT]
    for i in rfemodellist:
        print(i)
        log_rfe = RFE(i, n_features_to_select=n)
        log_fit = log_rfe.fit(indep_X, dep_Y)
        log_rfe_feature = log_fit.transform(indep_X)
        rfelist.append((log_rfe_feature, log_fit, i))

    return rfelist

def split_scalar(indep_X,dep_Y):
        X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)
        #X_train, X_test, y_train, y_test = train_test_split(indep_X,dep_Y, test_size = 0.25, random_state = 0)
        
        #Feature Scaling
        #from sklearn.preprocessing import StandardScaler
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        
        return X_train, X_test, y_train, y_test
    
def cm_prediction(classifier,X_test):
     y_pred = classifier.predict(X_test)
        
        # Making the Confusion Matrix
     from sklearn.metrics import confusion_matrix
     cm = confusion_matrix(y_test, y_pred)
        
     from sklearn.metrics import accuracy_score 
     from sklearn.metrics import classification_report 
        #from sklearn.metrics import confusion_matrix
        #cm = confusion_matrix(y_test, y_pred)
        
     Accuracy=accuracy_score(y_test, y_pred )
        
     report=classification_report(y_test, y_pred)
     return  classifier,Accuracy,report,X_test,y_test,cm


def logistic(X_train, y_train, X_test):
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)

    # Extracting feature importance for Logistic Regression
    feature_importance = classifier.coef_[0]

    classifier, Accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, y_test, cm, feature_importance


def random(X_train, y_train, X_test):
    classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)

    # Extracting feature importance for Random Forest
    feature_importance = classifier.feature_importances_

    classifier, Accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, y_test, cm, feature_importance


def Decision(X_train, y_train, X_test):
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)

    # Extracting feature importance for Decision Tree
    feature_importance = classifier.feature_importances_

    classifier, Accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, y_test, cm, feature_importance


def rfe_classification(acclog, accrf, accdes, feature_importance_logistic, feature_importance_rf, feature_importance_decision):
    rfedataframe = pd.DataFrame(index=['Logistic', 'Random', 'DecisionTree'],
                                columns=['Accuracy', 'Feature_Importance'])

    for number, idex in enumerate(rfedataframe.index):
        rfedataframe['Accuracy'][idex] = [acclog[number], accrf[number], accdes[number]]
        rfedataframe['Feature_Importance'][idex] = [feature_importance_logistic[number],
                                                     feature_importance_rf[number],
                                                     feature_importance_decision[number]]

    return rfedataframe




In [6]:
dataset1=pd.read_csv("prep.csv",index_col=None)
df2=dataset1
df2 = pd.get_dummies(df2, drop_first=True)

indep_X=df2.drop('classification_yes', 1)
dep_Y=df2['classification_yes']

In [9]:

# Example of using the modified functions
rfelist = rfeFeature(indep_X, dep_Y, 4)

acclog = []
accrf = []
accdes = []
feature_importance_logistic = []
feature_importance_rf = []
feature_importance_decision = []

for i in rfelist:
    X_train, X_test, y_train, y_test = split_scalar(i[0], dep_Y)

    # Extract feature importance for Logistic Regression
    logistic_classifier = i[2]
    if isinstance(logistic_classifier, LogisticRegression):
        classifier, Accuracy, report, X_test, y_test, cm, importance_logistic = logistic(X_train, y_train, X_test)
        acclog.append(Accuracy)
        feature_importance_logistic.append(importance_logistic)
    else:
        acclog.append(None)
        feature_importance_logistic.append(None)

    # Extract feature importance for Random Forest
    rf_classifier = i[2]
    if isinstance(rf_classifier, RandomForestClassifier):
        classifier, Accuracy, report, X_test, y_test, cm, importance_random = random(X_train, y_train, X_test)
        accrf.append(Accuracy)
        feature_importance_rf.append(importance_random)
    else:
        accrf.append(None)
        feature_importance_rf.append(None)

    # Extract feature importance for Decision Tree
    dt_classifier = i[2]
    if isinstance(dt_classifier, DecisionTreeClassifier):
        classifier, Accuracy, report, X_test, y_test, cm, importance_decision = Decision(X_train, y_train, X_test)
        accdes.append(Accuracy)
        feature_importance_decision.append(importance_decision)
    else:
        accdes.append(None)
        feature_importance_decision.append(None)

# You can access feature importance lists for each algorithm and each iteration: 
# feature_importance_logistic, feature_importance_rf, feature_importance_decision




LogisticRegression()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

SVC(kernel='linear', random_state=0)
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
DecisionTreeClassifier(max_features='sqrt', random_state=0)


In [10]:
# You can also create a dataframe to store the results
result_df = rfe_classification(acclog, accrf, accdes, feature_importance_logistic, feature_importance_rf,
                                feature_importance_decision)
print(result_df)

                        Accuracy  \
Logistic      [0.95, None, None]   
Random        [None, None, None]   
DecisionTree  [None, 0.97, None]   

                                             Feature_Importance  
Logistic      [[2.030199866124361, 1.8843760679656576, 2.050...  
Random                                       [None, None, None]  
DecisionTree  [None, [0.10151051464073124, 0.289967813225265...  
