<a href="https://colab.research.google.com/github/21-LAKSHMI/AI-AND-ML/blob/main/RFE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
import pickle
import matplotlib.pyplot as plt

In [2]:
# Load the Iris CSV from the Colab working directory
iris_csv_path = '/content/Iris.csv'
dataset = pd.read_csv(iris_csv_path)

# Show the first rows to confirm the columns were parsed as expected
print(dataset.head())

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [3]:
# Display the column labels so the feature/target names used below can be checked
dataset.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [4]:
# Feature matrix: the four sepal/petal measurement columns
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
indep_X = dataset[feature_columns]

In [5]:
# Target kept as a one-column DataFrame (list selector), not a Series
dep_Y = dataset.loc[:, ['Species']]

In [6]:
def split_scalar(indep_X,dep_Y):
    """Split the data 75/25 and standardise the feature columns.

    The StandardScaler is fitted on the training rows only and then applied
    to the held-out rows.

    Returns:
        (X_train, X_test, y_train, y_test) where the X parts are the scaled
        outputs of StandardScaler and the y parts are returned exactly as
        train_test_split produced them.
    """
    pieces = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)
    raw_train, raw_test, y_train, y_test = pieces

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(raw_train)
    scaled_test = scaler.transform(raw_test)
    return scaled_train, scaled_test, y_train, y_test

In [7]:
def accuracy_prediction(classifier, X_test, y_test):
    """Predict on X_test with a fitted classifier and return accuracy_score against y_test."""
    predicted = classifier.predict(X_test)
    from sklearn.metrics import accuracy_score
    return accuracy_score(y_test, predicted)

In [8]:
def Logistic(X_train, y_train, X_test, y_test):
    """Fit a LogisticRegression on the training split and return its test accuracy."""
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(random_state = 0)
    model.fit(X_train, y_train)
    return accuracy_prediction(model, X_test, y_test)

In [9]:
def Random_Forest_Classifier(X_train, y_train, X_test, y_test):
    """Fit a 100-tree RandomForestClassifier and return its test accuracy."""
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return accuracy_prediction(forest, X_test, y_test)

In [10]:
def KNN_Classifier(X_train, y_train, X_test, y_test):
    """Fit a 5-neighbour KNeighborsClassifier and return its test accuracy."""
    from sklearn.neighbors import KNeighborsClassifier
    # n_neighbors is fixed at 5 here; it is a candidate for tuning
    knn_model = KNeighborsClassifier(n_neighbors=5)
    knn_model.fit(X_train, y_train)
    return accuracy_prediction(knn_model, X_test, y_test)

In [11]:
def SVM_Classifier(X_train, y_train, X_test, y_test):
    """Fit a linear-kernel SVC and return its test accuracy."""
    from sklearn.svm import SVC
    # Kernel is fixed to 'linear'; kernel and other parameters are candidates for tuning
    svc_model = SVC(kernel='linear', random_state=0)
    svc_model.fit(X_train, y_train)
    return accuracy_prediction(svc_model, X_test, y_test)

In [12]:
def DecisionTree(X_train, y_train, X_test, y_test):
    """Fit an entropy-criterion DecisionTreeClassifier and return its test accuracy."""
    from sklearn.tree import DecisionTreeClassifier
    tree_model = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    tree_model.fit(X_train, y_train)
    return accuracy_prediction(tree_model, X_test, y_test)

In [32]:
def rfeFeature_classification(indep_X, dep_Y, n):
    """Select features with RFE for several classifiers and score each one.

    For every estimator (LogisticRegression, DecisionTreeClassifier,
    RandomForestClassifier, linear SVC, KNeighborsClassifier) this function:
      1. runs RFE on ``indep_X`` to keep ``n`` features,
      2. splits and scales the reduced data with ``split_scalar``,
      3. refits the estimator on the training part and records test accuracy.

    If RFE raises AttributeError or ValueError for an estimator (for example
    one that exposes neither ``coef_`` nor ``feature_importances_``), a message
    is printed and all columns of ``indep_X`` are used for that estimator.

    Parameters
    ----------
    indep_X : pandas.DataFrame
        Feature columns; ``.shape``, ``.columns`` and ``.values`` are used.
    dep_Y : pandas.DataFrame
        Target labels; flattened with ``.values.ravel()`` and label-encoded.
    n : int
        Requested number of features. Values above the column count are
        capped at the column count; zero or negative values fall back to 1.

    Returns
    -------
    tuple(list, list, list)
        ``rfelist`` (reduced feature arrays), ``colnames_list`` (selected
        column names) and ``accuracy_values`` (test accuracies), each with one
        entry per estimator in the order listed above.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import LabelEncoder

    rfelist = []
    colnames_list = []
    accuracy_values = []

    # KNN is kept in the list even though RFE cannot rank features with it;
    # the except branch below falls back to all features for such estimators.
    rfemodellist = [
        LogisticRegression(random_state=0),
        DecisionTreeClassifier(random_state=0),
        RandomForestClassifier(n_estimators=10, random_state=0),
        SVC(kernel='linear', random_state=0),
        KNeighborsClassifier(n_neighbors=2),
    ]

    # Encode the string labels as integers once, before the loop
    dep_Y_encoded = LabelEncoder().fit_transform(dep_Y.values.ravel())
    # split_scalar is handed the target as a one-column DataFrame
    dep_Y_encoded_df = pd.DataFrame(dep_Y_encoded, columns=['Species_Encoded'])

    # The feature count does not depend on the estimator, so compute it once.
    # Using <= 0 (not == 0) stops a negative n from reaching RFE, where the
    # resulting ValueError would be swallowed below and silently disable
    # feature selection for every model.
    n_select = min(n, indep_X.shape[1])
    if n_select <= 0:
        n_select = 1

    for model in rfemodellist:
        # NOTE(review): RFE is fitted on the full, unscaled data before the
        # train/test split, so the selection step sees the test rows.
        try:
            selector = RFE(estimator=model, n_features_to_select=n_select)
            selector.fit(indep_X, dep_Y_encoded)
            reduced_X = selector.transform(indep_X)
            selected_columns = [
                col for col, keep in zip(indep_X.columns, selector.support_) if keep
            ]
        except (AttributeError, ValueError) as e:
            print(f"Could not perform RFE with {type(model).__name__}: {e}")
            # RFE failed for this estimator: keep every feature
            reduced_X = indep_X.values
            selected_columns = indep_X.columns.tolist()

        rfelist.append(reduced_X)
        colnames_list.append(selected_columns)

        # Wrap the reduced array in a DataFrame so the column names travel with it
        reduced_df = pd.DataFrame(reduced_X, columns=selected_columns)
        X_train, X_test, y_train, y_test = split_scalar(reduced_df, dep_Y_encoded_df)

        # Targets are flattened to 1-D for fitting and scoring
        model.fit(X_train, y_train.values.ravel())
        accuracy_values.append(accuracy_prediction(model, X_test, y_test.values.ravel()))

    return rfelist, colnames_list, accuracy_values

# Run RFE asking for two features per model on the iris features/target
rfelist, colnames_list, accuracy_values = rfeFeature_classification(indep_X, dep_Y, 2)

# Report the chosen columns and accuracy per model.
# The labels must stay in the same order as rfemodellist inside rfeFeature_classification.
model_labels = ["Logistic", "Decision", "Random", "SVM", "KNN"]
for model_name, selected_columns, accuracy_value in zip(model_labels, colnames_list, accuracy_values):
    print(f"Model: {model_name}")
    print("Selected Columns:", selected_columns)
    print(f"Accuracy Value: {accuracy_value}\n")

Could not perform RFE with KNeighborsClassifier: when `importance_getter=='auto'`, the underlying estimator KNeighborsClassifier should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.
Model: Logistic
Selected Columns: ['PetalLengthCm', 'PetalWidthCm']
Accuracy Value: 0.9473684210526315

Model: Decision
Selected Columns: ['PetalLengthCm', 'PetalWidthCm']
Accuracy Value: 0.9473684210526315

Model: Random
Selected Columns: ['PetalLengthCm', 'PetalWidthCm']
Accuracy Value: 0.9736842105263158

Model: SVM
Selected Columns: ['PetalLengthCm', 'PetalWidthCm']
Accuracy Value: 0.9736842105263158

Model: KNN
Selected Columns: ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
Accuracy Value: 0.9473684210526315



In [14]:
# Decision tree on a 70/30 split of indep_X / dep_Y
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.30, random_state=0)

# criterion may be 'gini' or 'entropy'; 'gini' with the 'best' splitter is used here
classifier_dt = DecisionTreeClassifier(criterion='gini', splitter='best', random_state=0)
classifier_dt.fit(X_train, y_train)

# Score the fitted tree on the held-out rows via accuracy_prediction
accuracy_dt = accuracy_prediction(classifier_dt, X_test, y_test)
print(f"Decision Tree Classifier Accuracy: {accuracy_dt}")

Decision Tree Classifier Accuracy: 0.9777777777777777


In [15]:
# Predict labels for the held-out rows with the decision tree fitted in the previous cell
y_pred = classifier_dt.predict(X_test)

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compare y_pred from the previous cell with the true labels in y_test.
# average='weighted' is used because the target is multiclass.
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Precision: 0.9796296296296295
Recall: 0.9777777777777777
F1-score: 0.9779434092477569


In [17]:
# Random forest on a 70/30 split of indep_X / dep_Y
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.30, random_state=0)

from sklearn.ensemble import RandomForestClassifier
# RandomForestClassifier has no 'splitter' parameter, so only criterion/random_state are set
classifier_rf = RandomForestClassifier(criterion='gini', random_state=0)
# y_train is a one-column DataFrame; flatten it to 1-D so fit() does not emit
# a DataConversionWarning about a column-vector y
classifier_rf.fit(X_train, y_train.values.ravel())
# Later cells read the forest through the name classifier_dt, so keep that binding
classifier_dt = classifier_rf

# Score the fitted forest on the held-out rows via accuracy_prediction
accuracy_dt = accuracy_prediction(classifier_rf, X_test, y_test)
print(f"Random Forest Classifier Accuracy: {accuracy_dt}")

  return fit_method(estimator, *args, **kwargs)


Random Forest Classifier Accuracy: 0.9777777777777777


In [18]:
# classifier_dt was rebound to the RandomForestClassifier in the previous cell,
# so these are the random-forest predictions for the held-out rows
y_pred = classifier_dt.predict(X_test)

In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compare y_pred from the previous cell with the true labels in y_test.
# average='weighted' is used because the target is multiclass.
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Precision: 0.9796296296296295
Recall: 0.9777777777777777
F1-score: 0.9779434092477569


In [20]:
# Linear SVM on a 70/30 split of indep_X / dep_Y
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.30, random_state=0)

from sklearn.svm import SVC
# Common kernels are 'linear', 'rbf', 'poly', 'sigmoid'; 'linear' is used here
classifier_svm = SVC(kernel='linear', random_state=0)
# y_train is a one-column DataFrame; flatten it to 1-D so fit() does not emit
# a DataConversionWarning about a column-vector y
classifier_svm.fit(X_train, y_train.values.ravel())

# Score the fitted SVM on the held-out rows via accuracy_prediction
accuracy_svm = accuracy_prediction(classifier_svm, X_test, y_test)
print(f"SVM Classifier Accuracy: {accuracy_svm}")

SVM Classifier Accuracy: 0.9777777777777777


  y = column_or_1d(y, warn=True)


In [21]:
# Predict with the SVM fitted in the previous cell. The original used
# classifier_dt here, which made the SVM metrics a copy of another model's scores.
y_pred = classifier_svm.predict(X_test)

In [22]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compare y_pred from the previous cell with the true labels in y_test.
# average='weighted' is used because the target is multiclass.
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Precision: 0.9796296296296295
Recall: 0.9777777777777777
F1-score: 0.9779434092477569


In [23]:
# K-nearest-neighbours on a 70/30 split of indep_X / dep_Y
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.30, random_state=0)

from sklearn.neighbors import KNeighborsClassifier
# KNeighborsClassifier takes neither 'kernel' nor 'random_state'; only n_neighbors is set
classifier_knn = KNeighborsClassifier(n_neighbors=5)
# y_train is a one-column DataFrame; flatten it to 1-D so fit() does not emit
# a DataConversionWarning about a column-vector y
classifier_knn.fit(X_train, y_train.values.ravel())

# Score the fitted KNN model on the held-out rows via accuracy_prediction
accuracy_knn = accuracy_prediction(classifier_knn, X_test, y_test)
print(f"KNN Classifier Accuracy: {accuracy_knn}")

KNN Classifier Accuracy: 0.9777777777777777


  return self._fit(X, y)


In [24]:
# Predict with the KNN model fitted in the previous cell. The original used
# classifier_dt here, which made the KNN metrics a copy of another model's scores.
y_pred = classifier_knn.predict(X_test)

In [25]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compare y_pred from the previous cell with the true labels in y_test.
# average='weighted' is used because the target is multiclass.
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Precision: 0.9796296296296295
Recall: 0.9777777777777777
F1-score: 0.9779434092477569


In [26]:
# Logistic regression on a 70/30 split of indep_X / dep_Y
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.30, random_state=0)

from sklearn.linear_model import LogisticRegression
classifier_lr = LogisticRegression(random_state=0)
# y_train is a one-column DataFrame; flatten it to 1-D so fit() does not emit
# a DataConversionWarning about a column-vector y
classifier_lr.fit(X_train, y_train.values.ravel())

# Score the fitted model on the held-out rows via accuracy_prediction
accuracy_lr = accuracy_prediction(classifier_lr, X_test, y_test)
print(f"Logistic Regression Accuracy: {accuracy_lr}")

  y = column_or_1d(y, warn=True)


Logistic Regression Accuracy: 0.9777777777777777


In [27]:
# Predict with the logistic regression fitted in the previous cell. The original used
# classifier_dt here, which made these metrics a copy of another model's scores.
y_pred = classifier_lr.predict(X_test)

In [28]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compare y_pred from the previous cell with the true labels in y_test.
# average='weighted' is used because the target is multiclass.
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Precision: 0.9796296296296295
Recall: 0.9777777777777777
F1-score: 0.9779434092477569


In [29]:
import pickle
# File name the trained model is pickled to in the following cell
Finalised_Model="Finalized_model.sav"

In [31]:
# Write the pickle inside a context manager so the file handle is flushed and closed
# (the original passed a bare open() result that was never closed).
# NOTE(review): classifier_dt was last rebound in the random-forest cell, so that is
# the estimator saved here; confirm this is the intended final model.
with open(Finalised_Model, 'wb') as model_file:
    pickle.dump(classifier_dt, model_file)