In [1]:
import numpy as np
import math

In [2]:
def leave_one_out_cross_validation(data, current_features, feature_to_add, choice):
    """Leave-one-out accuracy of a 1-nearest-neighbour classifier on a feature subset.

    Each row is held out in turn and classified with the label of the closest
    remaining row (Euclidean distance over the chosen feature columns).

    Args:
        data: 2-D array-like; column 0 holds the class label, the other
            columns hold feature values.
        current_features: List of column indices currently in the subset.
        feature_to_add: Column index to add (choice == 1) or to remove (any
            other choice). None leaves current_features unchanged.
        choice: 1 evaluates current_features plus feature_to_add; any other
            value evaluates current_features without feature_to_add.

    Returns:
        float: Fraction of rows whose nearest neighbour has the same label.
        Returns 0.0 when there are fewer than two rows, because no neighbour
        exists to compare against.
    """
    if choice == 1:
        temp_features = list(current_features)
        # None means "score the current set as-is" rather than an invalid index.
        if feature_to_add is not None:
            temp_features.append(feature_to_add)
    else:
        temp_features = [f for f in current_features if f != feature_to_add]

    data_array = np.asarray(data)
    num_instances = len(data_array)
    if num_instances < 2:
        return 0.0

    features_array = data_array[:, temp_features]
    labels_array = data_array[:, 0]

    num_correct_classified = 0
    for x in range(num_instances):
        # Distance from the held-out row to every row (itself included).
        # With an empty feature subset every distance is 0, so argmin simply
        # picks the first other row.
        distances = np.sqrt(np.sum(np.square(features_array - features_array[x]), axis=1))
        # Mask the held-out row instead of copying the training set with
        # np.delete; row order is unchanged, so ties resolve the same way.
        distances[x] = np.inf
        nearest_neighbor_index = np.argmin(distances)

        if labels_array[x] == labels_array[nearest_neighbor_index]:
            num_correct_classified += 1

    return num_correct_classified / num_instances


In [3]:
def forward_selection(data, choice):
    """Greedy forward feature selection scored by leave-one-out accuracy.

    Starting from an empty set, each level evaluates every feature that is
    not yet selected, adds the one whose inclusion scores highest, and keeps
    track of the best-scoring subset seen across all levels. Progress is
    printed while the search runs.

    Args:
        data: 2-D array-like indexed as data[row][column]; column 0 is used
            as the class label by leave_one_out_cross_validation and columns
            1 .. len(data[0]) - 1 are the candidate features.
        choice: Menu selection made by the caller. Kept so existing calls
            keep working; candidates are always scored in "add" mode.

    Returns:
        tuple: (best_features, best_accuracy) for the best subset found.
    """
    # leave_one_out_cross_validation only adds the candidate when its mode
    # argument is 1, so forward selection must not depend on the caller's value.
    add_mode = 1

    selected_features, best_features = [], []
    num_features, best_accuracy = len(data[0]), 0

    for x in range(1, num_features):
        feature_to_add = None
        best_so_far_accuracy = 0

        for feature in range(1, num_features):
            if feature in selected_features:
                continue
            accuracy = leave_one_out_cross_validation(data, selected_features, feature, add_mode)
            new_features = selected_features + [feature]
            accuracy_str = f"{accuracy:.4f}"
            print(f"Using feature(s) {new_features}, accuracy is {accuracy_str}")

            if accuracy > best_so_far_accuracy:
                best_so_far_accuracy = accuracy
                feature_to_add = feature

        if feature_to_add is not None:
            selected_features.append(feature_to_add)
            accuracy_str = f"{best_so_far_accuracy:.4f}"
            print(f"\nOn level {x}, added feature {feature_to_add} to selected set, accuracy is {accuracy_str}")
            print(f"Selected set: {selected_features}\n")

        # Remember the best subset over all levels, not just the last one.
        if best_so_far_accuracy > best_accuracy:
            best_features = selected_features[:]
            best_accuracy = best_so_far_accuracy

    accuracy_str = f"{best_accuracy:.4f}"
    print(f"Finished search!! The best feature subset is {best_features}, which has an accuracy of {accuracy_str}")
    return best_features, best_accuracy

In [4]:
def backward_elimination(data, choice):
    """Greedy backward feature elimination scored by leave-one-out accuracy.

    Starting from all features, each level evaluates the removal of every
    remaining feature, drops the one whose removal scores highest, and keeps
    track of the best-scoring subset seen (the full set included). The search
    stops when a single feature remains, because an empty subset cannot be
    scored meaningfully by a nearest-neighbour classifier. Progress is
    printed while the search runs.

    Args:
        data: 2-D array-like indexed as data[row][column]; column 0 is used
            as the class label by leave_one_out_cross_validation and columns
            1 .. len(data[0]) - 1 are the features.
        choice: Menu selection made by the caller. Kept so existing calls
            keep working; candidates are always scored in "remove" mode.

    Returns:
        tuple: (best_features, best_accuracy) for the best subset found.
    """
    # Any value other than 1 makes leave_one_out_cross_validation drop the
    # candidate; fix it here so the search does not depend on the caller's value.
    remove_mode = 2

    num_features = len(data[0])
    current_features = list(range(1, num_features))

    # Score the full feature set first so it can win if no removal improves on it.
    best_features = current_features[:]
    if current_features:
        best_accuracy = leave_one_out_cross_validation(data, current_features, None, remove_mode)
    else:
        best_accuracy = 0

    # num_features - 2 levels take the set from all features down to one.
    for x in range(1, num_features - 1):
        feature_to_remove = None
        best_so_far_accuracy = 0

        for feature in current_features:
            accuracy = leave_one_out_cross_validation(data, current_features, feature, remove_mode)
            if accuracy > best_so_far_accuracy:
                best_so_far_accuracy = accuracy
                feature_to_remove = feature

        if feature_to_remove is not None:
            current_features.remove(feature_to_remove)
            accuracy_str = f"{best_so_far_accuracy:.4f}"
            print(f"\nOn level {x}, removed feature {feature_to_remove} from current set, accuracy is {accuracy_str}")
            print(f"Feature set {current_features} was best, accuracy is {accuracy_str} \n")

        if best_so_far_accuracy > best_accuracy:
            best_features = current_features[:]
            best_accuracy = best_so_far_accuracy

    accuracy_str = f"{best_accuracy:.4f}"
    print(f"Finished search!! The best feature subset is {best_features}, which has an accuracy of {accuracy_str}")
    return best_features, best_accuracy

Forward Selection for small dataset

In [6]:
print("Welcome to the Feature Selection Algorithm")

data = np.loadtxt("CS170_small_Data__22.txt", dtype=float)

print("Please choose an algorithm by entering:")
print("1) Forward Selection")
print("2) Backward Elimination")

# Non-numeric input is routed to the "Invalid choice" message instead of raising ValueError.
try:
    choice = int(input())
except ValueError:
    choice = None

if choice in (1, 2):
    print(f"\nThis dataset has {len(data[0]) - 1} features (not including class attribute), with {len(data)} instances.\n")
    # 1 -> forward selection, 2 -> backward elimination.
    run_search = forward_selection if choice == 1 else backward_elimination
    run_search(data, choice)
else:
    print("Invalid choice. Please choose either 1 or 2!")

Welcome to the Feature Selection Algorithm
Please choose an algorithm by entering:
1) Forward Selection
2) Backward Elimination
1

This dataset has 10 features (not including class attribute), with 1000 instances.

Using feature(s) [1], accuracy is 0.6750
Using feature(s) [2], accuracy is 0.6790
Using feature(s) [3], accuracy is 0.6820
Using feature(s) [4], accuracy is 0.6820
Using feature(s) [5], accuracy is 0.6880
Using feature(s) [6], accuracy is 0.8400
Using feature(s) [7], accuracy is 0.7450
Using feature(s) [8], accuracy is 0.6940
Using feature(s) [9], accuracy is 0.6930
Using feature(s) [10], accuracy is 0.6880

On level 1, added feature 6 to selected set, accuracy is 0.8400
Selected set: [6]

Using feature(s) [6, 1], accuracy is 0.8450
Using feature(s) [6, 2], accuracy is 0.8280
Using feature(s) [6, 3], accuracy is 0.8300
Using feature(s) [6, 4], accuracy is 0.8220
Using feature(s) [6, 5], accuracy is 0.8540
Using feature(s) [6, 7], accuracy is 0.9820
Using feature(s) [6, 8], a

Backward Elimination for small dataset

In [23]:
print("Welcome to the Feature Selection Algorithm")

data = np.loadtxt("CS170_small_Data__22.txt", dtype=float)

print("Please choose an algorithm by entering:")
print("1) Forward Selection")
print("2) Backward Elimination")

# Non-numeric input is routed to the "Invalid choice" message instead of raising ValueError.
try:
    choice = int(input())
except ValueError:
    choice = None

if choice in (1, 2):
    print(f"\nThis dataset has {len(data[0]) - 1} features (not including class attribute), with {len(data)} instances.\n")
    # 1 -> forward selection, 2 -> backward elimination.
    run_search = forward_selection if choice == 1 else backward_elimination
    run_search(data, choice)
else:
    print("Invalid choice. Please choose either 1 or 2!")

Welcome to the Feature Selection Algorithm
Please choose an algorithm by entering:
1) Forward Selection
2) Backward Elimination
2

This dataset has 10 features (not including class attribute), with 1000 instances.


On level 1, removed feature 3 from current set, accuracy is 0.7800
Feature set [1, 2, 4, 5, 6, 7, 8, 9, 10] was best, accuracy is 0.7800 


On level 2, removed feature 2 from current set, accuracy is 0.7960
Feature set [1, 4, 5, 6, 7, 8, 9, 10] was best, accuracy is 0.7960 


On level 3, removed feature 1 from current set, accuracy is 0.8330
Feature set [4, 5, 6, 7, 8, 9, 10] was best, accuracy is 0.8330 


On level 4, removed feature 8 from current set, accuracy is 0.8570
Feature set [4, 5, 6, 7, 9, 10] was best, accuracy is 0.8570 


On level 5, removed feature 5 from current set, accuracy is 0.8840
Feature set [4, 6, 7, 9, 10] was best, accuracy is 0.8840 


On level 6, removed feature 10 from current set, accuracy is 0.9140
Feature set [4, 6, 7, 9] was best, accuracy is

Forward Selection for large data

In [14]:
print("Welcome to the Feature Selection Algorithm")

data = np.loadtxt("CS170_large_Data__31.txt", dtype=float)

print("Please choose an algorithm by entering:")
print("1) Forward Selection")
print("2) Backward Elimination")

# Non-numeric input is routed to the "Invalid choice" message instead of raising ValueError.
try:
    choice = int(input())
except ValueError:
    choice = None

if choice in (1, 2):
    print(f"\nThis dataset has {len(data[0]) - 1} features (not including class attribute), with {len(data)} instances.\n")
    # 1 -> forward selection, 2 -> backward elimination.
    run_search = forward_selection if choice == 1 else backward_elimination
    run_search(data, choice)
else:
    print("Invalid choice. Please choose either 1 or 2!")

Welcome to the Feature Selection Algorithm
Please choose an algorithm by entering:
1) Forward Selection
2) Backward Elimination
1

This dataset has 20 features (not including class attribute), with 2000 instances.

Using feature(s) [1], accuracy is 0.7005
Using feature(s) [2], accuracy is 0.6910
Using feature(s) [3], accuracy is 0.6885
Using feature(s) [4], accuracy is 0.6895
Using feature(s) [5], accuracy is 0.7155
Using feature(s) [6], accuracy is 0.6910
Using feature(s) [7], accuracy is 0.7105
Using feature(s) [8], accuracy is 0.6985
Using feature(s) [9], accuracy is 0.6965
Using feature(s) [10], accuracy is 0.7205
Using feature(s) [11], accuracy is 0.6980
Using feature(s) [12], accuracy is 0.7025
Using feature(s) [13], accuracy is 0.7055
Using feature(s) [14], accuracy is 0.7020
Using feature(s) [15], accuracy is 0.6930
Using feature(s) [16], accuracy is 0.7110
Using feature(s) [17], accuracy is 0.6985
Using feature(s) [18], accuracy is 0.7185
Using feature(s) [19], accuracy is 0.6

Backward Elimination for large dataset

In [24]:
print("Welcome to the Feature Selection Algorithm")

data = np.loadtxt("CS170_large_Data__31.txt", dtype=float)

print("Please choose an algorithm by entering:")
print("1) Forward Selection")
print("2) Backward Elimination")

# Non-numeric input is routed to the "Invalid choice" message instead of raising ValueError.
try:
    choice = int(input())
except ValueError:
    choice = None

if choice in (1, 2):
    print(f"\nThis dataset has {len(data[0]) - 1} features (not including class attribute), with {len(data)} instances.\n")
    # 1 -> forward selection, 2 -> backward elimination.
    run_search = forward_selection if choice == 1 else backward_elimination
    run_search(data, choice)
else:
    print("Invalid choice. Please choose either 1 or 2!")

Welcome to the Feature Selection Algorithm
Please choose an algorithm by entering:
1) Forward Selection
2) Backward Elimination
2

This dataset has 20 features (not including class attribute), with 2000 instances.


On level 1, removed feature 14 from current set, accuracy is 0.7305
Feature set [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20] was best, accuracy is 0.7305 


On level 2, removed feature 6 from current set, accuracy is 0.7380
Feature set [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20] was best, accuracy is 0.7380 


On level 3, removed feature 7 from current set, accuracy is 0.7455
Feature set [1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20] was best, accuracy is 0.7455 


On level 4, removed feature 3 from current set, accuracy is 0.7525
Feature set [1, 2, 4, 5, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20] was best, accuracy is 0.7525 


On level 5, removed feature 2 from current set, accuracy is 0.7640
Feature set [1, 4,

Forward Selection for XXXlarge data

In [59]:
print("Welcome to the Feature Selection Algorithm")

data = np.genfromtxt("CS170_XXXlarge_Data__12.txt", dtype=float)

print("Please choose an algorithm by entering:")
print("1) Forward Selection")
print("2) Backward Elimination")

# Non-numeric input is routed to the "Invalid choice" message instead of raising ValueError.
try:
    choice = int(input())
except ValueError:
    choice = None

if choice in (1, 2):
    print(f"\nThis dataset has {len(data[0]) - 1} features (not including class attribute), with {len(data)} instances.\n")
    # 1 -> forward selection, 2 -> backward elimination.
    run_search = forward_selection if choice == 1 else backward_elimination
    run_search(data, choice)
else:
    print("Invalid choice. Please choose either 1 or 2!")

Welcome to the Feature Selection Algorithm
Please choose an algorithm by entering:
1) Forward Selection
2) Backward Elimination

This dataset has 80 features (not including class attribute), with 4000 instances.

Using feature(s) [1], accuracy is 0.701
Using feature(s) [2], accuracy is 0.703
Using feature(s) [3], accuracy is 0.6955
Using feature(s) [4], accuracy is 0.7165
Using feature(s) [5], accuracy is 0.7055
Using feature(s) [6], accuracy is 0.6985
Using feature(s) [7], accuracy is 0.6955
Using feature(s) [8], accuracy is 0.709
Using feature(s) [9], accuracy is 0.71275
Using feature(s) [10], accuracy is 0.8445
Using feature(s) [11], accuracy is 0.70325
Using feature(s) [12], accuracy is 0.70725
Using feature(s) [13], accuracy is 0.70625
Using feature(s) [14], accuracy is 0.7095
Using feature(s) [15], accuracy is 0.7
Using feature(s) [16], accuracy is 0.698
Using feature(s) [17], accuracy is 0.7095
Using feature(s) [18], accuracy is 0.70625
Using feature(s) [19], accuracy is 0.704
U

In [None]:
print("Welcome to the Feature Selection Algorithm")

data = np.genfromtxt("CS170_XXXlarge_Data__12.txt", dtype=float)

print("Please choose an algorithm by entering:")
print("1) Forward Selection")
print("2) Backward Elimination")

# Non-numeric input is routed to the "Invalid choice" message instead of raising ValueError.
try:
    choice = int(input())
except ValueError:
    choice = None

if choice in (1, 2):
    print(f"\nThis dataset has {len(data[0]) - 1} features (not including class attribute), with {len(data)} instances.\n")
    # 1 -> forward selection, 2 -> backward elimination.
    run_search = forward_selection if choice == 1 else backward_elimination
    run_search(data, choice)
else:
    print("Invalid choice. Please choose either 1 or 2!")

In [8]:
#adding early stopping for xlarge dataset
def forward_selection(data, choice, early_stopping_threshold=0.01):
    """Greedy forward feature selection with early stopping.

    Starting from an empty set, each level evaluates every feature that is
    not yet selected, adds the one whose inclusion scores highest, and keeps
    track of the best-scoring subset seen across all levels. The search ends
    early once a level's best accuracy falls more than
    early_stopping_threshold below the best accuracy seen so far. Progress is
    printed while the search runs.

    Args:
        data: 2-D array-like indexed as data[row][column]; column 0 is used
            as the class label by leave_one_out_cross_validation and columns
            1 .. len(data[0]) - 1 are the candidate features.
        choice: Menu selection made by the caller. Kept so existing calls
            keep working; candidates are always scored in "add" mode.
        early_stopping_threshold: Largest tolerated drop below the best
            accuracy before the search stops.

    Returns:
        tuple: (best_features, best_accuracy) for the best subset found.
    """
    # leave_one_out_cross_validation only adds the candidate when its mode
    # argument is 1, so forward selection must not depend on the caller's value.
    add_mode = 1

    selected_features, best_features = [], []
    num_features, best_accuracy = len(data[0]), 0

    for x in range(1, num_features):
        feature_to_add = None
        best_so_far_accuracy = 0

        for feature in range(1, num_features):
            if feature in selected_features:
                continue
            accuracy = leave_one_out_cross_validation(data, selected_features, feature, add_mode)
            new_features = selected_features + [feature]
            accuracy_str = f"{accuracy:.4f}"
            print(f"Using feature(s) {new_features}, accuracy is {accuracy_str}")
            if accuracy > best_so_far_accuracy:
                best_so_far_accuracy = accuracy
                feature_to_add = feature

        if feature_to_add is not None:
            selected_features.append(feature_to_add)
            accuracy_str = f"{best_so_far_accuracy:.4f}"
            print(f"\nOn level {x}, added feature {feature_to_add} to selected set, accuracy is {accuracy_str}")
            print(f"Selected set: {selected_features}\n")

        if best_so_far_accuracy > best_accuracy:
            best_features = selected_features[:]
            best_accuracy = best_so_far_accuracy
        elif best_accuracy - best_so_far_accuracy > early_stopping_threshold:
            # This level scored noticeably worse than the best subset so far.
            print(f"Stopping early at level {x} due to lack of improvement in accuracy")
            break

    accuracy_str = f"{best_accuracy:.4f}"
    print(f"Finished search!! The best feature subset is {best_features}, which has an accuracy of {accuracy_str}")
    return best_features, best_accuracy


In [9]:
def backward_elimination(data, choice, early_stopping_threshold=0.01):
    """Greedy backward feature elimination with early stopping.

    Starting from all features (scored first as the baseline), each level
    evaluates the removal of every remaining feature, drops the one whose
    removal scores highest, and keeps track of the best-scoring subset seen.
    The search ends early once a level's best accuracy falls more than
    early_stopping_threshold below the best accuracy seen so far, and in any
    case when a single feature remains, because an empty subset cannot be
    scored meaningfully by a nearest-neighbour classifier. Progress is
    printed while the search runs.

    Args:
        data: 2-D array-like indexed as data[row][column]; column 0 is used
            as the class label by leave_one_out_cross_validation and columns
            1 .. len(data[0]) - 1 are the features.
        choice: Menu selection made by the caller. Kept so existing calls
            keep working; candidates are always scored in "remove" mode.
        early_stopping_threshold: Largest tolerated drop below the best
            accuracy before the search stops.

    Returns:
        tuple: (best_features, best_accuracy) for the best subset found.
    """
    # Any value other than 1 makes leave_one_out_cross_validation drop the
    # candidate; fix it here so the search does not depend on the caller's value.
    remove_mode = 2

    num_features = len(data[0])
    current_features = list(range(1, num_features))
    best_features = current_features[:]
    if current_features:
        # Baseline: accuracy with every feature present.
        best_accuracy = leave_one_out_cross_validation(data, current_features, None, remove_mode)
    else:
        best_accuracy = 0

    # num_features - 2 levels take the set from all features down to one.
    for x in range(1, num_features - 1):
        feature_to_remove = None
        best_so_far_accuracy = 0

        for feature in current_features:
            accuracy = leave_one_out_cross_validation(data, current_features, feature, remove_mode)
            if accuracy > best_so_far_accuracy:
                best_so_far_accuracy = accuracy
                feature_to_remove = feature

        if feature_to_remove is not None:
            current_features.remove(feature_to_remove)
            accuracy_str = f"{best_so_far_accuracy:.4f}"
            print(f"\nOn level {x}, removed feature {feature_to_remove} from current set, accuracy is {accuracy_str}")
            print(f"Feature set {current_features} was best, accuracy is {accuracy_str} \n")

        if best_so_far_accuracy > best_accuracy:
            best_features = current_features[:]
            best_accuracy = best_so_far_accuracy
        elif best_accuracy - best_so_far_accuracy > early_stopping_threshold:
            # This level scored noticeably worse than the best subset so far.
            print(f"Stopping early at level {x} due to lack of improvement in accuracy")
            break

    accuracy_str = f"{best_accuracy:.4f}"
    print(f"Finished search!! The best feature subset is {best_features}, which has an accuracy of {accuracy_str}")
    return best_features, best_accuracy


Forward Selection for XXXlarge data with early stopping

In [18]:
early_stopping_threshold = 0.01
print("Welcome to the Feature Selection Algorithm")

data = np.genfromtxt("CS170_XXXlarge_Data__12.txt", dtype=float)

print("Please choose an algorithm by entering:")
print("1) Forward Selection")
print("2) Backward Elimination")

# Non-numeric input is routed to the "Invalid choice" message instead of raising ValueError.
try:
    choice = int(input())
except ValueError:
    choice = None

if choice in (1, 2):
    print(f"\nThis dataset has {len(data[0]) - 1} features (not including class attribute), with {len(data)} instances.\n")
    # 1 -> forward selection, 2 -> backward elimination.
    run_search = forward_selection if choice == 1 else backward_elimination
    run_search(data, choice, early_stopping_threshold)
else:
    print("Invalid choice. Please choose either 1 or 2!")

Welcome to the Feature Selection Algorithm
Please choose an algorithm by entering:
1) Forward Selection
2) Backward Elimination
1

This dataset has 80 features (not including class attribute), with 4000 instances.

Using feature(s) [1], accuracy is 0.7010
Using feature(s) [2], accuracy is 0.7030
Using feature(s) [3], accuracy is 0.6955
Using feature(s) [4], accuracy is 0.7165
Using feature(s) [5], accuracy is 0.7055
Using feature(s) [6], accuracy is 0.6985
Using feature(s) [7], accuracy is 0.6955
Using feature(s) [8], accuracy is 0.7090
Using feature(s) [9], accuracy is 0.7127
Using feature(s) [10], accuracy is 0.8445
Using feature(s) [11], accuracy is 0.7033
Using feature(s) [12], accuracy is 0.7073
Using feature(s) [13], accuracy is 0.7063
Using feature(s) [14], accuracy is 0.7095
Using feature(s) [15], accuracy is 0.7000
Using feature(s) [16], accuracy is 0.6980
Using feature(s) [17], accuracy is 0.7095
Using feature(s) [18], accuracy is 0.7063
Using feature(s) [19], accuracy is 0.7

Backward Elimination for XXXlarge data with early stopping

In [11]:
early_stopping_threshold = 0.01
print("Welcome to the Feature Selection Algorithm")

data = np.genfromtxt("CS170_XXXlarge_Data__12.txt", dtype=float)

print("Please choose an algorithm by entering:")
print("1) Forward Selection")
print("2) Backward Elimination")

# Non-numeric input is routed to the "Invalid choice" message instead of raising ValueError.
try:
    choice = int(input())
except ValueError:
    choice = None

if choice in (1, 2):
    print(f"\nThis dataset has {len(data[0]) - 1} features (not including class attribute), with {len(data)} instances.\n")
    # 1 -> forward selection, 2 -> backward elimination.
    run_search = forward_selection if choice == 1 else backward_elimination
    run_search(data, choice, early_stopping_threshold)
else:
    print("Invalid choice. Please choose either 1 or 2!")

Welcome to the Feature Selection Algorithm
Please choose an algorithm by entering:
1) Forward Selection
2) Backward Elimination

This dataset has 80 features (not including class attribute), with 4000 instances.


On level 1, removed feature 10 from current set, accuracy is 0.7007
Feature set [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80] was best, accuracy is 0.7007 


On level 2, removed feature 44 from current set, accuracy is 0.7047
Feature set [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76

Working on real-world data

In [25]:
import pandas as pd
# Read the yeast dataset from a relative path; this rebinds `data` to a pandas DataFrame.
data = pd.read_csv('yeast.csv')

In [26]:
# Show the loaded DataFrame as the cell output.
data

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,name
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT
...,...,...,...,...,...,...,...,...,...
1479,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,ME2
1480,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,NUC
1481,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,ME2
1482,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,NUC


Encoding the class from categorical to numerical


In [27]:
from sklearn.preprocessing import LabelEncoder
name_encoder=LabelEncoder()
# Overwrite the "name" column in place with the encoder's output for the original class names.
data["name"]=name_encoder.fit_transform(data["name"].values)

The class column is moved to the front because the feature-selection code above expects the first column to be the class label and the remaining columns to be the features.

In [28]:
# Move the "name" column to position 0: pop() removes it from the frame in place,
# insert() puts it back as the first column.
last_col = data.pop('name')
data.insert(0, 'name', last_col)

In [29]:
# Show the DataFrame again to check that "name" is now the first column.
data

Unnamed: 0,name,mcg,gvh,alm,mit,erl,pox,vac,nuc
0,6,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22
1,6,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22
2,6,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22
3,7,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22
4,6,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22
...,...,...,...,...,...,...,...,...,...
1479,4,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22
1480,7,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47
1481,4,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22
1482,7,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39


Z-normalization

In [30]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Column 0 is the encoded class label, so only the feature columns (1 onward)
# are z-normalised; the label codes are kept as they are.
normalized_data = data.to_numpy(dtype=float, copy=True)
normalized_data[:, 1:] = scaler.fit_transform(normalized_data[:, 1:])

In [31]:
print("Welcome to the Feature Selection Algorithm")

data = normalized_data

print("Please choose an algorithm by entering:")
print("1) Forward Selection")
print("2) Backward Elimination")

# Non-numeric input is routed to the "Invalid choice" message instead of raising ValueError.
try:
    choice = int(input())
except ValueError:
    choice = None

if choice in (1, 2):
    print(f"\nThis dataset has {len(data[0]) - 1} features (not including class attribute), with {len(data)} instances.\n")
    # 1 -> forward selection, 2 -> backward elimination.
    run_search = forward_selection if choice == 1 else backward_elimination
    run_search(data, choice)
else:
    print("Invalid choice. Please choose either 1 or 2!")

Welcome to the Feature Selection Algorithm
Please choose an algorithm by entering:
1) Forward Selection
2) Backward Elimination
1

This dataset has 8 features (not including class attribute), with 1484 instances.

Using feature(s) [1], accuracy is 0.2689
Using feature(s) [2], accuracy is 0.2729
Using feature(s) [3], accuracy is 0.3248
Using feature(s) [4], accuracy is 0.2473
Using feature(s) [5], accuracy is 0.1678
Using feature(s) [6], accuracy is 0.1631
Using feature(s) [7], accuracy is 0.2547
Using feature(s) [8], accuracy is 0.2749

On level 1, added feature 3 to selected set, accuracy is 0.3248
Selected set: [3]

Using feature(s) [3, 1], accuracy is 0.3888
Using feature(s) [3, 2], accuracy is 0.3753
Using feature(s) [3, 4], accuracy is 0.3976
Using feature(s) [3, 5], accuracy is 0.3248
Using feature(s) [3, 6], accuracy is 0.3086
Using feature(s) [3, 7], accuracy is 0.3275
Using feature(s) [3, 8], accuracy is 0.3733

On level 2, added feature 4 to selected set, accuracy is 0.3976
S

In [33]:
# Read the rice dataset from a relative path into its own DataFrame and display it.
# NOTE(review): encoding='' is an empty string; presumably pandas falls back to its
# default encoding here. Confirm, or drop the argument.
data1=pd.read_csv('rice.csv',encoding='')
data1

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,15231,525.578979,229.749878,85.093788,0.928882,15617,0.572896,Cammeo
1,14656,494.311005,206.020065,91.730972,0.895405,15072,0.615436,Cammeo
2,14634,501.122009,214.106781,87.768288,0.912118,14954,0.693259,Cammeo
3,13176,458.342987,193.337387,87.448395,0.891861,13368,0.640669,Cammeo
4,14688,507.166992,211.743378,89.312454,0.906691,15262,0.646024,Cammeo
...,...,...,...,...,...,...,...,...
3805,11441,415.858002,170.486771,85.756592,0.864280,11628,0.681012,Osmancik
3806,11625,421.390015,167.714798,89.462570,0.845850,11904,0.694279,Osmancik
3807,12437,442.498993,183.572922,86.801979,0.881144,12645,0.626739,Osmancik
3808,9882,392.296997,161.193985,78.210480,0.874406,10097,0.659064,Osmancik


In [34]:
from sklearn.preprocessing import LabelEncoder
# Rebinds name_encoder to a fresh LabelEncoder for the rice data.
name_encoder=LabelEncoder()
# Overwrite the "Class" column in place with the encoder's output for the original class names.
data1["Class"]=name_encoder.fit_transform(data1["Class"].values)

In [35]:
# Show the DataFrame to check the encoded "Class" column.
data1

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,15231,525.578979,229.749878,85.093788,0.928882,15617,0.572896,0
1,14656,494.311005,206.020065,91.730972,0.895405,15072,0.615436,0
2,14634,501.122009,214.106781,87.768288,0.912118,14954,0.693259,0
3,13176,458.342987,193.337387,87.448395,0.891861,13368,0.640669,0
4,14688,507.166992,211.743378,89.312454,0.906691,15262,0.646024,0
...,...,...,...,...,...,...,...,...
3805,11441,415.858002,170.486771,85.756592,0.864280,11628,0.681012,1
3806,11625,421.390015,167.714798,89.462570,0.845850,11904,0.694279,1
3807,12437,442.498993,183.572922,86.801979,0.881144,12645,0.626739,1
3808,9882,392.296997,161.193985,78.210480,0.874406,10097,0.659064,1


In [36]:
# Move the "Class" column to position 0: pop() removes it from the frame in place,
# insert() puts it back as the first column.
last_col = data1.pop('Class')
data1.insert(0, 'Class', last_col)

In [37]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Column 0 is the encoded class label, so only the feature columns (1 onward)
# are z-normalised; the label codes are kept as they are.
normalized_data1 = data1.to_numpy(dtype=float, copy=True)
normalized_data1[:, 1:] = scaler.fit_transform(normalized_data1[:, 1:])

In [38]:
print("Welcome to the Feature Selection Algorithm")

data = normalized_data1

print("Please choose an algorithm by entering:")
print("1) Forward Selection")
print("2) Backward Elimination")

# Non-numeric input is routed to the "Invalid choice" message instead of raising ValueError.
try:
    choice = int(input())
except ValueError:
    choice = None

if choice in (1, 2):
    print(f"\nThis dataset has {len(data[0]) - 1} features (not including class attribute), with {len(data)} instances.\n")
    # 1 -> forward selection, 2 -> backward elimination.
    run_search = forward_selection if choice == 1 else backward_elimination
    run_search(data, choice)
else:
    print("Invalid choice. Please choose either 1 or 2!")

Welcome to the Feature Selection Algorithm
Please choose an algorithm by entering:
1) Forward Selection
2) Backward Elimination
1

This dataset has 7 features (not including class attribute), with 3810 instances.

Using feature(s) [1], accuracy is 0.7992
Using feature(s) [2], accuracy is 0.8798
Using feature(s) [3], accuracy is 0.8866
Using feature(s) [4], accuracy is 0.5827
Using feature(s) [5], accuracy is 0.6903
Using feature(s) [6], accuracy is 0.8226
Using feature(s) [7], accuracy is 0.5591

On level 1, added feature 3 to selected set, accuracy is 0.8866
Selected set: [3]

Using feature(s) [3, 1], accuracy is 0.8785
Using feature(s) [3, 2], accuracy is 0.8919
Using feature(s) [3, 4], accuracy is 0.8900
Using feature(s) [3, 5], accuracy is 0.8848
Using feature(s) [3, 6], accuracy is 0.8848
Using feature(s) [3, 7], accuracy is 0.8890

On level 2, added feature 2 to selected set, accuracy is 0.8919
Selected set: [3, 2]

Using feature(s) [3, 2, 1], accuracy is 0.8932
Using feature(s) 

In [39]:
print("Welcome to the Feature Selection Algorithm")

data = normalized_data1

print("Please choose an algorithm by entering:")
print("1) Forward Selection")
print("2) Backward Elimination")

# Non-numeric input is routed to the "Invalid choice" message instead of raising ValueError.
try:
    choice = int(input())
except ValueError:
    choice = None

if choice in (1, 2):
    print(f"\nThis dataset has {len(data[0]) - 1} features (not including class attribute), with {len(data)} instances.\n")
    # 1 -> forward selection, 2 -> backward elimination.
    run_search = forward_selection if choice == 1 else backward_elimination
    run_search(data, choice)
else:
    print("Invalid choice. Please choose either 1 or 2!")

Welcome to the Feature Selection Algorithm
Please choose an algorithm by entering:
1) Forward Selection
2) Backward Elimination
2

This dataset has 7 features (not including class attribute), with 3810 instances.


On level 1, removed feature 7 from current set, accuracy is 0.8953
Feature set [1, 2, 3, 4, 5, 6] was best, accuracy is 0.8953 


On level 2, removed feature 5 from current set, accuracy is 0.8990
Feature set [1, 2, 3, 4, 6] was best, accuracy is 0.8990 


On level 3, removed feature 6 from current set, accuracy is 0.8969
Feature set [1, 2, 3, 4] was best, accuracy is 0.8969 


On level 4, removed feature 3 from current set, accuracy is 0.8955
Feature set [1, 2, 4] was best, accuracy is 0.8955 


On level 5, removed feature 1 from current set, accuracy is 0.8853
Feature set [2, 4] was best, accuracy is 0.8853 


On level 6, removed feature 4 from current set, accuracy is 0.8798
Feature set [2] was best, accuracy is 0.8798 

Finished search!! The best feature subset is [1, 2,