In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Load the Iris dataset.
# NOTE: despite its name, `url` holds a relative file path, so "Iris.csv" is
# resolved against the notebook's current working directory.
url = "Iris.csv"
data = pd.read_csv(url)

# Display the first few rows of the dataset
print("Original Dataset:")
print(data.head())

# Drop irrelevant features based on business logic/common sense.
# For simplicity, keep only 'SepalLengthCm', 'SepalWidthCm' and 'PetalLengthCm'
# as predictors, plus the 'Species' target column.
selected_features = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'Species']
# Take an explicit copy: a column subset is still tracked by pandas as derived
# from the parent frame, so the later assignment to data['Species'] would
# otherwise emit SettingWithCopyWarning.
data = data[selected_features].copy()

# Display the modified dataset (Species is still the original string label here)
print("\nModified Dataset:")
print(data.head())

# Preprocess the data: label-encode the target so every species name becomes
# an integer class id. `le` keeps the fitted name <-> id mapping.
le = LabelEncoder()
data['Species'] = le.fit_transform(data['Species'])

# Split the data into features (X) and target variable (y)
X = data.drop(columns='Species')
y = data['Species']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Compare three feature-selection strategies, each keeping two features.

# Method 1: univariate selection -- SelectKBest scored with the ANOVA F-statistic.
# The selector is fitted on the training split only and then applied to both splits.
selector1 = SelectKBest(score_func=f_classif, k=2)
selector1.fit(X_train, y_train)
X_train_kbest = selector1.transform(X_train)
X_test_kbest = selector1.transform(X_test)

# Method 2: Recursive Feature Elimination (RFE) driven by a RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
selector2 = RFE(estimator=rf_classifier, n_features_to_select=2)
selector2.fit(X_train, y_train)
X_train_rfe = selector2.transform(X_train)
X_test_rfe = selector2.transform(X_test)

# Method 3: rank features by the forest's own importances and keep the top two
rf_classifier.fit(X_train, y_train)
feature_importances = pd.Series(
    rf_classifier.feature_importances_,
    index=X_train.columns,
)
feature_importances = feature_importances.sort_values(ascending=False)
selected_features_rf = feature_importances.head(2).index
X_train_rf = X_train[selected_features_rf]
X_test_rf = X_test[selected_features_rf]

# Report which columns each method kept
print(
    "\nSelected Features (SelectKBest):",
    X_train.columns[selector1.get_support()],
)
print(
    "Selected Features (RFE):",
    X_train.columns[selector2.get_support()],
)
print("Selected Features (RandomForest):", selected_features_rf)


Original Dataset:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm FlowerColour  \
0   1            5.1           3.5            1.4           0.2       Purple   
1   2            4.9           3.0            1.4           0.2       Orange   
2   3            4.7           3.2            1.3           0.2        Black   
3   4            4.6           3.1            1.5           0.2        White   
4   5            5.0           3.6            1.4           0.2         Teal   

   YearCollected  MonthCollected  StigmaLegnth      Species  
0           2003               2             2  Iris-setosa  
1           1998               9             1  Iris-setosa  
2           1995               5             3  Iris-setosa  
3           2008               3             3  Iris-setosa  
4           2007               9             1  Iris-setosa  

Modified Dataset:
   SepalLengthCm  SepalWidthCm  PetalLengthCm      Species
0            5.1           3.5            1.4  Iris