In [26]:
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_regression, RFE, mutual_info_classif
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

**Feature selection for penguin classification**

In [27]:
# Open the SQLite database that holds the penguins table.
conn = sqlite3.connect("penguins.db")

# Read the full penguins table into a DataFrame.
# try/finally guarantees the connection is closed even when the query fails
# (e.g. the table is missing), so the database file handle is never leaked.
try:
    df = pd.read_sql("SELECT * FROM penguins", conn)
finally:
    conn.close()

# Quick look at the first rows to confirm the load worked.
print(df.head())


   animal_id species  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0          1  Adelie            39.1           18.7              181.0   
1          2  Adelie            39.5           17.4              186.0   
2          3  Adelie            40.3           18.0              195.0   
3          4  Adelie            36.7           19.3              193.0   
4          5  Adelie            39.3           20.6              190.0   

   body_mass_g     sex  island_id  
0       3750.0    Male          1  
1       3800.0  Female          1  
2       3250.0  Female          1  
3       3450.0  Female          1  
4       3650.0    Male          1  


In [28]:
# Remove columns that are not used as predictors.
# 'animal_id' is a unique row identifier; 'sex' may be relevant but is left out for now.
# errors="ignore" keeps this cell idempotent: because `df` is reassigned here,
# a second run would otherwise raise KeyError on the already-dropped columns.
df = df.drop(columns=["animal_id", "sex"], errors="ignore")

# Split features and target variable
X = df.drop(columns=["species"])  # Features
y = df["species"].astype("category").cat.codes  # Encode the species labels as integer category codes

# Split the dataset into training and testing parts.
# test_size=0.2 holds out 20% of the rows for testing; random_state=42 makes
# the split identical on every run so results are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (mean 0, std 1) because the columns live on very
# different numeric scales. The scaler is fit on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

**Filter method (mutual information)**

In [29]:
# Filter method: score every feature by its mutual information (MI) with the target.
# k='all' means nothing is discarded here; SelectKBest is only used to compute the scores.
# mutual_info_classif is a randomized estimator, so random_state is pinned to make
# the scores (and the top-3 ranking derived from them later) reproducible across runs.
selector = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k='all',
)
selector.fit(X_train_scaled, y_train)

# High MI score --> the feature carries a lot of information about the species.
# Low MI score  --> the feature contributes little to the prediction.

# One score per feature, labelled with the feature names and sorted from most
# to least informative (sort_values returns a new Series; no in-place mutation).
mi_scores = pd.Series(selector.scores_, index=X.columns).sort_values(ascending=False)

print("Mutual Information Scores (Filter Method):")
print(mi_scores)

Mutual Information Scores (Filter Method):
bill_length_mm       0.743185
flipper_length_mm    0.657526
bill_depth_mm        0.617839
body_mass_g          0.597146
island_id            0.530223
dtype: float64


**EMBEDDED METHOD (Random Forest Feature Importance)**

In [30]:
# Embedded method: fit a random forest of 100 trees with a fixed seed.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train_scaled, y_train)

# Feature importance from the random forest: one score per feature, labelled
# with the feature names and ordered from highest to lowest.
rf_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importance = rf_importances.sort_values(ascending=False)

print("Random Forest Feature Importance (Embedded Method):")
print(feature_importance)

Random Forest Feature Importance (Embedded Method):
bill_length_mm       0.347782
flipper_length_mm    0.261785
bill_depth_mm        0.176183
island_id            0.145641
body_mass_g          0.068610
dtype: float64


**PERMUTATION IMPORTANCE**

In [31]:
# Permutation importance measures how much each feature matters for the model's accuracy:
# one feature at a time is shuffled on the test split and the drop in accuracy is recorded.
# - An unimportant feature can be shuffled without hurting performance.
# - An important feature makes accuracy fall when shuffled.
# The shuffling is random, so random_state is pinned; otherwise the scores, and
# which features land in the top 3, can change from run to run.
# NOTE: `model` must be the random forest fitted in the previous cell; the name is
# rebound to the final classifier further down, so run the cells in order.
perm_importance = permutation_importance(
    model, X_test_scaled, y_test, scoring="accuracy", random_state=42
)

# Mean accuracy drop per feature, labelled with the feature names and sorted
# from most to least important.
perm_scores = pd.Series(perm_importance.importances_mean, index=X.columns).sort_values(ascending=False)

print("Permutation Importance Scores:")
print(perm_scores)

Permutation Importance Scores:
bill_length_mm       0.157303
island_id            0.117603
bill_depth_mm        0.083146
flipper_length_mm    0.070412
body_mass_g          0.004494
dtype: float64


**WRAPPER METHOD (RFE - Recursive Feature Elimination)**

In [32]:
# Wrapper method: recursive feature elimination driven by a logistic regression,
# stopping once three features remain.
rfe_model = LogisticRegression()
rfe = RFE(rfe_model, n_features_to_select=3).fit(X_train_scaled, y_train)

# Keep the names of the features that RFE marked as selected.
rfe_features = [column for column, selected in zip(X.columns, rfe.support_) if selected]
print("RFE Features:", rfe_features)


RFE Features: ['bill_length_mm', 'bill_depth_mm', 'body_mass_g']


In [33]:
# Names of all candidate features, in the column order of X
features = list(X.columns)

# Helper that marks whether a feature was selected by a given method
def check_top_features(feature, method_top_features):
    """Return "✅" when `feature` is contained in `method_top_features`, otherwise "❌"."""
    if feature in method_top_features:
        return "✅"
    return "❌"

# Voting parameters, named so the rule is visible and easy to tune.
top_k = 3       # number of top-ranked features each score-based method votes for
min_votes = 2   # a feature is kept when at least this many methods selected it

# Features selected by each method, computed once up front instead of being
# rebuilt for every row of the table.
method_picks = {
    "Mutual Info": mi_scores.head(top_k).index.tolist(),
    "RFE": rfe_features,
    "Random Forest": feature_importance.head(top_k).index.tolist(),
    "Permutation": perm_scores.head(top_k).index.tolist(),
}

# Build the table: one row per feature, one ✅/❌ column per method.
feature_table = pd.DataFrame({
    "Feature": features,
    **{
        method: [check_top_features(f, picked) for f in features]
        for method, picked in method_picks.items()
    },
})

# Decide whether to keep or remove a feature: count the ✅ marks per row and
# keep the feature when at least `min_votes` methods selected it.
# The column label keeps its leading space so the printed table and any lookup
# by the existing label are unchanged.
votes = feature_table[list(method_picks)].eq("✅").sum(axis=1)
feature_table[" Keep/Remove?"] = votes.ge(min_votes).map({True: "Keep ✅", False: "Remove ❌"})

# Print the table
print("Feature Selection Results")
print(feature_table.to_string(index=False))


Feature Selection Results
          Feature Mutual Info RFE Random Forest Permutation  Keep/Remove?
   bill_length_mm           ✅   ✅             ✅           ✅        Keep ✅
    bill_depth_mm           ✅   ✅             ✅           ✅        Keep ✅
flipper_length_mm           ✅   ❌             ✅           ❌        Keep ✅
      body_mass_g           ❌   ✅             ❌           ❌      Remove ❌
        island_id           ❌   ❌             ❌           ✅      Remove ❌


In [34]:
# body_mass_g and island_id are removed: each was selected by only one method.
final_features = [
    "bill_length_mm",
    "flipper_length_mm",
    "bill_depth_mm",
]
print("Final features", final_features, sep="\n")


Final features
['bill_length_mm', 'flipper_length_mm', 'bill_depth_mm']


**Train the model**

In [None]:
# Restrict both splits to the selected features.
X_train_final = X_train[final_features]
X_test_final = X_test[final_features]

# Logistic regression, because this is a classification problem.
# Every feature-selection step above was run on standardized data, but these
# frames hold the raw measurements, and the earlier `scaler` was fit on all
# original columns so it cannot be reused on this subset. Wrapping a fresh
# StandardScaler and the classifier in one pipeline keeps training consistent
# with the selection steps and makes the fitted (and later saved) estimator
# apply the same scaling to any data it predicts on.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train_final, y_train)

# Predict on the held-out test data
y_pred = model.predict(X_test_final)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {accuracy:.2f}")


Final Model Accuracy: 1.00


In [36]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the final model on the training split;
# report the mean accuracy and its spread across folds.
cv_scores = cross_val_score(estimator=model, X=X_train_final, y=y_train, cv=5)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"Cross-Validation Accuracy: {cv_mean:.2f} ± {cv_std:.2f}")


Cross-Validation Accuracy: 0.99 ± 0.00


In [37]:
from joblib import dump

# Persist the fitted model to disk so it can be loaded without retraining.
dump(value=model, filename="penguin_classifier.joblib")
print("Model saved as 'penguin_classifier.joblib'")


Model saved as 'penguin_classifier.joblib'
