In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from pathlib import Path 
import pandas as pd
# Directory that receives every artifact written by this notebook;
# created up front (including parents) so later cells can write into it.
output_dir = Path("../output/penguins")
output_dir.mkdir(exist_ok=True, parents=True)

# Name of the file to read from the Kaggle dataset.
file_path = "penguins.csv"

# Load the file from the "larsen0966/penguins" dataset as a pandas DataFrame.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "larsen0966/penguins",
    file_path,
)

# Preview the first rows.
df.head()

Unnamed: 0.1,Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,4,Adelie,Torgersen,,,,,,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [2]:
# Column roles used by the modelling cell.
# NOTE: "year" is deliberately listed as categorical here (it is one-hot
# encoded downstream rather than scaled).
cat_cols = ["island", "sex", "year"]
num_cols = ["bill_depth_mm", "bill_length_mm", "flipper_length_mm", "body_mass_g"]
target_col = "species"

# "Unnamed: 0" appears to be a plain row counter (1, 2, 3, ... in the head()
# preview), so it is removed before modelling.
index_col = "Unnamed: 0"

# Rebind instead of using inplace=True, and ignore a missing column, so that
# re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=[index_col], errors="ignore")

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Train a logistic-regression classifier for `target_col` and report metrics
# on a 20% hold-out split. Results are written to
# output_dir / "validation_metrics.csv" and displayed as the cell output.

X = df.drop(columns=[target_col])
y = df[target_col]

# Split the RAW rows before any preprocessing is fitted. Fitting the imputer,
# scaler and encoder on the full dataset would let validation rows influence
# the means/variances/categories used to transform the training data
# (preprocessing leakage) and make the validation metrics optimistic.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessors are fitted on the training rows only.
imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

imputer.fit(X_train_raw[num_cols])
scaler.fit(imputer.transform(X_train_raw[num_cols]))
encoder.fit(X_train_raw[cat_cols])


def encode_features(features):
    """Apply the already-fitted imputer, scaler and encoder to `features`.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame containing at least the columns in `num_cols` and `cat_cols`.

    Returns
    -------
    pandas.DataFrame
        Mean-imputed, standardised numeric columns followed by the one-hot
        encoded categorical columns. The result keeps `features.index`, so it
        stays aligned with the matching target rows.
    """
    numeric_part = pd.DataFrame(
        scaler.transform(imputer.transform(features[num_cols])),
        columns=num_cols,
        index=features.index,
    )
    categorical_part = pd.DataFrame(
        encoder.transform(features[cat_cols]),
        columns=encoder.get_feature_names_out(cat_cols),
        index=features.index,
    )
    return pd.concat([numeric_part, categorical_part], axis=1)


X_train = encode_features(X_train_raw)
X_val = encode_features(X_val_raw)
# Full encoded matrix, kept for downstream cells that reference `X_encoded`.
# It is produced with the training-fitted transformers (transform only).
X_encoded = encode_features(X)

model = LogisticRegression(max_iter=4000)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# Weighted averaging accounts for the per-class support in the hold-out set.
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average='weighted')
recall = recall_score(y_val, y_pred, average='weighted')
f1 = f1_score(y_val, y_pred, average='weighted')

metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Value': [accuracy, precision, recall, f1]
})
metrics_df.to_csv(output_dir / "validation_metrics.csv", index=False)
metrics_df



Unnamed: 0,Metric,Value
0,Accuracy,1.0
1,Precision,1.0
2,Recall,1.0
3,F1 Score,1.0




Unnamed: 0,Parameter,Value
0,C,1
1,solver,liblinear


In [5]:
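# Optional: build a ydata-profiling HTML report for the dataset.
# Left disabled (commented out); uncomment the lines below to generate it.
# from ydata_profiling import ProfileReport
# profile = ProfileReport(df, title="Penguins Dataset Profiling Report")
# profile.to_file(str(output_dir / "penguins_profile_report.html"))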