### Imports

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [45]:
columns = ["island", "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "sex", "species"]
penguins_df = pd.read_csv("data/penguins.csv", names=columns)
tmp = pd.read_csv("data/penguins.csv", names=columns)

penguins_df info:  
• island: island name (Dream, Torgersen, or Biscoe) in the Palmer Archipelago (Antarctica)  
• bill _length_mm: bill length (mm)  
• bill_depth_mm: bill depth (mm)  
• flipper_length_mm: flipper length (mm)  
• body_mass_g: body mass (g)  
• sex: penguin sex  
• species: penguin species  

In [46]:
penguins_df

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species
0,Torgersen,39.1,18.7,181.0,3750.0,male,Adelie
1,Torgersen,39.5,17.4,186.0,3800.0,female,Adelie
2,Torgersen,40.3,18.0,195.0,3250.0,female,Adelie
3,Torgersen,,,,,,Adelie
4,Torgersen,36.7,19.3,193.0,3450.0,female,Adelie
...,...,...,...,...,...,...,...
339,Dream,55.8,19.8,207.0,4000.0,male,Chinstrap
340,Dream,43.5,18.1,202.0,3400.0,female,Chinstrap
341,Dream,49.6,18.2,193.0,3775.0,male,Chinstrap
342,Dream,50.8,19.0,210.0,4100.0,male,Chinstrap


In [47]:
penguins_df.shape

(344, 7)

### Removing Nas

In [48]:
penguins_df.isna().sum()

island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
species               0
dtype: int64

In [49]:
before = len(penguins_df)

clean_df = penguins_df.dropna()

after = len(clean_df)

print(f"Rows before dropna: {before}")
print(f"Rows after dropna: {after}")


Rows before dropna: 344
Rows after dropna: 333


In [50]:
clean_df.isna().sum()

island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
species              0
dtype: int64

### Filling Nas

In [51]:
from sklearn.impute import SimpleImputer


num_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
num_imputer = SimpleImputer(strategy='mean')
penguins_df[num_cols] = num_imputer.fit_transform(penguins_df[num_cols])


cat_cols = ['sex', 'island', 'species']
cat_imputer = SimpleImputer(strategy='most_frequent')
penguins_df[cat_cols] = cat_imputer.fit_transform(penguins_df[cat_cols])


In [40]:
penguins_df.isna().sum()

island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
species              0
dtype: int64

In [44]:
na_rows = tmp[tmp.isna().any(axis=1)]
na_rows

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species
3,Torgersen,,,,,,Adelie
8,Torgersen,34.1,18.1,193.0,3475.0,,Adelie
9,Torgersen,42.0,20.2,190.0,4250.0,,Adelie
10,Torgersen,37.8,17.1,186.0,3300.0,,Adelie
11,Torgersen,37.8,17.3,180.0,3700.0,,Adelie
47,Dream,37.5,18.9,179.0,2975.0,,Adelie
178,Biscoe,44.5,14.3,216.0,4100.0,,Gentoo
218,Biscoe,46.2,14.4,214.0,4650.0,,Gentoo
256,Biscoe,47.3,13.8,216.0,4725.0,,Gentoo
268,Biscoe,44.5,15.7,217.0,4875.0,,Gentoo


In [53]:
penguins_df.iloc[na_rows.index]
# all filled

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species
3,Torgersen,43.92193,17.15117,200.915205,4201.754386,male,Adelie
8,Torgersen,34.1,18.1,193.0,3475.0,male,Adelie
9,Torgersen,42.0,20.2,190.0,4250.0,male,Adelie
10,Torgersen,37.8,17.1,186.0,3300.0,male,Adelie
11,Torgersen,37.8,17.3,180.0,3700.0,male,Adelie
47,Dream,37.5,18.9,179.0,2975.0,male,Adelie
178,Biscoe,44.5,14.3,216.0,4100.0,male,Gentoo
218,Biscoe,46.2,14.4,214.0,4650.0,male,Gentoo
256,Biscoe,47.3,13.8,216.0,4725.0,male,Gentoo
268,Biscoe,44.5,15.7,217.0,4875.0,male,Gentoo


### Encoding

In [57]:
from sklearn.preprocessing import LabelEncoder

encodes = penguins_df.copy()

# im doing some zip, love the approach XD
island_encoder = LabelEncoder()
encodes['island'] = island_encoder.fit_transform(df['island'])
print("Island mapping:", dict(zip(island_encoder.classes_, island_encoder.transform(island_encoder.classes_))))


sex_encoder = LabelEncoder()
encodes['sex'] = sex_encoder.fit_transform(encodes['sex'])
print("Sex mapping:", dict(zip(sex_encoder.classes_, sex_encoder.transform(sex_encoder.classes_))))


species_encoder = LabelEncoder()
encodes['species'] = species_encoder.fit_transform(encodes['species'])
print("Species mapping:", dict(zip(species_encoder.classes_, species_encoder.transform(species_encoder.classes_))))


Island mapping: {np.int64(0): np.int64(0), np.int64(1): np.int64(1), np.int64(2): np.int64(2)}
Sex mapping: {'female': np.int64(0), 'male': np.int64(1)}
Species mapping: {'Adelie': np.int64(0), 'Chinstrap': np.int64(1), 'Gentoo': np.int64(2)}


In [None]:
encodes.head()

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species
0,2,39.1,18.7,181.0,3750.0,1,0
1,2,39.5,17.4,186.0,3800.0,0,0
2,2,40.3,18.0,195.0,3250.0,0,0
3,2,43.92193,17.15117,200.915205,4201.754386,1,0
4,2,36.7,19.3,193.0,3450.0,0,0


In [60]:
encodes.dtypes
# all encoded so check

island                 int64
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                    int64
species                int64
dtype: object

### Norming

In [62]:
from sklearn.preprocessing import StandardScaler

numeric_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

print("\nBefore norming:")
print(encodes[numeric_cols].agg(['mean', 'std']))


scaler = StandardScaler()
encodes[numeric_cols] = scaler.fit_transform(encodes[numeric_cols])


print("\nafter norming:")
print(encodes[numeric_cols].agg(['mean', 'std']))



Before norming:
      bill_length_mm  bill_depth_mm  flipper_length_mm   body_mass_g
mean   -1.156697e-15   4.131062e-16      -8.262125e-16  8.262125e-17
std     1.001457e+00   1.001457e+00       1.001457e+00  1.001457e+00

after norming:
      bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g
mean        0.000000  -4.131062e-17       4.131062e-17     0.000000
std         1.001457   1.001457e+00       1.001457e+00     1.001457
