In [16]:
import numpy as np
import pandas as pd
# import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio


In [17]:
# Load the raw penguin measurements into ``df``.
# The Windows path is written as a raw string so that its backslashes are
# never parsed as escape sequences (``\c``, ``\d`` and ``\p`` are invalid
# escapes and trigger a warning on recent Python versions).
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory.
df = pd.read_csv(
    r'F:\clustering-classification-dashboard\data\penguins_size.csv')


In [18]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [19]:
# Dimensions of the frame as (rows, columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

(344, 7)

In [20]:
# Distinct labels present in the target column.
df['species'].unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [21]:
# Percentage of missing values in each column.
missing_counts = df.isna().sum()
missing_counts / len(df) * 100

species              0.000000
island               0.000000
culmen_length_mm     0.581395
culmen_depth_mm      0.581395
flipper_length_mm    0.581395
body_mass_g          0.581395
sex                  2.906977
dtype: float64

In [22]:
# Replace missing numeric measurements with the mean of their column.
filled_cols = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
for column_name in filled_cols:
    column_mean = df[column_name].mean()
    df[column_name] = df[column_name].fillna(column_mean)


In [23]:
# Fill missing ``sex`` entries with the most frequent label in the column.
most_common_sex = df['sex'].mode().iloc[0]
df['sex'] = df['sex'].fillna(most_common_sex)

In [24]:
# Re-check the percentage of missing values per column after imputation.
total_rows = df.shape[0]
df.isna().sum().div(total_rows).mul(100)


species              0.0
island               0.0
culmen_length_mm     0.0
culmen_depth_mm      0.0
flipper_length_mm    0.0
body_mass_g          0.0
sex                  0.0
dtype: float64

In [25]:
# Frequency of each label in the ``sex`` column.
sex_labels = df['sex']
sex_labels.value_counts()

MALE      178
FEMALE    165
.           1
Name: sex, dtype: int64

In [26]:
# Rows whose ``sex`` is the placeholder '.' are recoded as 'FEMALE'.
is_placeholder = df['sex'].eq('.')
df.loc[is_placeholder, 'sex'] = 'FEMALE'


In [27]:
# sns.countplot(data=df, x='species');

In [28]:
# sns.countplot(data=df, x='island');

In [29]:
# sns.countplot(data=df, x='sex');

In [30]:
# Persist the cleaned frame (without the index column).
# Raw string: the backslashes of the Windows path must not be parsed as
# escape sequences (``\c``, ``\d``, ``\p`` are invalid escapes).
df.to_csv(
    r'F:\clustering-classification-dashboard\data\penguins_cleaned.csv',
    index=False)

In [31]:
# Convert the species names into integer class labels.
# Note: running this cell a second time maps the already-encoded integers to
# NaN, because they are not keys of the dictionary.
species_codes = {'Adelie': 0, 'Gentoo': 1, 'Chinstrap': 2}
df['species'] = df['species'].map(species_codes)

In [32]:
# Show the first five rows after label encoding.
df.head(5)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,0,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,0,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,0,Torgersen,43.92193,17.15117,200.915205,4201.754386,MALE
4,0,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [33]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the nominal columns and replace them in ``df`` with the
# resulting indicator columns (named ``<column>_<category>``).
nominal_cols = ['sex', 'island']

# scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
# 1.4 removed the old name. Try the current keyword first and fall back to
# the legacy one so the cell runs on both old and new releases.
try:
    oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
oh_cols_np = oh_encoder.fit_transform(df[nominal_cols])

# Build one column name per (original column, category) pair, in the same
# order as the encoder's output columns.
columns = [
    nominal_col + '_' + str(category)
    for nominal_col, categories in zip(nominal_cols, oh_encoder.categories_)
    for category in categories
]

# The encoder returns a plain array, so the original index is re-attached
# here to keep the rows aligned for the concat below.
oh_cols_df = pd.DataFrame(oh_cols_np, columns=columns, index=df.index)

# Drop the original nominal columns
rest_df = df.drop(columns=nominal_cols)

# Append the one-hot encoded columns
df = pd.concat([rest_df, oh_cols_df], axis=1)


In [34]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified on the species label
# and seeded so the split is repeatable.
train_data, val_data = train_test_split(
    df,
    test_size=0.20,
    random_state=2022,
    stratify=df['species'],
)

In [35]:
# First five rows of the training split.
train_data.iloc[:5]

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex_FEMALE,sex_MALE,island_Biscoe,island_Dream,island_Torgersen
274,1,46.5,14.4,217.0,4900.0,1.0,0.0,1.0,0.0,0.0
246,1,44.5,14.3,216.0,4100.0,0.0,1.0,1.0,0.0,0.0
190,2,46.9,16.6,192.0,2700.0,1.0,0.0,0.0,1.0,0.0
13,0,38.6,21.2,191.0,3800.0,0.0,1.0,0.0,0.0,1.0
325,1,46.8,16.1,215.0,5500.0,0.0,1.0,1.0,0.0,0.0


In [36]:
# Class balance of the training split.
train_species = train_data['species']
train_species.value_counts()

0    122
1     99
2     54
Name: species, dtype: int64

In [37]:

# encoder=OneHotEncoder()

# enc_data = pd.DataFrame(encoder.fit_transform(
#     train_data[['sex', 'island']]).toarray())
# enc_data.index=train_data.index
# train_data = train_data.join(enc_data)

# train_data = train_data.drop(columns=['island', 'sex']).rename(columns={
#     0: 'SEX_MALE', 1: 'SEX_FEMALE', 2: 'island_Dream', 3: 'island_Biscoe', 4: 'island_Torgersen'})

# enc_val_data = pd.DataFrame(encoder.transform(
#     val_data[['sex', 'island']]).toarray())
# enc_val_data.index=val_data.index
# val_data = val_data.join(enc_val_data)
# val_data = val_data.drop(columns=['island', 'sex']).rename(columns={
#     0: 'SEX_MALE', 1: 'SEX_FEMALE', 2: 'island_Dream', 3: 'island_Biscoe', 4: 'island_Torgersen'})


In [38]:
# Write the unscaled train/validation splits to disk (index omitted).
for frame, csv_path in (
    (train_data,
     'F:\\clustering-classification-dashboard\\data\\train_data_without_anything.csv'),
    (val_data,
     'F:\\clustering-classification-dashboard\\data\\val_data_without_anything.csv'),
):
    frame.to_csv(csv_path, index=False)


In [39]:
# Independent copies of each split, one pair per scaler, so the unscaled
# ``train_data`` / ``val_data`` frames are left untouched.
train_data_min_max, train_data_std, train_data_robust = (
    train_data.copy() for _ in range(3)
)
val_data_min_max, val_data_std, val_data_robust = (
    val_data.copy() for _ in range(3)
)

In [40]:
from sklearn.preprocessing import MinMaxScaler

# Numeric measurement columns that the scalers operate on.
num_cols = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(train_data_min_max[num_cols])
train_data_min_max[num_cols] = min_max_scaler.transform(
    train_data_min_max[num_cols])
val_data_min_max[num_cols] = min_max_scaler.transform(
    val_data_min_max[num_cols])


In [41]:
# Write the min-max scaled splits to disk (index omitted).
min_max_train_path = 'F:\\clustering-classification-dashboard\\data\\train_data_with_min_max_scaler.csv'
min_max_val_path = 'F:\\clustering-classification-dashboard\\data\\val_data_with_min_max_scaler.csv'
train_data_min_max.to_csv(min_max_train_path, index=False)
val_data_min_max.to_csv(min_max_val_path, index=False)


In [42]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric columns: statistics are learned from the training
# split and then reused unchanged for the validation split.
std_scaler = StandardScaler()
train_scaled = std_scaler.fit_transform(train_data_std[num_cols])
val_scaled = std_scaler.transform(val_data_std[num_cols])
train_data_std[num_cols] = train_scaled
val_data_std[num_cols] = val_scaled


In [43]:
# Write the standardized splits to disk (index omitted).
std_outputs = {
    'F:\\clustering-classification-dashboard\\data\\train_data_with_std_scaler.csv': train_data_std,
    'F:\\clustering-classification-dashboard\\data\\val_data_with_std_scaler.csv': val_data_std,
}
for csv_path, frame in std_outputs.items():
    frame.to_csv(csv_path, index=False)


In [44]:

from sklearn.preprocessing import RobustScaler

# Robust-scale the numeric columns. The scaler is fitted on the training
# split only; the validation split must reuse those fitted statistics via
# ``transform``. Calling ``fit_transform`` on the validation data would refit
# the scaler on it, putting the two splits on different scales and leaking
# validation statistics.
scaler = RobustScaler()
train_data_robust[num_cols] = scaler.fit_transform(train_data_robust[num_cols])
val_data_robust[num_cols] = scaler.transform(val_data_robust[num_cols])

In [45]:
# Write the robust-scaled splits to disk (index omitted).
train_data_robust.to_csv(
    path_or_buf='F:\\clustering-classification-dashboard\\data\\train_data_with_robust_scaler.csv',
    index=False,
)
val_data_robust.to_csv(
    path_or_buf='F:\\clustering-classification-dashboard\\data\\val_data_with_robust_scaler.csv',
    index=False,
)


In [46]:
# Separate the feature columns from the ``species`` target for each split.
X_train, y_train = train_data.drop(columns=['species']), train_data['species']
X_val, y_val = val_data.drop(columns=['species']), val_data['species']


In [47]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes of the training split only.
# SMOTE generates synthetic samples at random, so it is seeded (with the same
# seed as the train/validation split) to make the result reproducible across
# kernel restarts.
# NOTE(review): SMOTE interpolates between samples, so the one-hot encoded
# columns of synthetic rows may hold fractional values; SMOTENC may be a
# better fit here. To be confirmed.
oversampling = SMOTE(random_state=2022)
X_train, y_train = oversampling.fit_resample(X_train, y_train)

In [48]:
# Class balance of the training target after oversampling.
class_counts = y_train.value_counts()
class_counts

1    122
2    122
0    122
Name: species, dtype: int64

In [49]:
# First five rows of the validation features.
X_val.head(n=5)

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex_FEMALE,sex_MALE,island_Biscoe,island_Dream,island_Torgersen
41,40.8,18.4,195.0,3900.0,0.0,1.0,0.0,1.0,0.0
88,38.3,19.2,189.0,3950.0,0.0,1.0,0.0,1.0,0.0
94,36.2,17.3,187.0,3300.0,1.0,0.0,0.0,1.0,0.0
33,40.9,18.9,184.0,3900.0,0.0,1.0,0.0,1.0,0.0
194,50.9,19.1,196.0,3550.0,0.0,1.0,0.0,1.0,0.0
