In [37]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

In [38]:
# Location of the raw penguin measurements (Colab upload path).
DATA_PATH = '/content/penguins_size.csv'

# Read the CSV into the working frame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [39]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


In [40]:
# Dtypes and non-null counts per column; used to see which columns have
# missing values before imputing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [41]:
# Impute missing measurements with each column's own mean.
# The result is assigned back to `df` instead of calling
# `fillna(inplace=True)` on a column selection: that chained in-place form
# acts on an intermediate object and does not reliably update `df`
# (it warns in recent pandas and is a no-op under Copy-on-Write).
numeric_cols = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

In [42]:
# Display the frame after imputation: rows that had no measurements now
# carry the column means, while `sex` is still missing for them.
df

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.10000,18.70000,181.000000,3750.000000,MALE
1,Adelie,Torgersen,39.50000,17.40000,186.000000,3800.000000,FEMALE
2,Adelie,Torgersen,40.30000,18.00000,195.000000,3250.000000,FEMALE
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,
4,Adelie,Torgersen,36.70000,19.30000,193.000000,3450.000000,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,43.92193,17.15117,200.915205,4201.754386,
340,Gentoo,Biscoe,46.80000,14.30000,215.000000,4850.000000,FEMALE
341,Gentoo,Biscoe,50.40000,15.70000,222.000000,5750.000000,MALE
342,Gentoo,Biscoe,45.20000,14.80000,212.000000,5200.000000,FEMALE


In [43]:
# Show the rows whose `sex` field holds the placeholder '.' instead of a
# real category.
df.loc[df['sex'].eq('.')]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.


In [44]:
# Replace the '.' placeholder in `sex` with 'FEMALE' (the notebook's chosen
# assumption). Rows are selected by value rather than by a hard-coded row
# label, so the fix still targets the right rows if the file is reordered or
# contains more than one such entry.
df.loc[df['sex'] == '.', 'sex'] = 'FEMALE'

In [45]:
# Integer codes for the categorical columns.
category_codes = {
    "species": {'Adelie': 0, 'Gentoo': 1, 'Chinstrap': 2},
    "island": {'Biscoe': 0, 'Dream': 1, 'Torgersen': 2},
    "sex": {'MALE': 0, 'FEMALE': 1},
}

# Encode each column and assign the result back to `df`. The previous
# `df[col].replace(..., inplace=True)` form mutates an intermediate object
# and does not reliably update `df` in recent pandas.
# Values without a code (NaN, or already-encoded numbers when the cell is
# re-run) pass through unchanged, so the cell is safe to execute twice.
# The '.' placeholder in `sex` is handled by the preceding cell, so the
# duplicated fix that used to open this cell is not repeated here.
for column, codes in category_codes.items():
    df[column] = df[column].map(lambda value, codes=codes: codes.get(value, value))

In [46]:
# Display the frame after encoding: species/island are integer codes and
# `sex` is 0.0/1.0 with NaN where it was missing.
df

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.10000,18.70000,181.000000,3750.000000,0.0
1,0,2,39.50000,17.40000,186.000000,3800.000000,1.0
2,0,2,40.30000,18.00000,195.000000,3250.000000,1.0
3,0,2,43.92193,17.15117,200.915205,4201.754386,
4,0,2,36.70000,19.30000,193.000000,3450.000000,1.0
...,...,...,...,...,...,...,...
339,1,0,43.92193,17.15117,200.915205,4201.754386,
340,1,0,46.80000,14.30000,215.000000,4850.000000,1.0
341,1,0,50.40000,15.70000,222.000000,5750.000000,0.0
342,1,0,45.20000,14.80000,212.000000,5200.000000,1.0


In [47]:
# Set the already-encoded categorical columns aside; only the continuous
# measurements are standardised.
df_to_be_scaled = df.drop(columns=['island', 'sex'])

# Label to predict, and the numeric feature columns that remain.
target = df_to_be_scaled['species']
df_feat = df_to_be_scaled.drop(columns='species')

In [56]:
# Standardise the numeric features to zero mean / unit variance.
# NOTE(review): the scaler is fit on every row before the train/test split
# performed in a later cell, so held-out rows contribute to the scaling
# statistics; consider fitting on the training portion only.
scaler = StandardScaler()

# Rebuild a DataFrame with the original column names AND index. Keeping the
# index makes the concat below align rows by label instead of relying on
# `df` happening to have a default 0..n-1 index.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_feat),
    columns=df_feat.columns,
    index=df_feat.index,
)

# Re-attach the encoded categorical columns and the target.
df_preprocessed = pd.concat([df_scaled, df['island'], df['sex'], target], axis=1)
df_preprocessed.head()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,island,sex,species
0,-0.8870812,0.7877425,-1.422488,-0.565789,2,0.0,0
1,-0.813494,0.1265563,-1.065352,-0.503168,2,1.0,0
2,-0.6663195,0.4317192,-0.422507,-1.192003,2,1.0,0
3,1.307172e-15,1.806927e-15,0.0,0.0,2,,0
4,-1.328605,1.092905,-0.565361,-0.941517,2,1.0,0


In [58]:
# Replace every remaining NaN with 0. At this point only `sex` still has
# missing values, and 0 is the code used for 'MALE', so unknown sex is
# effectively treated as male.
dfnonan = df_preprocessed.fillna(value=0)

In [59]:
# Sanity check: True if any cell is still missing after the fill.
dfnonan.isna().to_numpy().any()

False

In [60]:
# Feature matrix and label vector.
x = dfnonan.drop(columns='species')
y = dfnonan['species']

# First split: hold out 10% of the rows (X_validate / Y_validate).
X_train1, X_validate, Y_train1, Y_validate = train_test_split(
    x, y, test_size=0.1, random_state=2
)

# Second split: 0.1111 of the remaining 90% is roughly another 10% of the
# full data set (X_test / Y_test); the rest is the training set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train1, Y_train1, test_size=0.1111, random_state=2
)

In [61]:
# Inspect the training features (scaled measurements plus encoded island/sex).
X_train

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,island,sex
250,0.621457,-0.941514,1.506028,1.312851,0,0.0
303,1.118171,-0.636351,1.648883,1.438093,0,0.0
326,-0.408764,-1.246677,0.648902,0.624016,0,1.0
21,-1.144637,0.787743,-1.493915,-0.753653,0,0.0
123,-0.463955,0.686022,0.077484,-0.409236,2,0.0
...,...,...,...,...,...,...
338,0.603060,-1.755281,0.934611,0.905812,0,1.0
29,-0.629526,0.889463,-1.493915,-0.315304,0,0.0
126,-0.942272,0.228277,-0.708216,-1.160692,2,1.0
12,-0.519145,0.228277,-1.351061,-1.254624,2,1.0


In [62]:
# Confirm the training set size, dtypes and that no column has nulls.
X_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 274 entries, 250 to 197
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   culmen_length_mm   274 non-null    float64
 1   culmen_depth_mm    274 non-null    float64
 2   flipper_length_mm  274 non-null    float64
 3   body_mass_g        274 non-null    float64
 4   island             274 non-null    int64  
 5   sex                274 non-null    float64
dtypes: float64(5), int64(1)
memory usage: 15.0 KB


In [125]:
# Baseline k-nearest-neighbours classifier with default hyper-parameters,
# trained on the training split.
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, Y_train)


In [126]:
# Tune `n_neighbors` with 10-fold cross-validation on the training split.
# A fresh, unfitted estimator is handed to GridSearchCV (previously `knn`
# was created but unused and an already-fitted model was passed instead;
# GridSearchCV clones the estimator for every fit, so results are unchanged).
knn = KNeighborsClassifier()

# Candidate neighbourhood sizes: 1 through 69.
k_range = list(range(1, 70))
param_grid = dict(n_neighbors=k_range)

grid = GridSearchCV(
    knn,
    param_grid,
    cv=10,
    scoring='accuracy',
    return_train_score=False,
    verbose=1,
)

# fit() returns the fitted search object itself.
grid_search = grid.fit(X_train, Y_train)

Fitting 10 folds for each of 69 candidates, totalling 690 fits


In [127]:
# Hyper-parameter combination with the highest cross-validated accuracy.
print(grid_search.best_params_)

{'n_neighbors': 5}


In [128]:
# `best_score_` is the mean cross-validated accuracy of the best parameter
# setting (scored on the held-out folds), not accuracy on the training data,
# so the message is labelled accordingly.
accuracy = grid_search.best_score_ * 100
print("Mean cross-validated accuracy of the best model is : {:.2f}%".format(accuracy))

Accuracy for our training dataset with tuning is : 99.27%


In [134]:
# Build the final model from the neighbourhood size selected by the grid
# search instead of a hard-coded value, so this cell stays consistent if the
# search result changes.
best_k = grid_search.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k)

# Refit on the full 90% portion (training rows plus the X_test rows) ...
knn.fit(X_train1, Y_train1)

# ... and score on the 10% hold-out that the grid search never saw.
y_test_hat = knn.predict(X_validate)
test_accuracy = accuracy_score(Y_validate, y_test_hat) * 100

print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(test_accuracy))

Accuracy for our testing dataset with tuning is : 97.14%


# **Conclusion**
Se obtiene un accuracy del 97.14 %, pero fue necesario asumir muchos datos que el dataset no tenía, como valores NaN o valores que no deberían estar en él.