<a href="https://colab.research.google.com/github/Fikaaw/amazing-feat-eng/blob/main/classification_penguin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
import seaborn as sns
# Load seaborn's "penguins" example dataset; this is the raw frame explored below.
penguin = sns.load_dataset(name='penguins')

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple.
penguin.shape

In [None]:
# Peek at two randomly chosen rows. The fixed seed keeps the displayed rows
# identical on every Restart & Run All instead of changing per execution.
penguin.sample(n=2, random_state=0)

In [None]:
# Summary statistics of the raw frame.
# With default arguments only the numeric columns are summarised; pass
# include='all' to describe() to cover the non-numeric columns as well.
penguin.describe()

In [None]:
# Horizontal bar chart of how many rows were recorded for each species.
species_counts = penguin['species'].value_counts()
species_counts.plot.barh()
plt.show()

In [None]:
# Number of rows per species.
penguin.groupby(by='species').size()

In [None]:
# Number of rows per island.
penguin.groupby(by='island').size()

In [None]:
# Number of rows per sex. groupby drops NaN keys by default, so rows with a
# missing `sex` value are not counted here.
penguin.groupby(by='sex').size()

In [None]:
# Count of missing entries in each column of the raw frame.
penguin.isna().sum()

In [None]:
# Impute missing values into a new frame so the raw `penguin` data stays untouched.
# Numeric measurements are filled with their column mean (NaNs are skipped when
# the mean is computed); `sex` is filled with its most frequent value.
numeric_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
fill_values = {col: penguin[col].mean() for col in numeric_cols}
fill_values['sex'] = penguin['sex'].mode()[0]

# One DataFrame-level fillna with a per-column mapping replaces the previous
# `new_penguin[col].fillna(..., inplace=True)` calls. Those were chained
# assignments: recent pandas emits a FutureWarning for them, and with
# Copy-on-Write enabled they modify a temporary and leave `new_penguin` unchanged.
new_penguin = penguin.fillna(value=fill_values)

# Verify the imputation: the columns filled above should now report zero missing values.
new_penguin.isnull().sum()

In [None]:
# Scatter plot of bill length against bill depth; species is encoded by both
# colour (hue) and marker shape (style).
g = sns.relplot(
    data=new_penguin,
    x='bill_length_mm',
    y='bill_depth_mm',
    hue='species',
    style='species',
)
g.fig.set_size_inches(10, 5)  # resize the figure to 10 x 5 inches
plt.show()

In [None]:
# Scatter plot of flipper length against body mass; species is encoded by both
# colour (hue) and marker shape (style).
g = sns.relplot(
    data=new_penguin,
    x='flipper_length_mm',
    y='body_mass_g',
    hue='species',
    style='species',
)
g.fig.set_size_inches(10, 5)  # resize the figure to 10 x 5 inches
plt.show()

In [None]:
# Grid of pairwise plots over the imputed frame, coloured by species.
sns.pairplot(data=new_penguin, hue='species')
plt.show()

In [None]:
# Feature engineering: one-hot encode the `sex` and `island` columns as integers.
# drop_first=True omits the first level of each encoded column, so that level
# acts as the implicit baseline.
new_penguin_dummy = pd.get_dummies(
    new_penguin,
    columns=['sex', 'island'],
    drop_first=True,
    dtype='int',
)
new_penguin_dummy.head()

In [None]:
# Keep only the numeric columns and compute their pairwise correlations.
numeric_features = new_penguin_dummy.select_dtypes(include=['number'])
corr_matrix = numeric_features.corr()

# Annotated heatmap of the correlation matrix, drawn on an explicit 10 x 5 inch axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap of Penguin Features")
plt.show()

In [None]:
# Split the encoded frame into training and test sets (30% held out for testing).
from sklearn.model_selection import train_test_split

Y = new_penguin_dummy['species']                 # target label
X = new_penguin_dummy.drop(columns=['species'])  # every remaining column is a predictor

# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=0
)

# Report the size of each piece of the split.
for label, part in (
    ("X_train shape :", X_train),
    ("Y_train shape :", y_train),
    ("X_test shape :", X_test),
    ("Y_test shape :", y_test),
):
    print(label, part.shape)

In [None]:
# Feature scaling with a min-max scaler.
from sklearn import preprocessing

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits, so the test data never influences the scaling.
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# First five rows of the training predictors. X_train itself is unscaled;
# the scaled values are held separately in X_train_scaled.
X_train.head(n=5)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, trained on the
# scaled training split (fit() returns the fitted estimator itself).
LR = LogisticRegression().fit(X_train_scaled, y_train)

# Predicted species labels for the scaled test split.
pred = LR.predict(X_test_scaled)

In [None]:
# Inspect the fitted logistic regression: class labels first, then the
# coefficient matrix and the intercepts.
for heading, value in (
    ("Class", LR.classes_),
    ("weights", LR.coef_),
    ("bias", LR.intercept_),
):
    print(heading)
    print(value)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Text report of per-class metrics for the predictions currently held in `pred`,
# evaluated against the true test labels.
report = classification_report(y_test, pred)
print('CLASSIFICATION REPORT\n')
print(report)

In [None]:
# Confusion matrix for `pred`: rows are the true classes, columns the predicted ones.
matrix = confusion_matrix(y_test, pred)
print('CONFUSION MATRIX')
print(matrix)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default settings, fitted on the scaled
# training features (the same inputs the logistic regression was trained on).
KNN = KNeighborsClassifier().fit(X_train_scaled, y_train)

# NOTE: this reassigns `pred`, replacing the logistic regression predictions;
# from here on `pred` holds the KNN predictions for the scaled test split.
pred = KNN.predict(X_test_scaled)

knn_report = classification_report(y_test, pred)
print('CLASSIFICATION REPORT\n')
print(knn_report)

knn_matrix = confusion_matrix(y_test, pred)
print('CONFUSION MATRIX')
print(knn_matrix)