In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn import svm 
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.metrics import confusion_matrix 
from sklearn.svm import SVC
# Apply seaborn's 'darkgrid' style to every plot drawn in this notebook.
sns.set_style('darkgrid')

In [None]:
# Load the penguin measurements from the Kaggle input directory.
penguins_csv = '/kaggle/input/palmer-archipelago-antarctica-penguin-data/penguins_size.csv'
df = pd.read_csv(penguins_csv)
# Preview the first five rows.
df.head(5)

Let us now explore this data and see what we can learn from it.

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Summary of the columns: dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column before any cleaning.
df.isna().sum()

In [None]:
# Drop the rows with a missing `sex` value up front: `sex` is later encoded as a
# 0/1 model feature (the prediction target is `species`), so it must not hold NaN.
# Reassigning instead of using inplace=True keeps the step explicit.
df = df.dropna(subset=['sex'])

In [None]:
# Re-check the per-column missing-value counts after dropping rows without `sex`.
df.isna().sum()

In [None]:
# How many rows carry each distinct `sex` value.
df['sex'].value_counts()

Let's delete the row where sex is ".", since removing it will not make a difference.

In [None]:
# Remove the rows whose `sex` is the placeholder ".". The explicit copy makes the
# result an independent frame, so the column assignments done later in the
# notebook do not act on a slice (avoids pandas' SettingWithCopyWarning).
df = df[df.sex != "."].copy()

Let's see the distribution of the numerical values

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# One histogram (with a KDE overlay) per numeric measurement, on a 2x2 grid
# filled row by row.
fig, ax = plt.subplots(2, 2, figsize=(10, 8))
numeric_columns = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']
for column, axis in zip(numeric_columns, ax.ravel()):
    sns.histplot(data=df, x=column, kde=True, ax=axis)

## Species

In [None]:
# Number of penguins per species.
df['species'].value_counts()

In [None]:
# Left: share of each species as a pie chart; right: per-species counts split by sex.
_, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
species_counts = df['species'].value_counts()
species_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax[0])
sns.countplot(data=df, x='species', hue='sex', ax=ax[1])

It seems that Adelie makes up **44.2%** of the penguins, followed by Gentoo with **36%**; both species have roughly equal proportions of each sex. Last comes Chinstrap with 19.8%, with a small difference between the sexes.

In [None]:
# Species counts per island, with the count printed on every bar.
plt.figure(figsize=(15, 5))
ax = sns.countplot(x='island', hue='species', data=df)
for bars in ax.containers:
    ax.bar_label(bars)

Interesting: Adelie lives on all three islands, with its smallest count on Biscoe island (44), whereas Chinstrap lives only on Dream island. The same goes for Gentoo, which lives only on Biscoe island, with the highest count (124).

In [None]:
# Box plots of every measurement per species, split by sex.
# The white circle drawn in each box marks the mean (showmeans=True).
mean_marker = {
    "marker": "o",
    "markerfacecolor": "white",
    "markeredgecolor": "black",
    "markersize": "5",
}
measurements = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']
fig, ax = plt.subplots(2, 2, figsize=(10, 10))
for measurement, axis in zip(measurements, ax.ravel()):
    # Each call gets its own copy of the marker properties.
    sns.boxplot(data=df, x='species', y=measurement, hue='sex', ax=axis,
                linewidth=.8, showmeans=True, meanprops=dict(mean_marker))

### **Observations:**
* For all four numerical features, males have overall higher values than females, with one exception: a Chinstrap female whose culmen length is higher than that of all the Chinstrap males

####  **culmen length:**
    1. Adelie penguins have lower culmen length than Chinstrap and Gentoo
    2. The penguin with the highest culmen length is a male Gentoo (59.6 mm)
    3. The penguin with the lowest culmen length is a female Adelie (32.1 mm)
    4. Female Gentoo and female Chinstrap have similar culmen lengths

#### **culmen depth:**
    1. Gentoo penguins have lower culmen depth than Chinstrap and Adelie
    2. The penguin with the highest culmen depth is a male Adelie (21.5 mm)
    3. The penguin with the lowest culmen depth is a female Gentoo (13.1 mm)
    
#### **flipper length:**
    1. Adelie penguins have the lowest flipper length compared with Chinstrap and Gentoo
    2. Gentoo penguins have overall the highest flipper length
    3. The penguin with the highest flipper length is a male Gentoo (231.0 mm)
    4. The penguin with the lowest flipper length is a female Adelie (172.0 mm)
    
#### **body mass:**
    1. Adelie and Chinstrap penguins have a similar range of body mass
    2. The penguin with the highest body mass is a male Gentoo with 6300.0 g
    3. The penguin with the lowest body mass is a female Chinstrap with 2700.0 g

## Island

In [None]:
# Left: number of penguins per island, with labelled bars; right: the same split as a pie chart.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
count_axes = sns.countplot(data=df, x='island', ax=ax[0])
count_axes.bar_label(count_axes.containers[0])
island_counts = df['island'].value_counts()
island_counts.plot(kind='pie', ax=ax[1], autopct='%1.1f%%')

We can observe that most of the penguins are located on Biscoe island *(48.9%)*, followed by Dream *(36.9%)* and then Torgersen *(14.1%)*.

In [None]:
# Box plots of every measurement per island.
# The white circle drawn in each box marks the mean (showmeans=True).
mean_marker = {
    "marker": "o",
    "markerfacecolor": "white",
    "markeredgecolor": "black",
    "markersize": "5",
}
measurements = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']
fig, ax = plt.subplots(2, 2, figsize=(10, 10))
for measurement, axis in zip(measurements, ax.ravel()):
    # Each call gets its own copy of the marker properties.
    sns.boxplot(data=df, x='island', y=measurement, ax=axis,
                linewidth=.8, showmeans=True, meanprops=dict(mean_marker))

### **Observations:**
    1. Biscoe island has almost the highest culmen length, flipper length and body mass of all the islands, together with the lowest culmen depth
    2. Torgersen and Dream islands seem to have approximately the same body mass, flipper length and culmen depth
    

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(5)

In [None]:
# Pairwise scatter plots of the frame's columns.
sns.pairplot(data=df)

We can observe a positive relationship between flipper length and body mass; let's check the correlation.

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
correlations = df.corr(numeric_only=True)
sns.heatmap(data=correlations, annot=True)

That confirms what we said before: the correlation between flipper length and body mass is 0.87.

### preprocessing

In [None]:
# Encode sex as 1 for 'MALE' and 0 for anything else.
# Also accepting an already-encoded 1 makes the cell idempotent: running it a
# second time no longer turns every row into 0.
df['sex'] = df['sex'].apply(lambda value: 1 if value in (1, 'MALE') else 0)
df['sex'].value_counts()

In [None]:
# Convert body mass from grams to kilograms, then rename the column to match the new unit.
df['body_mass_g'] = df['body_mass_g'] / 1000
df = df.rename(columns={'body_mass_g': 'body_mass_kg'})
df['body_mass_kg'].describe()

In [None]:
# One-hot encode `island`; the generated columns are prefixed with 'island'.
df = pd.get_dummies(
    data=df,
    columns=['island'],
    prefix='island',
)

In [None]:
# Preview the first five rows after preprocessing.
df.head(5)

In [None]:
# Target: the species label. Features: every other column.
Y = df['species']
X = df.drop(columns='species')

## Predictive Modeling

In [None]:
# Hold out 20% of the rows for testing; stratifying on Y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

In [None]:
# Collects one model name and one cross-validated accuracy per tuned classifier.
models = dict(model=[], accuracy=[])

### Decision Tree

In [None]:
# Tune a decision tree with a 4-fold cross-validated grid search.
parameters = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}
# Fix the seed: the tree permutes the features at every split, so ties between
# equally good splits are otherwise broken differently on each run and the
# selected hyper-parameters / score would not be reproducible.
tree = DecisionTreeClassifier(random_state=42)
tree_cv = GridSearchCV(tree, parameters, cv=4)
tree_cv.fit(X_train, y_train)
print(tree_cv.best_estimator_)

In [None]:
# Record the decision tree's best mean cross-validated score for the comparison chart.
model_name, cv_accuracy = 'Decision Tree', tree_cv.best_score_
models['model'].append(model_name)
models['accuracy'].append(cv_accuracy)

### Random Forest

In [None]:
# Tune a random forest with a 4-fold cross-validated grid search.
parameters = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [1, 2, 5]
}
# Fix the seed: bootstrap sampling and per-split feature sampling are random,
# so without it the selected hyper-parameters / score change between runs.
random_forest = RandomForestClassifier(random_state=42)
random_forest_cv = GridSearchCV(random_forest, parameters, cv=4)
random_forest_cv.fit(X_train, y_train)
print(random_forest_cv.best_estimator_)

In [None]:
# Record the random forest's best mean cross-validated score for the comparison chart.
model_name, cv_accuracy = 'Random Forest', random_forest_cv.best_score_
models['model'].append(model_name)
models['accuracy'].append(cv_accuracy)

### XGboost

In [None]:
# Encode the species labels as integers for the models that follow.
# The encoder gets its own name (it used to be `lr`) so that the later
# `lr = LogisticRegression(...)` assignment does not overwrite it; the fitted
# encoder therefore stays available, e.g. for label_encoder.inverse_transform.
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y)

In [None]:
# Redo the 80/20 stratified split so that y_train / y_test hold the encoded labels;
# the test size and seed are the same as in the first split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

In [None]:
# Tune XGBoost (4-fold CV) over learning rate, tree depth and number of trees.
parameters = dict(
    learning_rate=[0.1, 0.01, 0.001],
    max_depth=[3, 5, 7],
    n_estimators=[100, 200, 300],
)
xgb = XGBClassifier(objective='multi:softmax')
xgb_cv = GridSearchCV(estimator=xgb, param_grid=parameters, cv=4)
xgb_cv.fit(X_train, y_train)
print(xgb_cv.best_estimator_)

In [None]:
# Record XGBoost's best mean cross-validated score for the comparison chart.
model_name, cv_accuracy = 'XGboost', xgb_cv.best_score_
models['model'].append(model_name)
models['accuracy'].append(cv_accuracy)

### SVM

In [None]:
# Tune a support-vector classifier over C, kernel and gamma.
parameters = {
    'C': [0.1, 1, 10],
    'kernel': ['poly', 'rbf'],
    'gamma': ['scale', 'auto']
}
# The estimator is named `svc` rather than `svm`, so it no longer shadows the
# `sklearn.svm` module imported at the top of the notebook.
svc = SVC()
# cv=4 matches the other grid searches; their best_score_ values are compared
# side by side, so every model must be scored on the same number of folds.
svm_cv = GridSearchCV(svc, parameters, cv=4)
svm_cv.fit(X_train, y_train)
svm_cv.best_estimator_

In [None]:
# Record the SVM's best mean cross-validated score for the comparison chart.
model_name, cv_accuracy = 'SVM', svm_cv.best_score_
models['model'].append(model_name)
models['accuracy'].append(cv_accuracy)

### Logistic Regression

In [None]:
# Tune the regularisation strength C of a liblinear logistic regression.
parameters = {'C': [0.1, 1, 10]}
lr = LogisticRegression(solver='liblinear')
# cv=4 matches the other grid searches; their best_score_ values are compared
# side by side, so every model must be scored on the same number of folds.
lr_cv = GridSearchCV(lr, parameters, cv=4)
lr_cv.fit(X_train, y_train)
lr_cv.best_estimator_

In [None]:
# Record the logistic regression's best mean cross-validated score for the comparison chart.
model_name, cv_accuracy = 'Logistic Regression', lr_cv.best_score_
models['model'].append(model_name)
models['accuracy'].append(cv_accuracy)

In [None]:
# Turn the collected names and scores into a two-column frame for plotting.
models = pd.DataFrame(data=models)

In [None]:
# Horizontal bars: one per model, with length equal to its cross-validated accuracy.
plt.figure(figsize=(15, 5))
sns.barplot(x='accuracy', y='model', data=models)

In [None]:
# Hyper-parameters selected by the logistic-regression grid search.
lr_cv.best_params_

In [None]:
# Refit the final logistic regression on the full training set using the
# hyper-parameters the grid search actually selected, rather than a hard-coded
# C that silently goes stale whenever the grid, the data or the folds change.
lr = LogisticRegression(solver='liblinear', **lr_cv.best_params_)
lr.fit(X_train, y_train)

In [None]:
# Accuracy of the final model on the held-out test set.
y_pred = lr.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of the test-set predictions. scikit-learn puts the true
# classes on the rows and the predicted classes on the columns, so the axes are
# labelled accordingly; fmt='d' prints the counts as plain integers (the
# default format switches to scientific notation for counts of 100 or more).
cm_ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')
cm_ax.set(xlabel='Predicted label', ylabel='True label');