### Import libraries

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

### Read data

In [2]:
data = pd.read_csv('penguins_size.csv')
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


### Filling in missing values

In [3]:
data.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [4]:
data.drop(data[data['body_mass_g'].isnull()].index,axis=0, inplace=True)

In [5]:
data['sex'] = data['sex'].fillna('MALE')

In [6]:
data.drop(data[data['sex']=='.'].index, inplace=True)

### Creating classifier for species 

Encode 'sex' and 'island' as categorical variable

In [7]:
df = data.copy()
target = 'species'
encode = ['sex', 'island']

for col in encode:
    dummy = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df,dummy], axis=1)
    del df[col]

Encode target variable

In [8]:
target_mapper = {'Adelie':0, 'Chinstrap':1, 'Gentoo':2}
def target_encode(val):
    return target_mapper[val]

df[target] = df[target].apply(target_encode)

Separate X and y

In [9]:
X = df.drop(target, axis=1)
y = df[target]

Scale data

In [10]:
X = preprocessing.scale(X)

Split training and test data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Fit model and predict

In [12]:
model = RandomForestClassifier(n_estimators = 200, max_depth = 10, min_samples_split = 2, min_samples_leaf = 1)
model.fit(X_train, y_train)
pred = model.predict(X_test)

Check performance of model

In [13]:
print('CONFUSION MATRIX')
print(confusion_matrix(y_test, pred))

CONFUSION MATRIX
[[33  1  0]
 [ 0  9  0]
 [ 0  0 26]]
