In [236]:
import pandas as pd

import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

import pickle

In [229]:
def load_clean_data(csv): 
    
    df = pd.read_csv(csv)
    
    df.dropna(inplace = True)
    
    df = df[df['sex'] != '.']
    
    return df

In [230]:
def build_model(): 
    
    # Encode dummy variables for categorical data
    categorical_transformer = Pipeline([
        ('one_hot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('cat', categorical_transformer, categorical_features)
    ])

    # CLF pipeline
    clf = Pipeline([
        ('transform', preprocessor),
        ('clf', RandomForestClassifier())
    ])
    
    param_grid = {
        'clf__n_estimators': [50,100]
    }

    cv = GridSearchCV(estimator= clf, param_grid= param_grid)
    
    return cv
    

In [231]:
def display_results(model, y_test, y_pred):
    confusion_mat = confusion_matrix(y_test, y_pred, labels= np.unique(y_pred))
    accuracy = accuracy_score(y_test, y_pred)

    print("Labels:", np.unique(y_pred))
    #print("Confusion Matrix:\n", confusion_matrix
    print("Accuracy: {}%" .format(round(accuracy*100,2)))
    print("\nBest Parameters:", cv.best_params_)
    print(ConfusionMatrixDisplay(confusion_mat, display_labels= np.unique(y_pred)).plot())
    


In [232]:
def main():
    df = load_clean_data('penguins_size.csv')
    
    features = df.drop('species', axis = 1)

    target = df['species']

    numeric_features_vals = features.select_dtypes(include=[float, int])

    numeric_features = numeric_features_vals.columns

    categorical_features_vals = features.select_dtypes(include = [object])
    
    categorical_features = categorical_features_vals.columns

    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size= 0.25, random_state= 123)
    
    model = build_model()
    
    model.fit(x_train, y_train)
    
    return model
    
    

In [233]:
main()

GridSearchCV(estimator=Pipeline(steps=[('transform',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('one_hot',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['island', 'sex'], dtype='object'))])),
                                       ('clf', RandomForestClassifier())]),
             param_grid={'clf__n_estimators': [50, 100]})

In [235]:
model = main()

model

GridSearchCV(estimator=Pipeline(steps=[('transform',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('one_hot',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['island', 'sex'], dtype='object'))])),
                                       ('clf', RandomForestClassifier())]),
             param_grid={'clf__n_estimators': [50, 100]})

In [237]:
with open("penguin_model_rf.pkl", "wb") as file: 
    pickle.dump(model, file)

In [238]:
import flask

In [242]:
with open("penguin_model_rf.pkl", "rb") as f: 
    mod = pickle.load(f)

In [243]:
mod

GridSearchCV(estimator=Pipeline(steps=[('transform',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('one_hot',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['island', 'sex'], dtype='object'))])),
                                       ('clf', RandomForestClassifier())]),
             param_grid={'clf__n_estimators': [50, 100]})

In [245]:
mod.predict(sample)

array(['Gentoo'], dtype=object)

In [246]:
sample2

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Biscoe,45.6,20.3,190,4500.0,Male


In [249]:
input_variables = pd.DataFrame([['Biscoe', 45.6, 20.3, 190, 4500.0, 'Male']],
                              columns=['island',
                                            'culmen_length_mm',
                                            'culmen_depth_mm',
                                            'flipper_length_mm',
                                            'body_mass_g',
                                            'sex'],
                              index = ['input'])

input_variables

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
input,Biscoe,45.6,20.3,190,4500.0,Male


In [250]:
mod.predict(input_variables)

array(['Gentoo'], dtype=object)

In [None]:
    y_pred = model.predict(x_test)
    
    return display_results(model, y_test, y_pred)

In [179]:
x_test

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
111,Biscoe,45.6,20.3,191.0,4600.0,MALE
158,Dream,46.1,18.2,178.0,3250.0,FEMALE
288,Biscoe,43.5,14.2,220.0,4700.0,FEMALE
308,Biscoe,47.5,14.0,212.0,4875.0,FEMALE
185,Dream,51.0,18.8,203.0,4100.0,MALE
...,...,...,...,...,...,...
321,Biscoe,55.9,17.0,228.0,5600.0,MALE
77,Torgersen,37.2,19.4,184.0,3900.0,MALE
207,Dream,52.2,18.8,197.0,3450.0,MALE
245,Biscoe,46.1,15.1,215.0,5100.0,MALE


In [204]:
sample = pd.DataFrame(data = {"island":"Biscoe", 
                       "culmen_length_mm":45.6, 
                       "culmen_depth_mm":20.3, 
                       "flipper_length_mm":190, 
                       "body_mass_g":4500.0, 
                       "sex":"Male"}, index = [0])



In [213]:
sample2 = pd.DataFrame(np.array(["Biscoe", 45.6, 20.3, 190, 4500.0, "Male"]).reshape(1,-1), 
                      columns = features.columns)

sample2

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Biscoe,45.6,20.3,190,4500.0,Male


In [205]:
sample

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Biscoe,45.6,20.3,190,4500.0,Male


In [191]:
x_test.loc[111].shape

(6,)

In [218]:
x = 111

x = str(x)

x

'111'

In [183]:
model = main()

In [220]:
model.predict(sample2)[0]

'Gentoo'

In [221]:
def classify_penguin(a, b, c, d, e, f):
    
    
    arr = np.array([str(a), float(b), float(c), float(d), float(e), str(f)]).reshape(1, -1)
    
    sample = pd.DataFrame(arr, columns = ['island', 
                                          'culmen_length_mm', 
                                          'culmen_depth_mm', 
                                          'flipper_length_mm',
                                          'body_mass_g', 
                                          'sex'])
    
    return model.predict(sample)[0]

In [227]:
classify_penguin('Biscoe', 45.6, 20.3, 190, 4500.0, 'Male')

'Gentoo'

In [226]:
sample.values[0]

array(['Biscoe', 45.6, 20.3, 190, 4500.0, 'Male'], dtype=object)