# Import libraries

In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, auc

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Load data

In [2]:
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

In [3]:
# List the fields available on the loaded dataset object.
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [4]:
# Species names; position in the array corresponds to the integer target code.
target_labels = iris.target_names
target_labels

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [5]:
# Integer target code -> species name. Derived from the dataset itself so the
# mapping cannot drift from `iris.target_names`; names are cast to plain `str`
# so downstream output shows ordinary Python strings rather than numpy scalars.
target_labels_dic = {code: str(name) for code, name in enumerate(iris.target_names)}

In [6]:
# Feature matrix as a DataFrame, one named column per measurement.
X = pd.DataFrame(iris.data, columns=iris.feature_names)
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
# Target codes as a single-column DataFrame named 'target'.
y = pd.DataFrame(iris.target, columns=['target'])
y.head()

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0


In [8]:
# NOTE(review): this cell repeats the earlier `target_labels` assignment verbatim;
# it is harmless on re-run but can be removed.
target_labels = iris.target_names
target_labels

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

# EDA

In [9]:
# Column dtypes and non-null counts for the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) per feature.
X.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [11]:
# Number of missing values in each feature column.
X.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [12]:
# Pairwise correlation between the feature columns.
X.corr()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
sepal length (cm),1.0,-0.11757,0.871754,0.817941
sepal width (cm),-0.11757,1.0,-0.42844,-0.366126
petal length (cm),0.871754,-0.42844,1.0,0.962865
petal width (cm),0.817941,-0.366126,0.962865,1.0


In [13]:
# Class balance: number of samples per target code.
y.value_counts()

target
0         50
1         50
2         50
Name: count, dtype: int64

# Split dataset into train/test

## **ALWAYS** before feature engineering and modeling

The primary reason for performing a train/test split before feature engineering and modeling is to prevent data leakage and to ensure that the evaluation of your model is accurate and indicative of its performance on unseen data.

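- Prevention of Data Leakage: preprocessing statistics (e.g. the scaler's means and variances) are learned from the training rows only.
- Accurate Evaluation: the test set stays untouched until the end, so its score estimates performance on unseen data.
- Ethical Modeling Practices: reported results are not inflated by information the model would not have at prediction time.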

In [14]:
# Hold out 20% of the rows for the final evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Modeling

Create pipelines for each model

In [15]:
# Candidate classifiers, keyed by the short name used throughout the notebook.
classifiers = {
    'lr': LogisticRegression(),
    'knn': KNeighborsClassifier(),
    # Seeded: tree construction permutes features at each split, so an unseeded
    # tree can score differently from one run to the next.
    'dt': DecisionTreeClassifier(random_state=42),
    'svm': SVC(),
    'nb': GaussianNB(),
}

# Every candidate gets the same scale-then-classify pipeline. The step names
# 'scaler' and 'classifier' are relied on by the tuning grid and by predict().
pipelines = {
    name: Pipeline([('scaler', StandardScaler()), ('classifier', clf)])
    for name, clf in classifiers.items()
}

#### Fit and evaluate models

In [16]:
# Flatten the single-column target frame once; the estimators expect 1-D labels.
y_train_flat = y_train.values.ravel()

# Mean 5-fold cross-validated accuracy per candidate, computed on the training
# split only. cross_val_score clones and refits the pipeline on every fold, so
# fitting the pipeline beforehand would be discarded work.
scores = {}
for model_name, pipeline in pipelines.items():
    score = cross_val_score(pipeline, X_train, y_train_flat, cv=5, scoring='accuracy').mean()
    scores[model_name] = score
    print(f"{model_name} with accuracy score {score:.3f}")

lr with accuracy score 0.958
knn with accuracy score 0.933
dt with accuracy score 0.942
svm with accuracy score 0.958
nb with accuracy score 0.942


In [17]:
# Pick the candidate with the highest mean CV accuracy (the first one wins on ties).
best_model_name, best_cv_score = max(scores.items(), key=lambda item: item[1])
print(f"Best Model: {best_model_name} with accuracy score: {best_cv_score}")

Best Model: lr with accuracy score: 0.9583333333333334


#### Hyperparameter tuning

In [18]:
# One search space per candidate, so tuning works whichever model won the CV
# comparison above. Keys use the '<step>__<param>' form and target the
# 'classifier' step of the pipeline.
param_grids = {
    'lr': {'classifier__solver': ['liblinear'],
           'classifier__penalty': ['l1', 'l2'],
           'classifier__C': [0.01, 0.1, 1, 10, 100]},
    'knn': {'classifier__n_neighbors': [3, 5, 7, 9, 11],
            'classifier__weights': ['uniform', 'distance']},
    'dt': {'classifier__max_depth': [None, 2, 3, 4, 5],
           'classifier__min_samples_split': [2, 5, 10]},
    'svm': {'classifier__C': [0.01, 0.1, 1, 10, 100],
            'classifier__kernel': ['linear', 'rbf'],
            'classifier__gamma': ['scale', 'auto']},
    'nb': {'classifier__var_smoothing': [1e-9, 1e-8, 1e-7]},
}
hyperparameters = param_grids[best_model_name]

# Exhaustive 5-fold grid search over the winning pipeline, training split only.
best_pipeline = pipelines[best_model_name]
grid_search = GridSearchCV(best_pipeline, hyperparameters, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train.values.ravel())

In [19]:
# Report the winning parameter combination and its cross-validated accuracy.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Accuracy Score: {grid_search.best_score_}")

Best Parameters: {'classifier__C': 10, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Best Accuracy Score: 0.9583333333333334


In [20]:
# Keep the pipeline that the grid search selected; this is the object that is
# evaluated on the test set and persisted below.
best_model = grid_search.best_estimator_

In [21]:
# Predict integer class codes for the held-out test rows.
y_pred = best_model.predict(X_test)

In [22]:
# Translate integer class codes into species names for a readable report.
y_test_named = [target_labels_dic[code] for code in y_test.values.ravel()]
y_pred_named = [target_labels_dic[code] for code in y_pred]

In [23]:
# Per-class precision / recall / F1 on the test set, using the species names.
print(classification_report(y_test_named, y_pred_named))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



# Deployment

In [24]:
# Attach the code -> name mapping to the classifier step as an ad-hoc attribute
# so it travels with the pipeline when it is dumped; predict() looks for this
# attribute to translate numeric predictions.
# The `classes_` check presumably guards against an unfitted or non-classifier step.
if hasattr(best_model.named_steps['classifier'], 'classes_'):
    best_model.named_steps['classifier'].class_names_mapping = target_labels_dic

In [25]:
# Persist the whole fitted pipeline to disk. The scaler is one of its steps,
# so no separate scaler file is needed.
joblib.dump(best_model, 'best_model.pkl')

['best_model.pkl']

In [26]:
def predict(input_features, model):
    """Predict class labels for one or more samples with a fitted pipeline.

    Parameters
    ----------
    input_features : list, tuple, numpy.ndarray or pandas.DataFrame
        Either a single sample (flat sequence or 1-D array) or several samples
        (nested sequence or 2-D array). Lists, tuples and arrays are promoted
        to 2-D and, when the 'scaler' step exposes ``feature_names_in_``,
        wrapped in a DataFrame carrying those column names. Any other input
        type is handed to ``model.predict`` unchanged.
    model : pipeline-like
        Object with a ``predict`` method and a ``named_steps`` mapping that
        contains 'scaler' and 'classifier' entries.

    Returns
    -------
    list or array-like
        A list of class names when the 'classifier' step carries a
        ``class_names_mapping`` attribute; otherwise whatever
        ``model.predict`` returned.

    Raises
    ------
    KeyError
        If a predicted value is missing from ``class_names_mapping``.
    """
    steps = model.named_steps

    if isinstance(input_features, (list, tuple, np.ndarray)):
        # A flat sample becomes a single row; 2-D input is left as it is.
        samples = np.atleast_2d(np.asarray(input_features))
        feature_names = getattr(steps['scaler'], 'feature_names_in_', None)
        if feature_names is not None:
            input_features = pd.DataFrame(samples, columns=feature_names)
        else:
            # The scaler recorded no column names, so pass the bare 2-D array.
            input_features = samples

    numeric_prediction = model.predict(input_features)

    classifier = steps['classifier']
    if not hasattr(classifier, 'class_names_mapping'):
        return numeric_prediction

    class_name_mapping = classifier.class_names_mapping
    return [class_name_mapping[pred] for pred in numeric_prediction]

In [27]:
# Example usage: reload the persisted pipeline and classify one hand-written sample.
# NOTE(review): joblib.load unpickles the file, so only load model files you produced yourself.
model = joblib.load('best_model.pkl')
# Values are positional and follow the training column order:
# sepal length, sepal width, petal length, petal width (cm).
new_data = [4.6, 3.2, 4.6, 3.2]  # Example new data
prediction = predict(new_data, model)
print(prediction)

['virginica']
