In [29]:
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd
import joblib

##### Loading dataset from scikit-learn datsets

In [30]:
iris = load_iris()

In [31]:
data = pd.DataFrame(iris.data)
data.columns = iris.feature_names
data.tail()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


###### Trying to save indices of columns which helps in tracking the order of columns

In [33]:
features = iris['feature_names']
features


['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [34]:
X = iris.data
y = iris.target

##### Train-Test Split

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    test_size=0.3, 
                                                    random_state=667
                                                    )

In [36]:
print('X_train:',X_train.shape,'\ny_train:',y_train.shape,'\nX_test:',X_test.shape,'\ny_test:',y_test.shape)

X_train: (105, 4) 
y_train: (105,) 
X_test: (45, 4) 
y_test: (45,)


##### Creating a Pipeline
<br>Pipeline helps to automate ML workflows which is a sequence of data to be transformed and correlated together in a model that can be tested and evaluated to achieve an outcome.

In [37]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [38]:
sc = StandardScaler()
rf = RandomForestClassifier(random_state=667)

In [39]:
pipe = Pipeline([
                #('Scaler',sc), 
                 ('rf', rf)
                ])

##### Grid Search

In [40]:
#n_components = [2,5,7,9,10]

params_grid = {
    'rf__n_estimators': [3,5,10],
    'rf__max_depth': [3,5,7,9]
     }

In [41]:
estimator = GridSearchCV(pipe, params_grid,cv=5, n_jobs=1, scoring= 'r2')
estimator.fit(X_train,y_train)

print('Best parameters:',estimator.best_params_, '\n\nr2:',estimator.best_score_)


Best parameters: {'rf__max_depth': 3, 'rf__n_estimators': 3} 

r2: 0.9085084033613444


In [42]:
model = estimator.best_estimator_
model.fit(X,y)


In [43]:
joblib.dump(model,'iris.pkl')

['iris.pkl']