In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score,recall_score,precision_score,classification_report
from sklearn.metrics import confusion_matrix,roc_curve,f1_score,multilabel_confusion_matrix
from sklearn.metrics import precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

In [3]:
df= pd.read_csv("Iris.csv")
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [4]:
df.drop("Id",axis=1,inplace=True)

In [5]:
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   Species        150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [7]:
df.isnull().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [8]:
df.corr()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
SepalLengthCm,1.0,-0.109369,0.871754,0.817954
SepalWidthCm,-0.109369,1.0,-0.420516,-0.356544
PetalLengthCm,0.871754,-0.420516,1.0,0.962757
PetalWidthCm,0.817954,-0.356544,0.962757,1.0


In [9]:
df["Species"].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

### split the data

In [10]:
x=df.drop("Species",axis=1)
y=df["Species"]

In [11]:
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=0.2,random_state=45,stratify=y)

In [12]:
rf_model = RandomForestClassifier()

rf_model.fit(x_train,y_train)

### evaluate

In [13]:
#train data
yp_train=rf_model.predict(x_train)
cnf_matrix=confusion_matrix(y_train,yp_train)
print("confusion_matrix\n",cnf_matrix)
accuracy=accuracy_score(y_train,yp_train)
print("Accuracy",accuracy)
# recall=recall_score(y_train,yp_train)
# print('recall',recall)
# precision=precision_score(y_train,yp_train)
# print("precision",precision)
clf_rpt=classification_report(y_train,yp_train)
print("classification_report\n",clf_rpt)

confusion_matrix
 [[40  0  0]
 [ 0 40  0]
 [ 0  0 40]]
Accuracy 1.0
classification_report
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        40
Iris-versicolor       1.00      1.00      1.00        40
 Iris-virginica       1.00      1.00      1.00        40

       accuracy                           1.00       120
      macro avg       1.00      1.00      1.00       120
   weighted avg       1.00      1.00      1.00       120



In [14]:
#test data
yp_test=rf_model.predict(x_test)
cnf_matrix=confusion_matrix(y_test,yp_test)
print("confusion_matrix\n",cnf_matrix)
accuracy=accuracy_score(y_test,yp_test)
print("Accuracy",accuracy)
# recall=recall_score(y_test,yp_test)
# print('recall',recall)
# precision=precision_score(y_test,yp_test)
# print("precision",precision)
clf_rpt=classification_report(y_test,yp_test)
print("classification_report\n",clf_rpt)

confusion_matrix
 [[10  0  0]
 [ 0  9  1]
 [ 0  2  8]]
Accuracy 0.9
classification_report
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.82      0.90      0.86        10
 Iris-virginica       0.89      0.80      0.84        10

       accuracy                           0.90        30
      macro avg       0.90      0.90      0.90        30
   weighted avg       0.90      0.90      0.90        30



In [15]:
multilabel_confusion_matrix(y_test,yp_test)

array([[[20,  0],
        [ 0, 10]],

       [[18,  2],
        [ 1,  9]],

       [[19,  1],
        [ 2,  8]]])

In [16]:
#df.iloc[0,:]
SepalLengthCm   =         5.1
SepalWidthCm    =         3.5
PetalLengthCm   =         1.4
PetalWidthCm    =         0.2

In [17]:
array=np.array([SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm])
array

array([5.1, 3.5, 1.4, 0.2])

In [18]:
rf_model.predict([array])[0]

'Iris-setosa'

In [21]:
rf_model.predict([[5.6,2.5,3.9,1.1]])[0]

'Iris-versicolor'

In [24]:
df.sample(5)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
39,5.1,3.4,1.5,0.2,Iris-setosa
69,5.6,2.5,3.9,1.1,Iris-versicolor
138,6.0,3.0,4.8,1.8,Iris-virginica
33,5.5,4.2,1.4,0.2,Iris-setosa


In [22]:
import json
import pickle
import joblib

In [26]:
prj_data={"columns":list(x.columns)}

In [None]:
# with open ("prj_data.json","w") as f:
#     json.dump(prj_data,f)

In [None]:
# with open ("rf_model.pkl","wb") as f:
#     pickle.dump(rf_model,f)

In [23]:
model=joblib.dump(rf_model,'iris_model.joblib')

/bin/bash: line 1: joblib: command not found
