Problem Description
Use sklearn.datasets iris flower dataset to train your model using logistic regression. You need
to figure out the accuracy of your model and use that to predict different samples in your test
dataset. In iris dataset there are 150 samples containing following features,
1. Sepal Length
2. Sepal Width
3. Petal length
4. Petal width
Using above 4 features you will classify a flower in one of the three categories,
1. Setosa
2. Versicolour
3. Virginica


In [17]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,confusion_matrix,f1_score,classification_report
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV

In [None]:
iris = load_iris()
print("Dir : ",dir(iris))
print("Feature_names : ",iris.feature_names)
print("Target_names  : ",iris.target_names)
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['Category'] = iris.target

df.sample(5)

In [19]:
print("The shape of this dataset is ",df.shape)

The shape of this dataset is  (150, 5)


In [20]:
print("No. of elements in this dataset is ",df.size)

No. of elements in this dataset is  750


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   Category           150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


# Data Cleaning

In [22]:
df[df.duplicated()]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Category
142,5.8,2.7,5.1,1.9,2


In [23]:
df.drop_duplicates(inplace=True)
df[df.duplicated()]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Category


# Splitting the dataset

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(df[iris.feature_names],df["Category"], test_size=0.2)
print('There are {} samples in the training set and {} samples in the test set'.format(X_train.shape[0], X_test.shape[0]))

There are 119 samples in the training set and 30 samples in the test set


# Different types of Classification Models

# 1 - Logistic regression model

In [25]:
LRmodel = LogisticRegression()
LRmodel.fit(X_train, Y_train)
LRmodel_Pred = LRmodel.predict(X_test)
print("Train Score : ",LRmodel.score(X_train,Y_train))
print("Score : ",LRmodel.score(X_test,Y_test))
print("Accuracy_score : ",accuracy_score(Y_test, LRmodel_Pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, LRmodel_Pred))
print('F1 score : ', f1_score(Y_test, LRmodel_Pred, average='macro'))
print('Classification Report : \n', classification_report(Y_test, LRmodel_Pred))


Train Score :  0.9747899159663865
Score :  0.9666666666666667
Accuracy_score :  0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
F1 score :  0.9658994032395567
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



# 2 - Decision Tree Classifier Model

In [26]:
from sklearn.tree import DecisionTreeClassifier

DTmodel = DecisionTreeClassifier()
DTmodel.fit(X_train,Y_train)
DTmodel_Pred = DTmodel.predict(X_test)
print("Train Score : ",DTmodel.score(X_train,Y_train))
print("Score : ",DTmodel.score(X_test,Y_test))
print("Accuracy_score : ",accuracy_score(Y_test, DTmodel_Pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, DTmodel_Pred))
print('F1 score : ', f1_score(Y_test, DTmodel_Pred, average='macro'))
print('Classification Report : \n', classification_report(Y_test, DTmodel_Pred))


Train Score :  1.0
Score :  0.9666666666666667
Accuracy_score :  0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
F1 score :  0.9658994032395567
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



# Hyper Parameter tuning of Decision Tree Classifier

In [27]:
from sklearn.tree import DecisionTreeClassifier

parameter={
    'criterion':['gini','entropy','log_loss'],
    'max_depth':[3,4,5,6,7,8,9,10],
    'max_features':['auto','sqrt','log2'],
    'splitter':['best', 'random']
}

DTmodel = DecisionTreeClassifier()

cv=GridSearchCV(DTmodel,parameter,scoring='accuracy')
cv.fit(X_train,Y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",cv.best_estimator_)
print("\n The best score across ALL searched params:\n",cv.best_score_)
print("\n The best parameters across ALL searched params:\n",cv.best_params_)

cv_model=cv.best_estimator_
DTmodel.fit(X_train, Y_train)
DTmodel_Pred = cv_model.predict(X_test)

print("\nScore : ",cv_model.score(X_test,Y_test))
print("Accuracy_score : ",accuracy_score(Y_test, DTmodel_Pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, DTmodel_Pred))
print('F1 score : ', f1_score(Y_test, DTmodel_Pred, average='macro'))
print('Classification Report : \n', classification_report(Y_test, DTmodel_Pred))


 Results from Grid Search 

 The best estimator across ALL searched params:
 DecisionTreeClassifier(criterion='entropy', max_depth=10, max_features='sqrt',
                       splitter='random')

 The best score across ALL searched params:
 0.9583333333333334

 The best parameters across ALL searched params:
 {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'splitter': 'random'}

Score :  0.9666666666666667
Accuracy_score :  0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
F1 score :  0.9658994032395567
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



240 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Mathew\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Mathew\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Mathew\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Mathew\AppData\Local\Programs\Python\Python3

# 3 - Random Forest Classifier Model

In [28]:
from sklearn.ensemble import RandomForestClassifier 


RFCmodel = RandomForestClassifier()


RFCmodel.fit(X_train, Y_train)
RFCmodel_Pred = RFCmodel.predict(X_test)

print("Train Score : ",RFCmodel.score(X_train,Y_train))
print("Score : ",RFCmodel.score(X_test,Y_test))
print("Accuracy_score : ",accuracy_score(Y_test, RFCmodel_Pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, RFCmodel_Pred))
print('F1 score : ', f1_score(Y_test, RFCmodel_Pred, average='macro'))
print('Classification Report : \n', classification_report(Y_test, RFCmodel_Pred))


Train Score :  1.0
Score :  0.9666666666666667
Accuracy_score :  0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
F1 score :  0.9658994032395567
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



# Hyper Parameter tuning of Random Forest Classifier 

In [29]:
from sklearn.ensemble import RandomForestClassifier 

parameter={
    'n_estimators':[10,50,100],
    'criterion':['gini','entropy','log_loss'],
    'max_depth':np.arange(2,10,2),
    'max_features':[None,'sqrt','log2'],
    "bootstrap": [True, False]
    
}

RFCmodel = RandomForestClassifier()

cv=GridSearchCV(RFCmodel,parameter,scoring='accuracy')
cv.fit(X_train,Y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",cv.best_estimator_)
print("\n The best score across ALL searched params:\n",cv.best_score_)
print("\n The best parameters across ALL searched params:\n",cv.best_params_)

cv_model=cv.best_estimator_
cv_model.fit(X_train, Y_train)
RFCmodel_Pred = cv_model.predict(X_test)


print("Score : ",cv_model.score(X_test,Y_test))
print("Accuracy_score : ",accuracy_score(Y_test, RFCmodel_Pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, RFCmodel_Pred))
print('F1 score : ', f1_score(Y_test, RFCmodel_Pred, average='macro'))
print('Classification Report : \n', classification_report(Y_test, RFCmodel_Pred))


 Results from Grid Search 

 The best estimator across ALL searched params:
 RandomForestClassifier(max_depth=6, max_features=None, n_estimators=10)

 The best score across ALL searched params:
 0.9666666666666668

 The best parameters across ALL searched params:
 {'bootstrap': True, 'criterion': 'gini', 'max_depth': 6, 'max_features': None, 'n_estimators': 10}
Score :  0.9666666666666667
Accuracy_score :  0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
F1 score :  0.9658994032395567
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



# 4 - Ada Boost Classifier Model

In [30]:
from sklearn.ensemble import AdaBoostClassifier 
AdaBCmodel = AdaBoostClassifier()
AdaBCmodel.fit(X_train, Y_train)
AdaBCmodel_Pred = AdaBCmodel.predict(X_test)
print("Train Score : ",AdaBCmodel.score(X_train,Y_train))
print("Score : ",AdaBCmodel.score(X_test,Y_test))
print("Accuracy_score : ",accuracy_score(Y_test, AdaBCmodel_Pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, AdaBCmodel_Pred))
print('F1 score : ', f1_score(Y_test, AdaBCmodel_Pred, average='macro'))
print('Classification Report : \n', classification_report(Y_test, AdaBCmodel_Pred))


Train Score :  0.9747899159663865
Score :  0.9666666666666667
Accuracy_score :  0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
F1 score :  0.9658994032395567
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



# 5 - Gradient Boosting Classifier Model

In [31]:
from sklearn.ensemble import GradientBoostingClassifier
GBCmodel = GradientBoostingClassifier()
GBCmodel.fit(X_train, Y_train)
GBCmodel_Pred = GBCmodel.predict(X_test)
print("Train Score : ",GBCmodel.score(X_train,Y_train))
print("Score : ",GBCmodel.score(X_test,Y_test))
print("Accuracy_score : ",accuracy_score(Y_test, GBCmodel_Pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, GBCmodel_Pred))
print('F1 score : ', f1_score(Y_test, GBCmodel_Pred, average='macro'))
print('Classification Report : \n', classification_report(Y_test, GBCmodel_Pred))


Train Score :  1.0
Score :  0.9666666666666667
Accuracy_score :  0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
F1 score :  0.9658994032395567
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



# 6 - XGB Classifier Model

In [32]:
import xgboost as xgb
XGBmodel = xgb.XGBClassifier()
XGBmodel.fit(X_train, Y_train)
XGBmodel_Pred = XGBmodel.predict(X_test)
print("Train Score : ",XGBmodel.score(X_train,Y_train))
print(" Test Score : ",XGBmodel.score(X_test,Y_test))
print("Accuracy_score : ",accuracy_score(Y_test, XGBmodel_Pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, XGBmodel_Pred))
print('F1 score : ', f1_score(Y_test, XGBmodel_Pred, average='macro'))
print('Classification Report : \n', classification_report(Y_test, XGBmodel_Pred))


Train Score :  1.0
 Test Score :  0.9666666666666667
Accuracy_score :  0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
F1 score :  0.9658994032395567
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



# 7 - SVC Model

In [33]:
from sklearn.svm import SVC 
SVCmodel = SVC()
SVCmodel.fit(X_train, Y_train)
SVCmodel_Pred = SVCmodel.predict(X_test)
print("Train Score : ",SVCmodel.score(X_train,Y_train))
print("Test Score : ",SVCmodel.score(X_test,Y_test))
#print("Accuracy_score : ",SVCmodel_Pred(Y_test, SVCmodel_Pred,average='macro'))
#print('Confusion Matrix:\n', SVCmodel_Pred(Y_test, LRmodel_Pred))
print('F1 score : ', f1_score(Y_test, SVCmodel_Pred, average='macro'))
print('Classification Report : \n', classification_report(Y_test, SVCmodel_Pred))


Train Score :  0.9663865546218487
Test Score :  0.9666666666666667
F1 score :  0.9658994032395567
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



# Hyper Parameter tuning of Support Vector Classifier 

In [34]:
parameter = {
    'C': [0.1, 1, 3, 10],          
    'gamma': [0.001, 0.01, 0.1],
    'degree': [2, 3, 4, 5],
    'kernel': ['linear','poly','rbf','sigmoid']
}



cv=GridSearchCV(SVCmodel,parameter,scoring='accuracy')
cv.fit(X_train,Y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",cv.best_estimator_)
print("\n The best score across ALL searched params:\n",cv.best_score_)
print("\n The best parameters across ALL searched params:\n",cv.best_params_)

cv_model=cv.best_estimator_
cv_model.fit(X_train, Y_train)
SVCmodel_Pred = cv_model.predict(X_test)


print("Score : ",cv_model.score(X_test,Y_test))
#print("Accuracy_score : ",accuracy_score(Y_test, SVCmodel_Pred))
#print('Confusion Matrix:\n', confusion_matrix(Y_test, SVCmodel_Pred))
print('F1 score : ', f1_score(Y_test, SVCmodel_Pred, average='macro'))
print('Classification Report : \n', classification_report(Y_test, SVCmodel_Pred))


 Results from Grid Search 

 The best estimator across ALL searched params:
 SVC(C=1, degree=2, gamma=0.001, kernel='linear')

 The best score across ALL searched params:
 0.975

 The best parameters across ALL searched params:
 {'C': 1, 'degree': 2, 'gamma': 0.001, 'kernel': 'linear'}
Score :  0.9666666666666667
F1 score :  0.9658994032395567
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



# 8 - KNeighbors Classifier Model

In [35]:
from sklearn.neighbors import KNeighborsClassifier
KNNmodel = KNeighborsClassifier()
KNNmodel.fit(X_train, Y_train)
KNNmodel_Pred = KNNmodel.predict(X_test)
print("Train Score : ",KNNmodel.score(X_train,Y_train))
print("Test Score : ",KNNmodel.score(X_test,Y_test))
print("Accuracy_score : ",accuracy_score(Y_test, KNNmodel_Pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, KNNmodel_Pred))
print('F1 score : ', f1_score(Y_test, KNNmodel_Pred, average='macro'))
print('Classification Report : \n', classification_report(Y_test, KNNmodel_Pred))


Train Score :  0.9747899159663865
Test Score :  0.9666666666666667
Accuracy_score :  0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
F1 score :  0.9658994032395567
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



# Hyper Parameter tuning of KNeighbors Classifier

In [36]:
parameter={
    'n_neighbors': [3, 5, 7, 9, 11], 
    'weights': ['uniform', 'distance',None],  
    'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']
    
}


KNNmodel = KNeighborsClassifier()

cv=GridSearchCV(KNNmodel,parameter,scoring='accuracy')
cv.fit(X_train,Y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",cv.best_estimator_)
print("\n The best score across ALL searched params:\n",cv.best_score_)
print("\n The best parameters across ALL searched params:\n",cv.best_params_)

cv_model=cv.best_estimator_
cv_model.fit(X_train, Y_train)
KNNmodel_Pred = cv_model.predict(X_test)



print("Score : ",cv_model.score(X_test,Y_test))
print("Accuracy_score : ",accuracy_score(Y_test, KNNmodel_Pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, KNNmodel_Pred))
print('F1 score : ', f1_score(Y_test, KNNmodel_Pred, average='macro'))
print(' Classification Report : \n', classification_report(Y_test, KNNmodel_Pred))

[WinError 2] The system cannot find the file specified
  File "C:\Users\Mathew\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\Mathew\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Mathew\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1024, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\Mathew\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1493, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


 Results from Grid Search 

 The best estimator across ALL searched params:
 KNeighborsClassifier(n_neighbors=9, weights='distance')

 The best score across ALL searched params:
 0.975

 The best parameters across ALL searched params:
 {'algorithm': 'auto', 'n_neighbors': 9, 'weights': 'distance'}
Score :  0.9666666666666667
Accuracy_score :  0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
F1 score :  0.9658994032395567
 Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



# 9 - Naive Bayes

In [38]:
from sklearn.naive_bayes import GaussianNB
NBCModel = GaussianNB()
NBCModel.fit(X_train, Y_train)
y_pred = NBCModel.predict(X_test)
#print(y_pred)
#print(Y_test)
mislabel = np.sum(Y_test != y_pred)
print("Total no of mislabelled datapoints :",mislabel)

print("Score : ",NBCModel.score(X_test,Y_test))
print("Accuracy_score : ",accuracy_score(Y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, y_pred))
print('F1 score : ', f1_score(Y_test, y_pred, average='macro'))
print(' Classification Report : \n',classification_report(y_pred, Y_test))

Total no of mislabelled datapoints : 0
Score :  1.0
Accuracy_score :  1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
F1 score :  1.0
 Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

