In [17]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Load the Iris measurements from a CSV given by relative path and preview
# the first five rows.
csv_path = 'IRIS.csv'
data = pd.read_csv(csv_path)
data.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Number of missing values per column (isna is the canonical name; isnull is its alias).
data.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [6]:
# Class balance: how many rows belong to each species.
species_column = data['species']
species_column.value_counts()

species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [7]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


# Encoding the species column (one-hot)

In [9]:
# One-hot encode the categorical `species` column and swap it for the
# resulting indicator columns. `encoder` is kept so later cells can map
# class indices back to species names via `encoder.categories_`.
encoder = OneHotEncoder()

# fit_transform returns a sparse matrix by default; densify it for the DataFrame.
encoded = encoder.fit_transform(data[['species']]).toarray()

# Reuse data's index: concat(axis=1) aligns on index labels, so a fresh
# RangeIndex here would misalign rows (and introduce NaNs) whenever `data`
# does not carry the default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(['species']),
    index=data.index,
)

# Drop the original label column and attach the indicators in a single
# expression instead of mutating the frame in place afterwards.
# NOTE: this cell rebinds `data`, so it can only be run once per kernel
# session; re-running it fails because `species` is already gone.
data = pd.concat([data.drop(columns=['species']), encoded_df], axis=1)
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_Iris-setosa,species_Iris-versicolor,species_Iris-virginica
0,5.1,3.5,1.4,0.2,1.0,0.0,0.0
1,4.9,3.0,1.4,0.2,1.0,0.0,0.0
2,4.7,3.2,1.3,0.2,1.0,0.0,0.0
3,4.6,3.1,1.5,0.2,1.0,0.0,0.0
4,5.0,3.6,1.4,0.2,1.0,0.0,0.0


In [10]:
# Re-check for missing values after the one-hot columns were attached.
data.isna().sum()

sepal_length               0
sepal_width                0
petal_length               0
petal_width                0
species_Iris-setosa        0
species_Iris-versicolor    0
species_Iris-virginica     0
dtype: int64

In [11]:
# Confirm dtypes and non-null counts of the frame after encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sepal_length             150 non-null    float64
 1   sepal_width              150 non-null    float64
 2   petal_length             150 non-null    float64
 3   petal_width              150 non-null    float64
 4   species_Iris-setosa      150 non-null    float64
 5   species_Iris-versicolor  150 non-null    float64
 6   species_Iris-virginica   150 non-null    float64
dtypes: float64(7)
memory usage: 8.3 KB


In [12]:
# Summary statistics of the frame after encoding, including the indicator columns.
data.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_Iris-setosa,species_Iris-versicolor,species_Iris-virginica
count,150.0,150.0,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,0.333333,0.333333,0.333333
std,0.828066,0.433594,1.76442,0.763161,0.472984,0.472984,0.472984
min,4.3,2.0,1.0,0.1,0.0,0.0,0.0
25%,5.1,2.8,1.6,0.3,0.0,0.0,0.0
50%,5.8,3.0,4.35,1.3,0.0,0.0,0.0
75%,6.4,3.3,5.1,1.8,1.0,1.0,1.0
max,7.9,4.4,6.9,2.5,1.0,1.0,1.0


# ML: logistic regression training and evaluation

In [18]:
# Split the frame into features (the measurement columns) and the one-hot
# target columns, naming the target columns once instead of repeating the list.
target_columns = ['species_Iris-setosa', 'species_Iris-versicolor', 'species_Iris-virginica']
X = data.drop(columns=target_columns)
y = data[target_columns]

# Collapse the one-hot rows to a single integer class index per sample;
# the index is the position of the hot column within `target_columns`.
y_single = np.argmax(y.values, axis=1)

# Hold out 20% of the rows for testing; stratify keeps the class proportions
# equal in both splits and random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_single,
    test_size=0.2,
    random_state=42,
    stratify=y_single,
)

# Create a logistic regression model.
# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and with the lbfgs solver a target with more than
# two classes is already fitted as a multinomial model by default.
model_lr = LogisticRegression(solver='lbfgs')

model_lr.fit(X_train, y_train)

# Predict the species for the test set
y_pred = model_lr.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.2f}")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred))
print("Classification Report:", classification_report(y_test, y_pred))




Model accuracy: 0.97
Confusion Matrix: [[10  0  0]
 [ 0  9  1]
 [ 0  0 10]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [30]:
# Score a single hand-written observation with the fitted model.
# NOTE(review): every value below is larger than the corresponding column
# maximum in the dataset summary (7.9, 4.4, 6.9, 2.5), so the model is
# extrapolating here — confirm these numbers are intended.
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
new_sample = pd.DataFrame([[11.2, 31.5, 11.4, 10.2]], columns=feature_names)

# The frame carries the same column names the model was fitted on.
predicted_species_idx = model_lr.predict(new_sample)
print(f"Predicted species index for the new sample: {predicted_species_idx}")

# All known species names, in the encoder's category order; the class
# indices used for training were taken as positions in this same order.
species_labels = encoder.categories_[0]
print(f"Predicted species labels: {species_labels}")

# Translate the single predicted index back to its species name.
predicted_species = species_labels[predicted_species_idx[0]]
print(f"Predicted species for the new sample: {predicted_species}")

Predicted species index for the new sample: [2]
Predicted species labels: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
Predicted species for the new sample: Iris-virginica
