In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler  # Import MinMaxScaler

In [4]:
df = pd.read_csv('Iris.csv')
df.head()#To print only first 5 values from dataset

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Get a summary of the dataset
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [6]:
# Get a concise summary of the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [7]:
# Get the shape of the DataFrame
df.shape

(150, 6)

In [8]:
# Get the number of rows and columns in the DataFrame
df.dtypes

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [9]:
# Convert categorical variables into dummy/indicator variables
print(df.columns)  # Get the column names
df.columns = df.columns.str.strip()  # Remove leading and trailing whitespace from column names

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')


In [11]:
df.dtypes# Get the data types of each column
#Turn categorical variables into quantitative variables in Python.
# Convert categorical variables into dummy/indicator variables

Id                  int64
SepalLengthCm     float64
SepalWidthCm      float64
PetalLengthCm     float64
PetalWidthCm      float64
Species          category
dtype: object

In [13]:
print(df.columns)# Get the column names
df.columns = df.columns.str.strip()# Remove leading and trailing whitespace from column names

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')


In [14]:
df['Species'] = df['Species'].astype('category')# Convert 'Species' to a categorical variable
df['Species_cat'] = df['Species'].cat.codes
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Species_cat
0,1,5.1,3.5,1.4,0.2,Iris-setosa,0
1,2,4.9,3.0,1.4,0.2,Iris-setosa,0
2,3,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5,5.0,3.6,1.4,0.2,Iris-setosa,0


In [15]:
iris_encoded = pd.get_dummies(df, columns=['Species'])#pd.get_dummies() helps convert categorical data into numerical format, which is essential for most ML tasks.
iris_encoded.head() 

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species_cat,Species_Iris-setosa,Species_Iris-versicolor,Species_Iris-virginica
0,1,5.1,3.5,1.4,0.2,0,True,False,False
1,2,4.9,3.0,1.4,0.2,0,True,False,False
2,3,4.7,3.2,1.3,0.2,0,True,False,False
3,4,4.6,3.1,1.5,0.2,0,True,False,False
4,5,5.0,3.6,1.4,0.2,0,True,False,False


In [16]:
# Step to add MinMaxScaler for data normalization

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

In [17]:
# Select numerical columns to normalize (excluding the 'Species' related columns)
numerical_cols = iris_encoded.select_dtypes(include=[np.number]).columns

In [18]:
# Apply MinMaxScaler to scale the data between 0 and 1
iris_encoded[numerical_cols] = scaler.fit_transform(iris_encoded[numerical_cols])

In [19]:
# Display the normalized dataset
print(iris_encoded.head())

         Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0  0.000000       0.222222      0.625000       0.067797      0.041667   
1  0.006711       0.166667      0.416667       0.067797      0.041667   
2  0.013423       0.111111      0.500000       0.050847      0.041667   
3  0.020134       0.083333      0.458333       0.084746      0.041667   
4  0.026846       0.194444      0.666667       0.067797      0.041667   

   Species_cat  Species_Iris-setosa  Species_Iris-versicolor  \
0          0.0                 True                    False   
1          0.0                 True                    False   
2          0.0                 True                    False   
3          0.0                 True                    False   
4          0.0                 True                    False   

   Species_Iris-virginica  
0                   False  
1                   False  
2                   False  
3                   False  
4                   False  
