In [1]:
import numpy as np
import pandas as pd
import sklearn 
import os 

In [2]:
# Load the Iris data file. It has no header row (header=None), so the column
# names are supplied explicitly.

col_names = ["sepal length cm", "sepal width cm", "petal length cm", "petal width cm", "class"]

# The location can be overridden with the IRIS_DATA_PATH environment variable so
# the notebook is not tied to a single machine; the original path stays the default.
data_path = os.environ.get("IRIS_DATA_PATH", '/home/zhan/Downloads/iris/bezdekIris.data')

df = pd.read_csv(data_path, header=None, names=col_names)
df.head()

Unnamed: 0,sepal length cm,sepal width cm,petal length cm,petal width cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
"""
Strategy:
1. One-hot encode the class names.
2. Check for null values and for invalid (negative) sepal/petal lengths and widths.
3. Train/test split.
4. Impute missing values, if any.
5. Check for outliers and cap them at the IQR bounds.
"""

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,sepal length cm,sepal width cm,petal length cm,petal width cm
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [4]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sepal length cm  150 non-null    float64
 1   sepal width cm   150 non-null    float64
 2   petal length cm  150 non-null    float64
 3   petal width cm   150 non-null    float64
 4   class            150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
# Number of missing values in each column.
df.isnull().sum()

sepal length cm    0
sepal width cm     0
petal length cm    0
petal width cm     0
class              0
dtype: int64

In [6]:
# List the distinct labels found in the "class" column.
print(df["class"].unique())

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [7]:
# One-hot encode the class label into two indicator columns,
# Species_Iris-versicolor and Species_Iris-virginica. With drop_first=True,
# Iris-setosa is represented by 0 in both columns.
#
# The guard keeps this cell safe to re-run: once 'class' has been replaced by
# the indicator columns, encoding a second time would raise a KeyError.
if 'class' in df.columns:
    # dtype=float emits the indicators as floats directly, so no separate cast is needed.
    df = pd.get_dummies(df, columns=['class'], prefix='Species', drop_first=True, dtype=float)

# Names of the encoded target columns.
species_cols = [col for col in df.columns if col.startswith('Species_')]

In [8]:
# Show the first rows of df as it stands at this point in the notebook.
df.head()

Unnamed: 0,sepal length cm,sepal width cm,petal length cm,petal width cm,Species_Iris-versicolor,Species_Iris-virginica
0,5.1,3.5,1.4,0.2,0.0,0.0
1,4.9,3.0,1.4,0.2,0.0,0.0
2,4.7,3.2,1.3,0.2,0.0,0.0
3,4.6,3.1,1.5,0.2,0.0,0.0
4,5.0,3.6,1.4,0.2,0.0,0.0


In [9]:
# Sanity check: a length or width can never be negative. For each measurement
# column, print how many rows violate that; all zeros means nothing needs fixing.

measurement_cols = ["sepal length cm", "sepal width cm", "petal length cm", "petal width cm"]

for name in measurement_cols:
    filter_mask = df[name] < 0
    print(filter_mask.sum())

0
0
0
0


In [10]:
# Nothing invalid was found, so separate features from targets and hold out a test set.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The encoded target columns all carry the 'Species_' prefix.
species_cols = df.columns[df.columns.str.startswith('Species_')].tolist()

Y = df[species_cols]
X = df.drop(columns=species_cols)

# 80/20 split; the fixed random_state keeps the partition reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, test_size=0.2
)

In [11]:
# Cap outliers at the IQR bounds (1.5 * IQR beyond the first and third quartiles).
# The bounds are computed from the training split only and then applied to both
# splits, so the test data never influences the preprocessing.
q1 = X_train.quantile(0.25)
q3 = X_train.quantile(0.75)
iqr = q3 - q1

upper_bound = q3 + (1.5 * iqr)
lower_bound = q1 - (1.5 * iqr)

# clip() returns new frames instead of assigning into the objects handed back by
# train_test_split, which avoids pandas' SettingWithCopyWarning. axis=1 aligns the
# per-column bounds (Series indexed by column name) with the frame's columns.
X_train = X_train.clip(lower=lower_bound, upper=upper_bound, axis=1)
X_test = X_test.clip(lower=lower_bound, upper=upper_bound, axis=1)

In [12]:
# Print the dtype of every column in the test features.
print(X_test.dtypes)

sepal length cm    float64
sepal width cm     float64
petal length cm    float64
petal width cm     float64
dtype: object


In [13]:
# Standardize the feature columns.
from sklearn.preprocessing import StandardScaler

# Fit on the training split only, so the scaling statistics come from training data.
scaler = StandardScaler().fit(X_train)

# Transform both splits with that single fitted scaler; the test split is
# never used for fitting.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [15]:
# Rebuild DataFrames from the scaled values, re-attaching the column names and
# the row index of the frames they were computed from.
X_train_scaled_df = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

In [18]:
# Write the processed splits to CSV files using relative paths. The row index
# is left out of every file (index=False).
exports = {
    "iris_X_train_processed.csv": X_train_scaled_df,
    "iris_X_test_processed.csv": X_test_scaled_df,
    "iris_y_train.csv": Y_train,
    "iris_y_test.csv": Y_test,
}
for filename, frame in exports.items():
    frame.to_csv(filename, index=False)