In [2]:
from pprint import pprint
from sklearn.datasets import load_iris
import polars as pl

In [4]:
# Load the Iris dataset bundled with scikit-learn. The returned object is used
# below through `.data` (the 4-column feature matrix) and `.target` (the
# integer class label of each sample).
iris = load_iris()

In [5]:
# Assemble a Polars DataFrame: one column per Iris measurement (columns 0-3 of
# `iris.data`, in that order) plus the integer class label from `iris.target`.
measurement_names = ("sepal_length", "sepal_width", "petal_length", "petal_width")
frame_columns = {
    name: iris.data[:, position] for position, name in enumerate(measurement_names)
}
frame_columns["species"] = iris.target
df = pl.DataFrame(frame_columns)

In [6]:
# Preview the first two rows to confirm the columns were assembled as expected
df.head(2)

sepal_length,sepal_width,petal_length,petal_width,species
f64,f64,f64,f64,i64
5.1,3.5,1.4,0.2,0
4.9,3.0,1.4,0.2,0


In [7]:
# Inspect the column names and their dtypes
df.schema

Schema([('sepal_length', Float64),
        ('sepal_width', Float64),
        ('petal_length', Float64),
        ('petal_width', Float64),
        ('species', Int64)])

In [8]:
# Summary statistics per column: count, null count, mean, std, min, quartiles, max
df.describe()

statistic,sepal_length,sepal_width,petal_length,petal_width,species
str,f64,f64,f64,f64,f64
"""count""",150.0,150.0,150.0,150.0,150.0
"""null_count""",0.0,0.0,0.0,0.0,0.0
"""mean""",5.843333,3.057333,3.758,1.199333,1.0
"""std""",0.828066,0.435866,1.765298,0.762238,0.819232
"""min""",4.3,2.0,1.0,0.1,0.0
"""25%""",5.1,2.8,1.6,0.3,0.0
"""50%""",5.8,3.0,4.4,1.3,1.0
"""75%""",6.4,3.3,5.1,1.8,2.0
"""max""",7.9,4.4,6.9,2.5,2.0


In [9]:
# Count the null values in each column (every count is 0 for this dataset)
df.null_count()

sepal_length,sepal_width,petal_length,petal_width,species
u32,u32,u32,u32,u32
0,0,0,0,0


In [10]:
# Fill missing feature values with the mean of their own column.
# Only the four measurement columns are imputed: `species` is a categorical
# class label, so replacing a missing label with the "mean class" would
# fabricate a target value. The dataset currently has no nulls, so this step
# leaves the data unchanged; it is kept as a safeguard.
df_filled = df.with_columns(
    pl.col(
        ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    ).fill_null(strategy="mean")
)

In [11]:
# Preview the first two rows of the frame after null filling
df_filled.head(2)

sepal_length,sepal_width,petal_length,petal_width,species
f64,f64,f64,f64,i64
5.1,3.5,1.4,0.2,0
4.9,3.0,1.4,0.2,0


In [13]:
# Standardize the feature columns with the z-score: z = (x - mean) / std.
# Each feature column has its own mean subtracted and is then divided by its
# own standard deviation; `species` is the class label and is left untouched.
# Read from `df_filled` (not `df`) so the mean-imputation step is not bypassed.
# NOTE(review): mean/std are computed over the full dataset, before the
# train/test split, so test rows influence the scaling -- consider computing
# the statistics on the training split only.
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
df_normalized = df_filled.with_columns([
    # The result keeps the name of the left-most column, so each feature is
    # replaced in place by its standardized version.
    (pl.col(name) - pl.col(name).mean()) / pl.col(name).std()
    for name in feature_columns
])

df_normalized.head(3)

sepal_length,sepal_width,petal_length,petal_width,species
f64,f64,f64,f64,i64
-0.897674,1.015602,-1.335752,-1.311052,0
-1.1392,-0.131539,-1.335752,-1.311052,0
-1.380727,0.327318,-1.392399,-1.311052,0


In [15]:
# Import the helper used to divide the data into training and testing sets
from sklearn.model_selection import train_test_split

# Separate the normalized frame into a 2-D feature matrix `x` (one column per
# measurement) and a flat 1-D label vector `y` taken from `species`.
feature_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
label_name = "species"

x = df_normalized.select(feature_names).to_numpy()
# Selecting a single column still yields a 2-D array; flatten it to 1-D.
y = df_normalized.select(label_name).to_numpy().flatten()

In [16]:
# Hold out a fixed fraction of the samples for testing; the seed makes the
# shuffle (and therefore the split) reproducible across runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [17]:
# Let's build a classifier for the species classification problem
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier; the number of neighbours consulted per
# prediction is kept in a named constant so it is easy to tune.
N_NEIGHBORS = 3
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

# Fit the classifier on the training split
knn.fit(X_train, y_train)

In [18]:
# Let's evaluate the accuracy of our model
from sklearn.metrics import accuracy_score, classification_report

# Predict class labels for the held-out test features
y_pred = knn.predict(X_test)

# Accuracy: share of test samples whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [22]:
# Show a more detailed breakdown: per-class precision, recall, F1-score and
# support, followed by the overall accuracy and the macro/weighted averages
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

