### Imports

In [None]:
from si.io.csv_file import read_csv
from si.model_selection.split import stratified_train_test_split


from si.io.csv_file import read_csv              
from si.model_selection.split import train_test_split
from si.feature_selection.select_percentile import SelectPercentile
from si.statistics.f_classification import f_classification
from si.models.knn_classifier import KNNClassifier
from si.metrics.accuracy import accuracy

from si.io.csv_file import read_csv
from si.decomposition.pca import PCA

import numpy as np
import pandas as pd
from si.data.dataset import Dataset

### EX 1.1

In [13]:
dataset = read_csv("../datasets/iris/iris.csv", sep = ",", features = True, label = True )

### EX 1.2

In [10]:
penul_var = dataset.X[:,-2]

print(penul_var.shape)

(150,)


### EX 1.3

In [20]:
last_10_sample = dataset.X[-10:]

print(last_10_sample)

np.nanmean(last_10_sample, axis=0)


[[6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]]


array([6.45, 3.03, 5.33, 2.17])

### EX 1.4

In [66]:
less_equal_6 = np.all(dataset.X <= 6, axis=1) 

print(np.count_nonzero(less_equal_6==True))




89


### EX 1.5

In [None]:
mask = dataset.y !='Iris-setosa'

print(np.count_nonzero(dataset.y[mask]))

100


### EX 2.1


In [7]:

# Create dataset with NaN values
dataset = Dataset(
    X=np.array([[1.0, 2.0], [np.nan, 3.0], [4.0, 5.0]]),
    y=np.array([0, 1, 0]),
    features=["feat1", "feat2"],
    label="y"
)

print("Before dropna:")
print("Shape:", dataset.X.shape)
print("X:\n", dataset.X)

dataset.dropna()

print("\nAfter dropna:")
print("Shape:", dataset.X.shape)
print("X:\n", dataset.X)

Before dropna:
Shape: (3, 2)
X:
 [[ 1.  2.]
 [nan  3.]
 [ 4.  5.]]

After dropna:
Shape: (2, 2)
X:
 [[1. 2.]
 [4. 5.]]


### EX 2.2


In [6]:
# Create dataset with NaN values
dataset = Dataset(
    X=np.array([[1.0, 2.0], [np.nan, 3.0], [4.0, 5.0]]),
    y=np.array([0, 1, 0]),
    features=["feat1", "feat2"],
    label="y"
)

print("Before fillna('mean'):")
print("X:\n", dataset.X)

dataset.fillna("mean")

print("\nAfter fillna('mean'):")
print("X:\n", dataset.X)

Before fillna('mean'):
X:
 [[ 1.  2.]
 [nan  3.]
 [ 4.  5.]]

After fillna('mean'):
X:
 [[1.  2. ]
 [2.5 3. ]
 [4.  5. ]]


### EX 2.3


In [8]:

# Create dataset
dataset = Dataset(
    X=np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]),
    y=np.array([0, 1, 0]),
    features=["feat1", "feat2"],
    label="y"
)

print("Before remove_by_index(1):")
print("Shape:", dataset.X.shape)
print("y:", dataset.y)

dataset.remove_by_index(1)

print("\nAfter remove_by_index(1):")
print("Shape:", dataset.X.shape)
print("y:", dataset.y)

Before remove_by_index(1):
Shape: (3, 2)
y: [0 1 0]

After remove_by_index(1):
Shape: (2, 2)
y: [0 0]


### EX 3.3 - Select Percentile

In [19]:
# 1) Load iris.csv into a Dataset
dataset = read_csv("../datasets/iris/iris.csv", sep = ",", features = True, label = True )

# 2) Stratified train/test split
train_ds, test_ds = train_test_split(dataset, test_size=0.3, random_state=42)

# 3) Fit SelectPercentile on the training set
selector = SelectPercentile(score_func=f_classification, percentile=50)
selector.fit(train_ds)

# 4) Transform X and build new Dataset objects
X_train_sel = selector.transform(train_ds)   # np.ndarray with selected features
X_test_sel = selector.transform(test_ds)

train_reduced = Dataset(X=X_train_sel, y=train_ds.y, features=None, label=train_ds.label)
test_reduced = Dataset(X=X_test_sel, y=test_ds.y, features=None, label=test_ds.label)

# 5) Train KNN on reduced dataset
knn = KNNClassifier(k=5)
knn.fit(train_reduced)

# 6) Evaluate
y_pred = knn.predict(test_reduced)
score = accuracy(test_reduced.y, y_pred)
print("Accuracy with SelectPercentile + KNN:", score)

Accuracy with SelectPercentile + KNN: 1.0


### EX 5.2 - PCA

In [21]:
# 1) Load iris.csv as Dataset
dataset = read_csv("../datasets/iris/iris.csv", sep = ",", features = True, label = True )

# 2) Fit PCA with 2 components
pca = PCA(n_components=2)
pca.fit(dataset)

# 3) Transform dataset
X_reduced = pca.transform(dataset)

print("Original shape:", dataset.X.shape)
print("Reduced shape:", X_reduced.shape)
print("Explained variance (2 comps):", pca.explained_variance)


Original shape: (150, 4)
Reduced shape: (150, 2)
Explained variance (2 comps): [0.92461621 0.05301557]


### EX 6.2 - Stratified Train Test Split

In [15]:
# 1. Load the iris.csv (adapt path if needed)
# Example uses standard iris CSV format with columns: sepal.length, sepal.width, petal.length, petal.width, class
dataset = read_csv("../datasets/iris/iris.csv", sep = ",", features = True, label = True )  

# 2. Separate features and labels
# Adjust column names to match your CSV format
X = df.drop("class", axis=1).to_numpy()
y = df["class"].to_numpy()
features = df.drop("class", axis=1).columns.tolist()
label = "class"

# 3. Wrap in your Dataset object
iris_dataset = Dataset(X, y, features, label)

# 4. Split using your function
train, test = stratified_train_test_split(iris_dataset, test_size=0.2, random_state=42)

# 5. Print class distribution to check stratification
train_classes, train_counts = np.unique(train.y, return_counts=True)
test_classes, test_counts = np.unique(test.y, return_counts=True)

print("Train set class distribution:")
for cls, count in zip(train_classes, train_counts):
    print(f"{cls}: {count}")
    
print("\nTest set class distribution:")
for cls, count in zip(test_classes, test_counts):
    print(f"{cls}: {count}")

print("\nTrain size:", len(train.y))
print("Test size:", len(test.y))


NameError: name 'df' is not defined

In [None]:
from si.io.csv_file import read_csv
from si.model_selection.split import train_test_split
from si.models.random_forest_classifier import RandomForestClassifier

iris = read_csv("../datasets/iris/iris.csv", sep=",", features=True, label=True)
train_ds, test_ds = train_test_split(iris, test_size=0.2, random_state=42)

rf = RandomForestClassifier(
    nestimators=50,
    maxfeatures=None,
    minsamplesplit=2,
    maxdepth=10,
    mode="gini",
    seed=42
)

rf.fit(train_ds)
score = rf.score(test_ds)
print("Accuracy:", score)



TypeError: Can't instantiate abstract class RandomForestClassifier without an implementation for abstract methods '_fit', '_predict', '_score'