# Exercise 1: NumPy array and Indexing/Slicing

• 1.1) In this exercise, we will use the iris dataset. Load
the "iris.csv" using the appropriate method for this file
type (use the new functions from the package).

In [1]:
from si.io.csv_file import read_csv

# Load the iris CSV with the package's reader. features=True and label=True are
# passed so the returned object exposes feature names and a label
# (presumably header row + last column as label — confirm against read_csv).
iris = read_csv("../datasets/iris/iris.csv", features=True, label=True)

# Print an overview of the loaded dataset: shape, feature names, label
# information, summary statistics and the raw feature matrix.
print(f"Shape: {iris.shape()}\n")
print(f"Features: {iris.features}\n")
print(f"Has Label?: {iris.has_label()}\n")
print(f"Label/Classes: {iris.get_classes()}\n")
print(f"Dataset Summary:\n {iris.summary()}\n")
print(f"Array:\n {iris.X}")

Shape: (150, 4)

Features: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')

Has Label?: True

Label/Classes: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']

Dataset Summary:
         sepal_length  sepal_width  petal_length  petal_width
mean        5.843333     3.054000      3.758667     1.198667
median      5.800000     3.000000      4.350000     1.300000
min         4.300000     2.000000      1.000000     0.100000
max         7.900000     4.400000      6.900000     2.500000
var         0.681122     0.186751      3.092425     0.578532

Array:
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7

• 1.2) Select the penultimate independent variable.
What is the dimension of the resulting array?

In [2]:
import numpy as np

# Column index -2 selects the second-to-last feature for every sample.
# Indexing with a scalar (instead of a slice) drops that axis, so the result is 1-D.
penult_variable = iris.X[:, -2] # Select penultimate independent variable.

# Report the selected values together with their dimensionality and shape.
print(f"Penultimate independent variable:\n {penult_variable} \n")
print(f"Number of dimensions of the array (via numpy.ndarray.ndim): {penult_variable.ndim}")
print(f"Shape of the array (via numpy.ndarray.shape): {penult_variable.shape}")

Penultimate independent variable:
 [1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 1.5 1.6 1.4 1.1 1.2 1.5 1.3 1.4
 1.7 1.5 1.7 1.5 1.  1.7 1.9 1.6 1.6 1.5 1.4 1.6 1.6 1.5 1.5 1.4 1.5 1.2
 1.3 1.5 1.3 1.5 1.3 1.3 1.3 1.6 1.9 1.4 1.6 1.4 1.5 1.4 4.7 4.5 4.9 4.
 4.6 4.5 4.7 3.3 4.6 3.9 3.5 4.2 4.  4.7 3.6 4.4 4.5 4.1 4.5 3.9 4.8 4.
 4.9 4.7 4.3 4.4 4.8 5.  4.5 3.5 3.8 3.7 3.9 5.1 4.5 4.5 4.7 4.4 4.1 4.
 4.4 4.6 4.  3.3 4.2 4.2 4.2 4.3 3.  4.1 6.  5.1 5.9 5.6 5.8 6.6 4.5 6.3
 5.8 6.1 5.1 5.3 5.5 5.  5.1 5.3 5.5 6.7 6.9 5.  5.7 4.9 6.7 4.9 5.7 6.
 4.8 4.9 5.6 5.8 6.1 6.4 5.6 5.1 5.6 6.1 5.6 5.5 4.8 5.4 5.6 5.1 5.1 5.9
 5.7 5.2 5.  5.2 5.4 5.1] 

Number of dimensions of the array (via numpy.ndarray.ndim): 1
Shape of the array (via numpy.ndarray.shape): (150,)


In [3]:
# Using the functions present within our package:
from si.data.dataset import Dataset

penult_variable = iris.X[:, -2] # Select penultimate independent variable (1-D slice).

# Wrap the selected column in a Dataset so the package's own .shape() method can be used.
# reshape(-1, 1) turns the 1-D slice into a single-column 2-D array before it is passed as X.
penult_variable_dataset = Dataset(X=penult_variable.reshape(-1, 1), y=None, features=[iris.features[-2]], label=None)
print(f"Shape of the array: {penult_variable_dataset.shape()}")
# NOTE: ndim is read from the original 1-D slice, not from the reshaped array stored in
# the Dataset, which is why it can differ from the length of the shape printed above.
print(f"Number of dimensions of the array (via numpy.ndarray.ndim): {penult_variable.ndim}")

Shape of the array: (150, 1)
Number of dimensions of the array (via numpy.ndarray.ndim): 1


• 1.3) Select the last 10 samples from the iris dataset.
What is the mean of the last 10 samples for each
independent variable/feature?

In [4]:
# Keep only the final ten rows (samples); every feature column is retained.
last_10_samples = iris.X[-10:]
# Average down the rows (axis 0) to obtain one mean per feature.
mean_last_10 = last_10_samples.mean(0)
print(f"Mean of the last 10 samples for each independent variable: {mean_last_10}")

Mean of the last 10 samples for each independent variable: [6.45 3.03 5.33 2.17]


In [5]:
# Using the functions present within our package:
from si.data.dataset import Dataset

last_10_samples_X = iris.X[-10:, :] # Select last 10 rows (samples) from the feature matrix.
last_10_samples_y = iris.y[-10:]    # Select the labels of those same 10 samples.

# Wrap the last 10 samples in a Dataset so the package's own .get_mean() method can be used.
last_10_dataset = Dataset(X=last_10_samples_X, y=last_10_samples_y, features=iris.features, label=iris.label)
print(f"Mean of the last 10 samples for each independent variable: {last_10_dataset.get_mean()}")

Mean of the last 10 samples for each independent variable: [6.45 3.03 5.33 2.17]


• 1.4) Select all samples from the dataset with values
less than or equal to 6 for all independent
variables/features. How many samples do you obtain?

In [6]:
import numpy as np

# Boolean mask with one entry per sample: True only when every feature value
# in that row is less than or equal to 6.
condition = (iris.X <= 6).all(axis=1)
# Boolean indexing keeps exactly the rows flagged True in the mask.
filtered_samples = iris.X[condition]
print(f"Samples with values less than or equal to 6 in the dataset: {filtered_samples.shape[0]}")

# The mask can also be written inline: iris.X[np.all(iris.X <= 6, axis=1)]

Samples with values less than or equal to 6 in the dataset: 89


• 1.5) Select all samples with a class/label different
from 'Iris-setosa'. How many samples do you obtain?

In [7]:
# Boolean mask over the labels: True for every sample whose class is not 'Iris-setosa'.
condition = ~(iris.y == "Iris-setosa")
# Apply the label mask to the feature matrix to keep the matching rows.
filtered_samples = iris.X[condition]
print(f"Number of samples with a class/label different from 'Iris-setosa': {filtered_samples.shape[0]}")

# The mask can also be written inline: iris.X[iris.y != 'Iris-setosa']

Number of samples with a class/label different from 'Iris-setosa': 100


# Exercise 2: Dataset methods for missing values and row removal
Usage examples of dropna(), fillna() and remove_by_index(). 

These methods are defined in the Dataset class, located at .\src\si\data\dataset.py.

### dropna() usage example

In [8]:
import numpy as np
from si.data.dataset import Dataset

# Create a small example dataset (4 samples x 3 features) in which two rows
# each contain one NaN value; SampleID is used as the label column.
dataset = Dataset(X = np.array([[55, 178, 72], [np.nan, 172, 80], [31, np.nan, 92], [26, 176, 75]]),
                  y = np.array([1, 2, 3, 4]), 
                  features = ['Age', 'Height (cm)', 'Weight (kg)'], 
                  label = 'SampleID')  

# Display the dataset as a table (last expression of the cell).
dataset.to_dataframe()

Unnamed: 0,Age,Height (cm),Weight (kg),SampleID
0,55.0,178.0,72.0,1
1,,172.0,80.0,2
2,31.0,,92.0,3
3,26.0,176.0,75.0,4


In [9]:
# Call .dropna() and display the result: the rows that contained a NaN value
# are no longer present in the resulting table.
dataset.dropna().to_dataframe()

Unnamed: 0,Age,Height (cm),Weight (kg),SampleID
0,55.0,178.0,72.0,1
1,26.0,176.0,75.0,4


### fillna() usage example

In [10]:
import numpy as np
from si.data.dataset import Dataset

# Create a small example dataset (4 samples x 3 features) in which two rows
# each contain one NaN value; SampleID is used as the label column.
dataset = Dataset(X = np.array([[55, 178, 72], [np.nan, 172, 80], [31, np.nan, 92], [26, 176, 75]]),
                  y = np.array([1, 2, 3, 4]), 
                  features = ['Age', 'Height (cm)', 'Weight (kg)'], 
                  label = 'SampleID')  

# Display the dataset as a table (last expression of the cell).
dataset.to_dataframe()

Unnamed: 0,Age,Height (cm),Weight (kg),SampleID
0,55.0,178.0,72.0,1
1,,172.0,80.0,2
2,31.0,,92.0,3
3,26.0,176.0,75.0,4


In [11]:
# Apply .fillna() with no arguments, i.e. with the value parameter left at its
# default setting (value='mean'), and display the result.
dataset.fillna().to_dataframe()

Unnamed: 0,Age,Height (cm),Weight (kg),SampleID
0,55.0,178.0,72.0,1
1,37.333333,172.0,80.0,2
2,31.0,175.333333,92.0,3
3,26.0,176.0,75.0,4


In [12]:
# Apply .fillna() with the value parameter explicitly set to 'mean'; the result
# should be the same as in the previous cell, which used the default.

# Reset the dataset to its original state, with the NaN values still present.
dataset = Dataset(X = np.array([[55, 178, 72], [np.nan, 172, 80], [31, np.nan, 92], [26, 176, 75]]),
                  y = np.array([1, 2, 3, 4]), 
                  features = ['Age', 'Height (cm)', 'Weight (kg)'], 
                  label = 'SampleID')  

dataset.fillna(value='mean').to_dataframe()

Unnamed: 0,Age,Height (cm),Weight (kg),SampleID
0,55.0,178.0,72.0,1
1,37.333333,172.0,80.0,2
2,31.0,175.333333,92.0,3
3,26.0,176.0,75.0,4


In [13]:
# Apply .fillna() with the value parameter explicitly set to 'median'.

# Reset the dataset to its original state, with the NaN values still present.
dataset = Dataset(X = np.array([[55, 178, 72], [np.nan, 172, 80], [31, np.nan, 92], [26, 176, 75]]),
                  y = np.array([1, 2, 3, 4]), 
                  features = ['Age', 'Height (cm)', 'Weight (kg)'], 
                  label = 'SampleID')  

dataset.fillna(value='median').to_dataframe()

Unnamed: 0,Age,Height (cm),Weight (kg),SampleID
0,55.0,178.0,72.0,1
1,31.0,172.0,80.0,2
2,31.0,176.0,92.0,3
3,26.0,176.0,75.0,4


In [14]:
# Apply .fillna() with the value parameter set to a fixed float, which is used
# in place of every NaN.

# Reset the dataset to its original state, with the NaN values still present.
dataset = Dataset(X = np.array([[55, 178, 72], [np.nan, 172, 80], [31, np.nan, 92], [26, 176, 75]]),
                  y = np.array([1, 2, 3, 4]), 
                  features = ['Age', 'Height (cm)', 'Weight (kg)'], 
                  label = 'SampleID')    

dataset.fillna(value=50.5).to_dataframe()

Unnamed: 0,Age,Height (cm),Weight (kg),SampleID
0,55.0,178.0,72.0,1
1,50.5,172.0,80.0,2
2,31.0,50.5,92.0,3
3,26.0,176.0,75.0,4


In [15]:
# Apply .fillna() with the value parameter set to a fixed integer.

# Reset the dataset to its original state, with the NaN values still present.
dataset = Dataset(X = np.array([[55, 178, 72], [np.nan, 172, 80], [31, np.nan, 92], [26, 176, 75]]),
                  y = np.array([1, 2, 3, 4]), 
                  features = ['Age', 'Height (cm)', 'Weight (kg)'], 
                  label = 'SampleID')    

dataset.fillna(value=50).to_dataframe() # The integer fill value is expected to appear as a float (50.0) in the table.

Unnamed: 0,Age,Height (cm),Weight (kg),SampleID
0,55.0,178.0,72.0,1
1,50.0,172.0,80.0,2
2,31.0,50.0,92.0,3
3,26.0,176.0,75.0,4


### remove_by_index() usage example

In [16]:
import numpy as np
from si.data.dataset import Dataset

# Create a small example dataset (4 samples x 3 features) in which two rows
# each contain one NaN value; SampleID is used as the label column.
dataset = Dataset(X = np.array([[55, 178, 72], [np.nan, 172, 80], [31, np.nan, 92], [26, 176, 75]]),
                  y = np.array([1, 2, 3, 4]), 
                  features = ['Age', 'Height (cm)', 'Weight (kg)'], 
                  label = 'SampleID')  

# Display the dataset as a table (last expression of the cell).
dataset.to_dataframe()

Unnamed: 0,Age,Height (cm),Weight (kg),SampleID
0,55.0,178.0,72.0,1
1,,172.0,80.0,2
2,31.0,,92.0,3
3,26.0,176.0,75.0,4


In [17]:
# Remove the sample at index 0 (the row whose SampleID is 1) and display the result.
# The remaining rows are expected to shift up, so the former index 1 is shown as index 0, and so on.
dataset.remove_by_index(0).to_dataframe()

Unnamed: 0,Age,Height (cm),Weight (kg),SampleID
0,,172.0,80.0,2
1,31.0,,92.0,3
2,26.0,176.0,75.0,4


# Exercise 3: Implementing SelectPercentile

3.3) Test the SelectPercentile class in a Jupyter notebook using the "iris.csv" dataset (classification).

The SelectPercentile class is defined in the file: .\src\si\feature_selection\select_percentile.py

In [18]:
import numpy as np
import os

from si.feature_selection.select_percentile import SelectPercentile
from si.io.csv_file import read_csv
from si.statistics.f_classification import f_classification

# Load the iris CSV again and bind it to the name `dataset` for the SelectPercentile test.
dataset = read_csv("../datasets/iris/iris.csv", features=True, label=True)

In [19]:
# Create a SelectPercentile instance that keeps 50% of the features, scored with f_classification.
select_percentile = SelectPercentile(score_func=f_classification, percentile=50)

# Fit and transform the same `dataset` object loaded for this exercise, instead of
# fitting on the `iris` variable left over from Exercise 1; this keeps the cell
# correct even when the earlier cells were not run or `iris` was changed.
select_percentile.fit(dataset)
transformed_dataset = select_percentile.transform(dataset)

# Display the features present in the dataset before the transformation.
print(f"Features before SelectPercentile transformation: {dataset.features} \n")

# Display the selected features, a sample of the transformed data, and the feature counts.
print(f"Selected features after SelectPercentile transformation: {transformed_dataset.features} \n")
print(f"First five rows of the transformed data:\n{transformed_dataset.X[:5]} \n")
print(f"Number of features before transformation: {len(dataset.features)}")
print(f"Number of features after transformation: {len(transformed_dataset.features)}")

Features before SelectPercentile transformation: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object') 

Selected features after SelectPercentile transformation: ['petal_width', 'petal_length'] 

First five rows of the transformed data:
[[0.2 1.4]
 [0.2 1.4]
 [0.2 1.3]
 [0.2 1.5]
 [0.2 1.4]] 

Number of features before transformation: 4
Number of features after transformation: 2
