# Chapter 2: Scale Machine Learning Data (Tensor Version)

In [1]:
# Load libraries
use strict;
use warnings;
use Data::Dump qw(dump);
use List::Util qw(sum);
use AI::MXNet qw(mx);
use sml;

In [2]:
# Calculate the minimum and maximum value of every column in a dataset.
#
# Arguments:
#   $self    - invocant; not used by this function.
#   $dataset - an AI::MXNet::NDArray, or an array ref of row array refs.
#
# Returns:
#   NDArray input - [$mins, $maxs], the min and max reductions along axis 0.
#   Array input   - [[min, max], ...] with one pair per column of the first
#                   row, or [] when the dataset has no rows.
#   Other input   - nothing (bare return).
sub dataset_minmax {
    my ($self, $dataset) = @_;

    if (ref($dataset) eq 'AI::MXNet::NDArray') {
        my $mins = $dataset->min(axis => 0);
        my $maxs = $dataset->max(axis => 0);
        return [$mins, $maxs];
    }

    if (ref($dataset) eq 'ARRAY') {
        # Nothing to scan; also avoids touching a non-existent first row.
        return [] unless @$dataset;

        my @minmax;
        for my $i (0 .. $#{ $dataset->[0] }) {
            my @column = map { $_->[$i] } @$dataset;

            # Fully qualified on purpose: only `sum` is imported from
            # List::Util at the top of the file.
            push @minmax,
                [ List::Util::min(@column), List::Util::max(@column) ];
        }
        return \@minmax;
    }

    return;
}

# The registered name must match the `sml->dataset_minmax(...)` calls made
# further down exactly (underscore, no space).
sml->add_to_class('dataset_minmax', \&dataset_minmax);

In [3]:
# Rescale a dataset column by column using (value - min) / (max - min).
#
# Arguments:
#   $self    - invocant; not used by this function.
#   $dataset - an AI::MXNet::NDArray, or an array ref of row array refs.
#   $minmax  - for NDArray input: [$mins, $maxs];
#              for array input:   [[min, max], ...] indexed by column.
#
# Returns:
#   NDArray input - the result of the tensor expression.
#   Array input   - the same $dataset reference, whose rows were modified in
#                   place. A column whose max equals its min is mapped to 0
#                   instead of dividing by zero.
#   Other input   - nothing (bare return).
sub normalize_dataset {
    my ($self, $dataset, $minmax) = @_;

    if (ref($dataset) eq 'AI::MXNet::NDArray') {
        my $mins   = $minmax->[0];
        my $ranges = $minmax->[1] - $mins;

        # NOTE(review): no zero-range guard on this path; confirm what the
        # tensor division yields for constant columns.
        return ($dataset - $mins) / $ranges;
    }

    if (ref($dataset) eq 'ARRAY') {
        for my $row (@$dataset) {
            for my $i (0 .. $#$row) {
                my $low   = $minmax->[$i][0];
                my $range = $minmax->[$i][1] - $low;

                # Constant column: every value equals the minimum.
                $row->[$i] = $range == 0 ? 0 : ($row->[$i] - $low) / $range;
            }
        }
        return $dataset;
    }

    return;
}

# Register the function under the name used by the later
# `sml->normalize_dataset(...)` calls.
sml->add_to_class(normalize_dataset => \&normalize_dataset);

In [4]:
# Calculate the arithmetic mean of every column in a dataset.
#
# Arguments:
#   $self    - invocant; not used by this function.
#   $dataset - an AI::MXNet::NDArray, or an array ref of row array refs.
#
# Returns:
#   NDArray input - the mean reduction along axis 0.
#   Array input   - an array ref with one mean per column of the first row,
#                   or [] when the dataset has no rows.
#   Other input   - nothing (bare return).
my $column_means = sub {
    my ($self, $dataset) = @_;

    if (ref($dataset) eq 'AI::MXNet::NDArray') {
        return $dataset->mean(axis => 0);
    }

    if (ref($dataset) eq 'ARRAY') {
        # No rows means no columns; also avoids touching a missing first row.
        return [] unless @$dataset;

        my $row_count = scalar @$dataset;
        my @means;
        for my $i (0 .. $#{ $dataset->[0] }) {
            push @means, sum(map { $_->[$i] } @$dataset) / $row_count;
        }
        return \@means;
    }

    return;
};

# Register the code ref under the name used by the later
# `sml->column_means(...)` calls.
sml->add_to_class(column_means => $column_means);

In [5]:
# Calculate the sample standard deviation (n - 1 denominator) of every column
# in a dataset, given the column means.
#
# Arguments:
#   $self    - invocant; not used by this function.
#   $dataset - an AI::MXNet::NDArray, or an array ref of row array refs.
#   $means   - column means in the same representation as $dataset.
#
# Returns:
#   NDArray input - a tensor of per-column deviations.
#   Array input   - an array ref with one deviation per column of the first
#                   row.
#   Other input   - nothing (bare return).
#
# Dies when an array dataset has fewer than two rows, because the sample
# deviation is undefined there.
my $column_stdevs = sub {
    my ($self, $dataset, $means) = @_;

    if (ref($dataset) eq 'AI::MXNet::NDArray') {
        # Divide the summed squared deviations by (n - 1) so this branch
        # agrees with the array branch below.
        # NOTE(review): assumes ->shape returns an array ref whose first
        # entry is the row count - confirm against AI::MXNet::NDArray.
        my $row_count = $dataset->shape->[0];
        my $variance  = ($dataset - $means)->square->sum(axis => 0)
            / ($row_count - 1);
        return $variance->sqrt;
    }

    if (ref($dataset) eq 'ARRAY') {
        my $row_count = scalar @$dataset;
        die "column_stdevs requires at least two rows\n" if $row_count < 2;

        my @stdevs;
        for my $i (0 .. $#{ $dataset->[0] }) {
            my $squared_total
                = sum(map { ($_->[$i] - $means->[$i])**2 } @$dataset);
            push @stdevs, sqrt($squared_total / ($row_count - 1));
        }
        return \@stdevs;
    }

    return;
};

# Register the code ref under the name used by the later
# `sml->column_stdevs(...)` calls.
sml->add_to_class(column_stdevs => $column_stdevs);

In [6]:
# Standardize a dataset column by column using (value - mean) / stdev.
#
# Arguments:
#   $self    - invocant; not used by this function.
#   $dataset - an AI::MXNet::NDArray, or an array ref of row array refs.
#   $means   - column means in the same representation as $dataset.
#   $stdevs  - column standard deviations in the same representation.
#
# Returns:
#   NDArray input - the result of the tensor expression.
#   Array input   - the same $dataset reference, whose rows were modified in
#                   place. A column whose deviation is 0 is mapped to 0
#                   instead of dividing by zero.
#   Other input   - nothing (bare return).
sub standardize_dataset {
    my ($self, $dataset, $means, $stdevs) = @_;

    if (ref($dataset) eq 'AI::MXNet::NDArray') {
        # NOTE(review): no zero-deviation guard on this path; confirm what
        # the tensor division yields for constant columns.
        return ($dataset - $means) / $stdevs;
    }

    if (ref($dataset) eq 'ARRAY') {
        for my $row (@$dataset) {
            for my $i (0 .. $#{$row}) {
                my $spread = $stdevs->[$i];

                # Zero deviation means the column is constant.
                $row->[$i]
                    = $spread == 0
                    ? 0
                    : ($row->[$i] - $means->[$i]) / $spread;
            }
        }
        return $dataset;
    }

    return;
}

# Register the function under the name used by the later
# `sml->standardize_dataset(...)` calls.
sml->add_to_class(standardize_dataset => \&standardize_dataset);

In [7]:
# Demo: per-column min/max of a small array-of-arrays dataset.
my $dataset = [ [ 50, 30 ], [ 20, 90 ] ];
printf("%s\n", dump($dataset));

# $dataset and $minmax are declared here and reused by the following cells.
my $minmax = sml->dataset_minmax($dataset);
printf("%s\n", dump($minmax));

[[50, 30], [20, 90]]
[[20, 50], [30, 90]]


1

In [8]:
# Demo: min-max normalization of an array dataset, printing the data before,
# the min/max pairs, and the data after.
$dataset = [ [ 50, 30 ], [ 20, 90 ] ];
print(dump($dataset));

$minmax = sml->dataset_minmax($dataset);
print("\n", dump($minmax));

# The return value is ignored; the dump below shows $dataset itself.
sml->normalize_dataset($dataset, $minmax);
print("\n", dump($dataset));

[[50, 30], [20, 90]]
[[20, 50], [30, 90]]
[[1, 0], [0, 1]]

1

In [9]:
# Demo: column means and standard deviations of a three-row array dataset.
$dataset = [ [ 50, 30 ], [ 20, 90 ], [ 30, 50 ] ];
print(dump($dataset));

# $means and $stdevs are declared here and reused by the following cells.
my $means  = sml->column_means($dataset);
my $stdevs = sml->column_stdevs($dataset, $means);

printf("Medias: %s\n", dump($means));
printf("Desviaciones estándar: %s\n", dump($stdevs));

[[50, 30], [20, 90], [30, 50]]Medias: [33.3333333333333, 56.6666666666667]
Desviaciones estándar: [15.2752523165195, 30.5505046330389]


1

In [10]:
# Demo: standardize the dataset from the previous cell, printing the means
# and deviations first and the dataset afterwards.
printf("%s\n", dump($_)) for $means, $stdevs;

# The return value is ignored; the dump below shows $dataset itself.
sml->standardize_dataset($dataset, $means, $stdevs);
printf("%s\n", dump($dataset));

[33.3333333333333, 56.6666666666667]
[15.2752523165195, 30.5505046330389]
[
  [1.09108945117996, -0.872871560943969],
  [-0.87287156094397, 1.09108945117996],
  [-0.218217890235993, -0.218217890235992],
]


1

In [11]:
# Demo: load the Pima Indians CSV, run sml->str_column_to_float over every
# column, then min-max normalize; the first row is printed after each stage.
my $filename = 'data/pima-indians-diabetes.csv';
$dataset = sml->load_csv($filename);
printf(
    "Loaded data file %s with %d rows and %d columns.\n",
    $filename, scalar(@$dataset), scalar(@{ $dataset->[0] }),
);
print("[@{$dataset->[0]}]");

# Column indices are taken from the first row.
for my $column (0 .. $#{ $dataset->[0] }) {
    sml->str_column_to_float($dataset, $column);
}
print("\n[@{$dataset->[0]}]");

$minmax = sml->dataset_minmax($dataset);
sml->normalize_dataset($dataset, $minmax);
print("\n[@{$dataset->[0]}]");

Loaded data file data/pima-indians-diabetes.csv with 768 rows and 9 columns.
[6 148 72 35 0 33.6 0.627 50 1]
[6.0 148.0 72.0 35.0 0.0 33.6 0.6 50.0 1.0]
[0.352941176470588 0.743718592964824 0.590163934426229 0.353535353535354 0 0.500745156482861 0.217391304347826 0.483333333333333 1]

1

In [12]:
# Demo: reload the Pima Indians CSV, run sml->str_column_to_float over every
# column, normalize, then standardize; the first row is dumped before and
# after the scaling steps.
$filename = 'data/pima-indians-diabetes.csv';
$dataset  = sml->load_csv($filename);
printf(
    "Loaded data file %s with %d rows and %d columns. \n",
    $filename, scalar(@$dataset), scalar(@{ $dataset->[0] }),
);

# Column indices are taken from the first row.
for my $column (0 .. $#{ $dataset->[0] }) {
    sml->str_column_to_float($dataset, $column);
}
printf("%s\n", dump($dataset->[0]));

# Normalization runs first; the statistics are computed on its result.
$minmax = sml->dataset_minmax($dataset);
sml->normalize_dataset($dataset, $minmax);

$means  = sml->column_means($dataset);
$stdevs = sml->column_stdevs($dataset, $means);
sml->standardize_dataset($dataset, $means, $stdevs);
printf("%s\n", dump($dataset->[0]));

Loaded data file data/pima-indians-diabetes.csv with 768 rows and 9 columns. 
["6.0", "148.0", "72.0", "35.0", "0.0", 33.6, 0.6, "50.0", "1.0"]
[
  0.639530492117648,
  0.847771320589669,
  0.149543298529545,
  0.906679062347249,
  -0.69243932472413,
  0.203879907267472,
  0.384829971238835,
  1.42506671959336,
  1.36500636695981,
]


1

In [13]:
# Demo: run the statistics and standardization helpers on tensor input.
# Builds an NDArray from the same three rows used in the array statistics
# demo, so each helper takes its AI::MXNet::NDArray branch.
my $mx_dataset = mx->nd->array([[50, 30], [20, 90], [30, 50]]);
# Each result is printed via ->aspdl.
print "\n", $mx_dataset->aspdl;

# Means reduced along axis 0.
my $mx_means = sml->column_means($mx_dataset);
print "Medias: \n", $mx_means->aspdl;

# Deviations are computed from the tensor means obtained above.
my $mx_stdevs = sml->column_stdevs($mx_dataset, $mx_means);
print "Desviaciones estándar: \n", $mx_stdevs->aspdl;

# Unlike the array demos, the returned value is captured and printed rather
# than the input being dumped again.
my $standardized = sml->standardize_dataset($mx_dataset, $mx_means, $mx_stdevs);
print "Standardized: \n", $standardized->aspdl;


[[50. 30.]
 [20. 90.]
 [30. 50.]]
<NDArray 3x2 @cpu(0)>
Medias: 
[33.333332 56.666668]
<NDArray 2 @cpu(0)>
Desviaciones estándar: 
[15.275252 30.550505]
<NDArray 2 @cpu(0)>
Standardized: 
[[ 1.0910895 -0.8728716]
 [-0.8728716  1.0910895]
 [-0.2182179 -0.2182179]]
<NDArray 3x2 @cpu(0)>


1