In [94]:
from tensor import Tensor, TensorShape
from math import exp
from memory import memset_zero
from utils.index import Index
from algorithm import vectorize, parallelize
from sys.info import simdwidthof, num_logical_cores
from pathlib import Path
from time import perf_counter_ns

In [2]:
alias simd_width = simdwidthof[DType.float64]()

fn relu_p(mut Z: Tensor[DType.float64]):
        @parameter
        fn par_relu(idx: Int):
            var val = Z.load(idx)
            val = val if val > 0.0 else 0.0
            Z.store(idx, val)
        parallelize[par_relu](Z.num_elements(), num_logical_cores())

fn relu_v(mut Z: Tensor[DType.float64]):
        @parameter
        fn vec_relu[simd_width: Int](idx: Int):
            var val = Z.load(idx)
            val = val if val > 0.0 else 0.0
            Z.store(idx, val)
        vectorize[vec_relu, 1](Z.num_elements())

fn relu_naive(mut Z: Tensor[DType.float64]):
        for i in range(Z.num_elements()):
            var val = Z.load(i)
            val = val if val > 0.0 else 0.0
            Z[i] = val

In [6]:
data = Tensor[DType.float64].rand(TensorShape(5,5))

In [7]:
print(data)

Tensor([[0.15127278977429223, 0.9488624771994429, 0.042604254384452735, 0.7350890711043733, 0.4654970324670612],
[0.22253954059087414, 0.5714842245677251, 0.6045694129188303, 0.6980017822078154, 0.6646856570492514],
[0.17365220998817027, 0.7514945491537615, 0.8249168959647479, 0.7530959085885771, 0.3190492933293315],
[0.53255122988184, 0.14755412846594945, 0.7926281295579826, 0.912530994870465, 0.7943811413703469],
[0.009408400669171892, 0.5100528533444851, 0.29559365769497986, 0.5599189394718442, 0.5041725145911186]], dtype=float64, shape=5x5)


In [3]:

fn get_test_data() -> Tensor[DType.float64]:
    var test = Tensor[DType.float64].rand(TensorShape(5,5))
    var iter = 6
    for i in range(iter):
        test[i] = -0.1234
    return test^

In [4]:
print("Testing relu (parallelized) ...")
test = get_test_data()
print(test)
var start = perf_counter_ns()
relu_test = relu_p(test)
var end = perf_counter_ns()
print("Time taken for parallel relu: ", end - start)

Testing relu (parallelized) ...
Tensor([[-0.1234, -0.1234, -0.1234, -0.1234, -0.1234],
[-0.1234, 0.8095666534273713, 0.511712552800469, 0.9950845483070253, 0.9666113633007903],
[0.4260508274422977, 0.6529987269106764, 0.9615331095757897, 0.8579873390871509, 0.29402614920162445],
[0.4146445788282588, 0.5148929051767301, 0.7897845320287156, 0.5442728017352926, 0.09362991190499959],
[0.43225952539313756, 0.8449274386695605, 0.7728464640854276, 0.19185895447404114, 0.7803667619751494]], dtype=float64, shape=5x5)
Time taken for parallel relu:  361159000


In [4]:
print("Testing relu (vectorized) ...")
test = get_test_data()
print(test)
var start = perf_counter_ns()
relu_test = relu_v(test)
var end = perf_counter_ns()
print("Time taken for vectorized relu: ", end - start)

Testing relu (vectorized) ...
Tensor([[-0.1234, -0.1234, -0.1234, -0.1234, -0.1234],
[-0.1234, 0.8095666534273713, 0.511712552800469, 0.9950845483070253, 0.9666113633007903],
[0.4260508274422977, 0.6529987269106764, 0.9615331095757897, 0.8579873390871509, 0.29402614920162445],
[0.4146445788282588, 0.5148929051767301, 0.7897845320287156, 0.5442728017352926, 0.09362991190499959],
[0.43225952539313756, 0.8449274386695605, 0.7728464640854276, 0.19185895447404114, 0.7803667619751494]], dtype=float64, shape=5x5)
Time taken for vectorized relu:  0


In [4]:
print("Testing relu (naive) ...")
test = get_test_data()
print(test)
var start = perf_counter_ns()
relu_test = relu_naive(test)
var end = perf_counter_ns()
print("Time taken for naive relu: ", end - start) 

Testing relu (naive) ...
Tensor([[-0.1234, -0.1234, -0.1234, -0.1234, -0.1234],
[-0.1234, 0.8095666534273713, 0.511712552800469, 0.9950845483070253, 0.9666113633007903],
[0.4260508274422977, 0.6529987269106764, 0.9615331095757897, 0.8579873390871509, 0.29402614920162445],
[0.4146445788282588, 0.5148929051767301, 0.7897845320287156, 0.5442728017352926, 0.09362991190499959],
[0.43225952539313756, 0.8449274386695605, 0.7728464640854276, 0.19185895447404114, 0.7803667619751494]], dtype=float64, shape=5x5)
Time taken for naive relu:  0


In [11]:
alias simd_width = simdwidthof[DType.float64]()

In [7]:
data = Tensor[DType.float64].rand(TensorShape(5,5))

In [8]:
print(data)

Tensor([[0.18134318080109246, 0.5791414993698616, 0.3141314421629415, 0.41198460537139914, 0.9923054645273369],
[0.16392240715441422, 0.3348481652178885, 0.07621820113073051, 0.17452387068320302, 0.03729991406033618],
[0.46741478351829246, 0.6741126849908573, 0.06670325213429425, 0.3897824960817898, 0.16539865616786326],
[0.9908042838518297, 0.8706391467036878, 0.6726526082342286, 0.5877058877560577, 0.2550143756256123],
[0.5930454446864455, 0.27172003800053696, 0.27048116584257825, 0.09593022874939665, 0.6325562987072375]], dtype=float64, shape=5x5)


In [43]:
from algorithm.reduction import sum 
from buffer import Buffer, NDBuffer, DimList

fn _sum(z: Tensor[DType.float64]) -> Float64:
        try:
            return sum(Buffer[DType.float64](z.unsafe_ptr(), z.num_elements()))
        except:
            return 0.0

In [52]:
testsum = Tensor[DType.float64].rand(TensorShape(1,3))

In [53]:
print(testsum)

Tensor([[0.15127278977429223, 0.9488624771994429, 0.042604254384452735]], dtype=float64, shape=1x3)


In [54]:
print(_sum(testsum))

1.142739521358188


In [66]:
@staticmethod
fn transpose(x: Tensor[DType.int8]) -> Tensor[DType.int8]:
    var shape = x.shape()
    var result = Tensor[DType.int8](TensorShape(shape[1], shape[0]))
    if x.num_elements() < 768:
        for i in range(shape[0]):
            for j in range(shape[1]):
                result[VariadicList(j, i)] = x[i, j]
    else:
        @parameter
        fn transpose_parallel(idx: Int):
            var i = idx // shape[1]
            var j = idx % shape[1]
            result[VariadicList(j, i)] = x[i, j]
        parallelize[transpose_parallel](shape[0] * shape[1], num_logical_cores())
    return result^

In [100]:
test_t = Tensor[DType.int8].rand(TensorShape(5,5))

In [76]:
print(test_t)

Tensor([[0, 0, 0, 0, 0],
[0, 1, 1, 0, 0],
[1, 0, 0, 1, 0],
[0, 1, 0, 1, 0],
[0, 0, 0, 0, 0]], dtype=int8, shape=5x5)


In [77]:
print(transpose(test_t))

Tensor([[0, 0, 1, 0, 0],
[0, 1, 0, 1, 0],
[0, 1, 0, 0, 0],
[0, 0, 1, 1, 0],
[0, 0, 0, 0, 0]], dtype=int8, shape=5x5)


In [128]:
from algorithm.reduction import sum, max
from buffer import Buffer, NDBuffer, DimList

@always_inline
fn _max(z: Tensor[DType.float64]) -> Float64:
    try:
        return max(
            Buffer[DType.float64](
                z.unsafe_ptr(), 
                z.num_elements()
            )
        )
    except:
        return 0.0

In [189]:

fn one_hot(Y: Tensor[DType.float64]) -> Tensor[DType.float64]:
    var num_classes = (_max(Y) + 1).cast[DType.int32]().value
    var num_samples = Y.num_elements()
    var result = Tensor[DType.float64](TensorShape(num_samples, num_classes))
    memset_zero(result.unsafe_ptr(), result.num_elements())    
    if num_samples < 768:
        for i in range(num_samples):
            result[VariadicList(i, (Y[i]).cast[DType.int32]().value)] = 1.0
    else:
        @parameter
        fn one_hot_parallel(i: Int):
            result[VariadicList(i, (Y[i]).cast[DType.int32]().value)] = 1.0
        parallelize[one_hot_parallel](num_samples, num_logical_cores())    
    return result^

In [186]:
dee = Tensor[DType.float64].rand(TensorShape(1,4))
dee[0] = 0
dee[1] = 1
dee[2] = 2
dee[3] = 3

In [190]:
print(dee)

Tensor([[0.0, 1.0, 2.0, 3.0]], dtype=float64, shape=1x4)


In [191]:
print((_max(dee)).cast[DType.int32]())

3


In [181]:
for i in range(dee.num_elements()):
    print(i)

0
1
2
3


In [192]:
print(one_hot(dee))

Tensor([[1.0, 0.0, 0.0, 0.0],
[0.0, 1.0, 0.0, 0.0],
[0.0, 0.0, 1.0, 0.0],
[0.0, 0.0, 0.0, 1.0]], dtype=float64, shape=4x4)
