# Part 1: Introduction to Tensors

![](./images/image%20copy%2014.png)

In [1]:
import numpy as np

class Tensor(object):
    """
    Minimal tensor type: a thin shell over a NumPy array, showing how a
    tiny NN framework might define its own tensor object.
    """

    def __init__(self, data):
        # np.array converts (and copies) whatever was passed in: list, tuple, ndarray, ...
        self.data = np.array(data)

    def __add__(self, other):
        """
        Implement ``self + other`` for two Tensors.

        The wrapped arrays are added with NumPy's ``+`` and the sum is
        wrapped in a brand-new Tensor; neither operand is modified.
        """
        summed = self.data + other.data
        return Tensor(summed)

    def __repr__(self):
        """
        Debug/interactive representation: the NumPy ``repr`` of the
        wrapped array, e.g. ``array([1, 2, 3])``.
        """
        return repr(self.data)

    def __str__(self):
        """
        Text used by ``print``: the NumPy ``str`` of the wrapped array,
        e.g. ``[1 2 3]``.
        """
        return str(self.data)

# Build a Tensor from a plain Python list; __init__ converts it with np.array.
x = Tensor([1, 2, 3, 4, 5])
print(x)  # print() goes through Tensor.__str__, giving "[1 2 3 4 5]"

# '+' dispatches to Tensor.__add__, which returns a new Tensor holding the
# elementwise sum; x itself is left unchanged.
y = x + x
print(y)  # "[ 2  4  6  8 10]"


[1 2 3 4 5]
[ 2  4  6  8 10]


# Part 2: Introduction to Autograd

![](./images/image%20copy%2013.png)

In [2]:
import numpy as np

class Tensor(object):
    """
    Tensor with a first, deliberately simple autograd.

    Only addition is recorded, and ``backward`` simply hands the incoming
    gradient to the two tensors that were added.
    """

    def __init__(self, data, creators=None, creation_op=None):
        """
        Parameters:
        -----------
        data : array-like
            Values to store; converted with ``np.array``.
        creators : list of Tensors or None
            The tensors this one was computed from (its parents in the
            computation graph), or None for a tensor created directly.
        creation_op : str or None
            Name of the operation that produced this tensor (e.g. "add").

        Attributes:
        -----------
        self.data : ndarray
            The stored values.
        self.creators : list of Tensors or None
            As passed in.
        self.creation_op : str or None
            As passed in.
        self.grad : None or Tensor
            Whatever was last passed to ``backward``; None until then.
        """
        self.grad = None   # filled in by backward()
        self.creators = creators
        self.creation_op = creation_op
        self.data = np.array(data)

    def backward(self, grad):
        """
        Record ``grad`` on this tensor and forward it to the parents.

        Parameters:
        -----------
        grad : Tensor
            Gradient arriving from the next tensor downstream.

        Note that the gradient is *assigned*, not accumulated: each call
        replaces whatever ``self.grad`` held before.
        """
        self.grad = grad

        # Nothing further to do unless this tensor came out of an addition.
        if self.creation_op != "add":
            return

        # For z = x + y both operands receive the incoming gradient as-is.
        first_parent = self.creators[0]
        first_parent.backward(grad)
        second_parent = self.creators[1]
        second_parent.backward(grad)

    def __add__(self, other):
        """
        Implement ``self + other``.

        The result remembers both operands in ``creators`` and is tagged
        with ``creation_op="add"`` so ``backward`` knows how to route
        gradients later.
        """
        total = self.data + other.data
        return Tensor(total, creators=[self, other], creation_op="add")

    def __repr__(self):
        # Debug form: NumPy's repr of the wrapped array.
        return repr(self.data)

    def __str__(self):
        # Print form: NumPy's str of the wrapped array.
        return str(self.data)


# Example usage: two leaf tensors (created directly, so creators is None).
x = Tensor([1, 2, 3, 4, 5])
y = Tensor([2, 2, 2, 2, 2])

# z is produced by Tensor.__add__, so z.creators == [x, y] and
# z.creation_op == "add".
z = x + y

# Start backprop at z by supplying the gradient explicitly; this version of
# backward() has no default for 'grad'. A vector of ones is used for demonstration.
z.backward(Tensor(np.array([1,1,1,1,1])))

# After this call z.grad, x.grad and y.grad all hold the Tensor passed in
# above, because the "add" rule forwards the gradient unchanged.


In [3]:
# Both leaves hold the gradient passed to z.backward(); print() formats it
# through Tensor.__str__.
print(x.grad)
print(y.grad)
# A list prints its elements with their __repr__, hence the "array(...)"
# form for the two parent tensors.
print(z.creators)
print(z.creation_op)

[1 1 1 1 1]
[1 1 1 1 1]
[array([1, 2, 3, 4, 5]), array([2, 2, 2, 2, 2])]
add


In [4]:
# Four leaves combined through two levels of addition: g = (a + b) + (c + d).
a = Tensor([1,2,3,4,5])
b = Tensor([2,2,2,2,2])
c = Tensor([5,4,3,2,1])
d = Tensor([-1,-2,-3,-4,-5])

e = a + b
f = c + d
g = e + f

# backward() recurses through the recorded creators, so the gradient given
# to g reaches every leaf, two additions away.
g.backward(Tensor(np.array([1,1,1,1,1])))

print(a.grad)

[1 1 1 1 1]


# Part 3: Tensors That Are Used Multiple Times

In [5]:
a = Tensor([1,2,3,4,5])
b = Tensor([2,2,2,2,2])
c = Tensor([5,4,3,2,1])

# b is an operand of both d and e, so it lies on two paths to f.
d = a + b
e = b + c
f = d + e
f.backward(Tensor(np.array([1,1,1,1,1])))

# f = a + 2b + c, so the correct gradient for b would be [2,2,2,2,2].
# The backward() defined in the cell above assigns self.grad on every visit
# instead of accumulating, so b keeps only the last [1,1,1,1,1] it received
# and this comparison comes out False in every position.
b.grad.data == np.array([2,2,2,2,2])

array([False, False, False, False, False])

# Part 4: Upgrading Autograd to Support Multiple Tensors

![](./images/image%20copy%2012.png)

In [6]:
import numpy as np

class Tensor(object):
    """
    Tensor whose autograd copes with a tensor being used more than once.

    Every tensor keeps a per-child counter of gradients it still expects,
    accumulates incoming gradients, and only passes the total on to its
    own parents once all children have reported.
    """

    def __init__(self, data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 id=None):
        """
        Parameters
        ----------
        data : array-like
            Values to store; converted with ``np.array``.
        autograd : bool
            Whether this tensor takes part in gradient tracking.
        creators : list or None
            Parent Tensors this one was computed from, if any.
        creation_op : str or None
            Name of the operation that produced this tensor (e.g. 'add').
        id : int or None
            Identifier to use; when None, one is drawn with
            ``np.random.randint(0, 100000)``.

        Attributes
        ----------
        data : np.ndarray
            The stored values.
        autograd : bool
            As passed in.
        grad : Tensor or None
            Gradient accumulated so far; None before any backward call.
        id : int
            Identifier used as the key in the parents' ``children`` maps.
        creators : list of Tensors or None
            As passed in.
        creation_op : str or None
            As passed in.
        children : dict
            Child id -> number of gradients still expected from that child.
        """
        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None

        # Only draw a random id when the caller did not provide one.
        self.id = np.random.randint(0, 100000) if id is None else id

        self.creators = creators
        self.creation_op = creation_op
        self.children = {}

        # Tell every parent to expect one more gradient from this tensor.
        if creators is not None:
            for parent in creators:
                parent.children[self.id] = parent.children.get(self.id, 0) + 1

    def all_children_grads_accounted_for(self):
        """
        Return True when no child still owes this tensor a gradient,
        i.e. every counter in ``self.children`` is zero (vacuously True
        when there are no children).
        """
        return all(pending == 0 for pending in self.children.values())

    def backward(self, grad=None, grad_origin=None):
        """
        Accumulate an incoming gradient and, once complete, pass the
        total on to this tensor's parents.

        Parameters
        ----------
        grad : Tensor or None
            Incoming gradient. When None, a Tensor of ones shaped like
            ``self.data`` is used.
        grad_origin : Tensor or None
            The child that is sending ``grad``; None when ``backward`` is
            called directly by the user.

        Raises
        ------
        Exception
            If ``grad_origin`` has already delivered all the gradients
            this tensor expected from it.

        Tensors with ``autograd`` switched off ignore the call entirely.
        """
        if not self.autograd:
            return

        if grad is None:
            grad = Tensor(np.ones_like(self.data))

        # Tick off one expected gradient from the sending child.
        if grad_origin is not None:
            origin_id = grad_origin.id
            if self.children[origin_id] == 0:
                raise Exception("Cannot backprop more than once from the same child.")
            self.children[origin_id] -= 1

        # First gradient is stored as-is; later ones are added on top.
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad

        # Gradients themselves must be plain, untracked tensors.
        assert grad.autograd == False

        # Leaves have nobody to pass the gradient to.
        if self.creators is None:
            return

        # Wait until every child has reported, unless the user started
        # backprop on this very tensor.
        ready = self.all_children_grads_accounted_for() or grad_origin is None
        if not ready:
            return

        if self.creation_op == "add":
            # Addition hands the accumulated gradient to both operands unchanged.
            self.creators[0].backward(self.grad, self)
            self.creators[1].backward(self.grad, self)

    def __add__(self, other):
        """
        Implement ``self + other``.

        The sum is tracked (autograd on, creators and op recorded) only
        when both operands have autograd enabled; otherwise a plain
        Tensor holding the sum is returned.
        """
        total = self.data + other.data
        if not (self.autograd and other.autograd):
            return Tensor(total)
        return Tensor(total,
                      autograd=True,
                      creators=[self, other],
                      creation_op="add")

    def __repr__(self):
        # Debug form: NumPy's repr of the wrapped array.
        return repr(self.data)

    def __str__(self):
        # Print form: NumPy's str of the wrapped array.
        return str(self.data)

# --------------------------------------------------------------------
# Demonstration:
# --------------------------------------------------------------------

# Three leaf Tensors with gradient tracking switched on.
a = Tensor([1,2,3,4,5], autograd=True)
b = Tensor([2,2,2,2,2], autograd=True)
c = Tensor([5,4,3,2,1], autograd=True)

# b is used twice, so it ends up with two children (d and e).
d = a + b
e = b + c
f = d + e  # f = (a + b) + (b + c) = a + 2b + c

# Start backprop at f with an explicit gradient of ones. Because the call
# is made directly (grad_origin is None), f passes the gradient on at once;
# d and e do so after their single child f has reported.
f.backward(Tensor(np.array([1,1,1,1,1])))

# b now accumulates one gradient from d and one from e, so b.grad is
# [2,2,2,2,2], matching the coefficient of b in f = a + 2b + c.
print(b.grad.data == np.array([2,2,2,2,2]))
# '==' between two NumPy arrays is elementwise, so this prints an array of
# five booleans rather than a single True/False.


[ True  True  True  True  True]


# Part 5: Add Support for Negation

![](./images/image%20copy%2011.png)

In [7]:
import numpy as np

class Tensor(object):
    """
    NumPy-backed tensor with autograd support for ``+`` and unary ``-``.

    Each tensor tracks how many gradients it still expects from each of
    its children, accumulates what arrives, and forwards the total to its
    own parents once every child has reported.
    """

    def __init__(self, data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 id=None):
        """
        Parameters
        ----------
        data : array-like
            Numerical data for this tensor, stored as a NumPy array.
        autograd : bool
            If True, this Tensor will participate in gradient tracking.
        creators : list of Tensors or None
            Parent Tensors that created this Tensor (if any).
        creation_op : str or None
            The operation used to create this Tensor ("add" or "neg").
        id : int or None
            Identifier for this Tensor. If None, one is drawn with
            ``np.random.randint(0, 100000)``.

        Attributes
        ----------
        grad : Tensor or None
            Gradient accumulated so far; None before any backward call.
        children : dict
            Child id -> number of gradients still expected from that child.
        """

        # Convert input data to a NumPy array
        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None

        # Assign or generate an ID
        if id is None:
            self.id = np.random.randint(0, 100000)
        else:
            self.id = id

        self.creators = creators      # Tensors that were used to create this one
        self.creation_op = creation_op
        self.children = {}            # child id -> gradients still expected from it

        # Register this Tensor as a child of each parent, so the parent
        # knows how many gradients to wait for.
        if creators is not None:
            for c in creators:
                if self.id not in c.children:
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1

    def all_children_grads_accounted_for(self):
        """
        Return True when every child's counter is zero, i.e. no child
        still owes this tensor a gradient.
        """
        return all(cnt == 0 for cnt in self.children.values())

    def backward(self, grad=None, grad_origin=None):
        """
        Backpropagate from this Tensor, distributing gradients to its parents.

        Parameters
        ----------
        grad : Tensor or None
            The incoming gradient from a child or from the final output.
            If None, a Tensor of ones shaped like ``self.data`` is used.
        grad_origin : Tensor or None
            The child Tensor sending this gradient, or None when
            ``backward`` is called directly.

        Raises
        ------
        Exception
            If ``grad_origin`` has already delivered every gradient this
            tensor expected from it.

        Tensors with ``autograd`` switched off ignore the call.
        """
        if not self.autograd:
            return

        # Default gradient: ones in the shape of this tensor. This must be
        # a plain Tensor, the only tensor class defined here.
        if grad is None:
            grad = Tensor(np.ones_like(self.data))

        # Tick off one expected gradient from the sending child.
        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                raise Exception("cannot backprop more than once from the same child")
            else:
                self.children[grad_origin.id] -= 1

        # Accumulate the incoming gradient into self.grad
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad

        # The incoming gradient must itself be untracked.
        assert grad.autograd == False

        # Continue only if there are parents, and either every child has
        # reported or the user called backward on this tensor directly.
        if (self.creators is not None
                and (self.all_children_grads_accounted_for() or grad_origin is None)):

            if self.creation_op == "add":
                # z = x + y: both operands receive the gradient unchanged.
                self.creators[0].backward(self.grad, self)
                self.creators[1].backward(self.grad, self)

            elif self.creation_op == "neg":
                # z = -x: dz/dx = -1, so the parent receives -grad.
                # Pass `self` as grad_origin so the parent decrements the
                # counter for this child, exactly like the "add" branch;
                # without it the parent never sees this child as done.
                self.creators[0].backward(-self.grad, self)

    def __add__(self, other):
        """
        Overload the '+' operator.

        If both Tensors track gradients, the result records them as
        creators with creation_op="add". Otherwise a plain Tensor holding
        the sum is returned.
        """
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        """
        Overload unary negation (``-self``).

        If autograd is True, the result records this tensor as its
        creator with creation_op="neg"; otherwise a plain Tensor is
        returned.
        """
        if self.autograd:
            return Tensor(self.data * -1,
                          autograd=True,
                          creators=[self],
                          creation_op="neg")
        return Tensor(self.data * -1)

    def __repr__(self):
        # Debug form: NumPy's repr of the wrapped array.
        return str(self.data.__repr__())

    def __str__(self):
        # Print form: NumPy's str of the wrapped array.
        return str(self.data.__str__())

# --------------------------------------------------------------------
# DEMONSTRATION
# --------------------------------------------------------------------

# Three leaf Tensors with gradient tracking switched on.
a = Tensor([1,2,3,4,5], autograd=True)
b = Tensor([2,2,2,2,2], autograd=True)
c = Tensor([5,4,3,2,1], autograd=True)

# Each "-b" calls Tensor.__neg__ and creates a separate negated tensor, so
# b ends up with two "neg" children.
d = a + (-b)  # => a - b
e = (-b) + c  # => c - b
f = d + e     # => (a - b) + (c - b) = a + c - 2b

# Start backprop at f with an explicit gradient of ones.
f.backward(Tensor(np.array([1,1,1,1,1])))

# b appears in f with coefficient -2, so its accumulated gradient is -2 everywhere.
print(b.grad.data == np.array([-2,-2,-2,-2,-2]))
# '==' between two NumPy arrays is elementwise, so this prints an array of five booleans.


[ True  True  True  True  True]


# Part 6: Add Support for Additional Functions

![](./images/image%20copy%2010.png)

In [8]:
import numpy as np

class Tensor(object):
    """
    NumPy-backed tensor with a small reverse-mode autograd engine.

    Differentiable operations: ``+``, binary and unary ``-``, ``*``,
    ``mm``, ``sum(dim)``, ``expand(dim, copies)`` and ``transpose()``.
    """

    def __init__(self, data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 id=None):
        """
        Parameters:
        -----------
        data : array-like
            The underlying numerical data (stored as NumPy array).
        autograd : bool
            Whether this Tensor tracks gradients.
        creators : list of Tensors or None
            Parent Tensors that created this Tensor through an operation.
        creation_op : str or None
            The name of the operation that created this Tensor.
        id : int or None
            An optional ID. If None, one is drawn with
            ``np.random.randint(0, 100000)``.

        Attributes:
        -----------
        data : np.ndarray
            The numerical values for this Tensor.
        autograd : bool
            Whether gradient tracking is active.
        grad : Tensor or None
            Accumulated gradient; None before any backward call.
        creators : list or None
            Tensors that were used in creating this one (parents).
        creation_op : str or None
            Operation name ("add", "sub", "mul", "mm", "neg", "transpose",
            "sum_<dim>", "expand_<dim>").
        children : dict
            A map {child_id: count} of gradients still expected from
            each child.
        """
        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None

        # Assign an ID for this Tensor
        if id is None:
            self.id = np.random.randint(0, 100000)
        else:
            self.id = id

        self.creators = creators
        self.creation_op = creation_op
        self.children = {}  # child_id -> how many backprops expected

        # Register this Tensor as a child of each parent, so the parent
        # knows how many gradients to wait for.
        if creators is not None:
            for c in creators:
                if self.id not in c.children:
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1

    def all_children_grads_accounted_for(self):
        """
        Return True when every child's counter is zero, i.e. no child
        still owes this tensor a gradient.
        """
        for child_id, cnt in self.children.items():
            if cnt != 0:
                return False
        return True

    def backward(self, grad=None, grad_origin=None):
        """
        Accumulate an incoming gradient and pass it on to this tensor's
        parents according to the rule for ``creation_op``.

        Parameters:
        -----------
        grad : Tensor or None
            The gradient from the next level in the graph. If None,
            defaults to a Tensor of ones shaped like self.data.
        grad_origin : Tensor or None
            The child Tensor sending this gradient, or None when
            ``backward`` is called directly.

        Raises:
        -------
        Exception
            If ``grad_origin`` has already delivered every gradient this
            tensor expected from it.

        Tensors with ``autograd`` switched off ignore the call.
        """
        # Only do anything if autograd is True
        if self.autograd:

            # If no gradient given, assume an array of ones
            if grad is None:
                grad = Tensor(np.ones_like(self.data))

            # Tick off one expected gradient from the sending child.
            if grad_origin is not None:
                if self.children[grad_origin.id] == 0:
                    raise Exception("cannot backprop more than once from the same child")
                else:
                    self.children[grad_origin.id] -= 1

            # Accumulate gradient in self.grad
            if self.grad is None:
                self.grad = grad
            else:
                self.grad += grad

            # The incoming grad must itself be untracked.
            assert grad.autograd == False

            # Continue only if there are parents, and either every child
            # has reported or backward was called directly on this tensor.
            if (self.creators is not None and
               (self.all_children_grads_accounted_for() or grad_origin is None)):

                if self.creation_op == "add":
                    # x + y => gradient passes unchanged to both x and y
                    self.creators[0].backward(self.grad, self)
                    self.creators[1].backward(self.grad, self)

                elif self.creation_op == "sub":
                    # x - y => gradient to x is +grad, gradient to y is -grad
                    self.creators[0].backward(Tensor(self.grad.data), self)
                    self.creators[1].backward(Tensor(-self.grad.data), self)

                elif self.creation_op == "mul":
                    # x * y => grad w.r.t x is grad * y, w.r.t y is grad * x.
                    # self.grad is untracked, so these products are plain Tensors.
                    new = self.grad * self.creators[1]
                    self.creators[0].backward(new, self)
                    new = self.grad * self.creators[0]
                    self.creators[1].backward(new, self)

                elif self.creation_op == "mm":
                    # z = x.mm(y): dz/dx = grad.mm(y^T), dz/dy = x^T.mm(grad)
                    c0 = self.creators[0]
                    c1 = self.creators[1]

                    # grad for x. Transpose y's raw array into an untracked
                    # Tensor instead of calling c1.transpose(): on an
                    # autograd tensor that call would register a new child
                    # on c1 which never backpropagates, so c1 would never
                    # count as complete and would stop forwarding gradients
                    # to its own parents.
                    new = self.grad.mm(Tensor(c1.data.transpose()))
                    c0.backward(new, self)

                    # grad for y, computed as (grad^T . x)^T == x^T . grad
                    new = self.grad.transpose().mm(c0).transpose()
                    c1.backward(new, self)

                elif self.creation_op == "transpose":
                    # z = x.transpose() => gradient is grad.transpose()
                    self.creators[0].backward(self.grad.transpose(), self)

                elif "sum" in self.creation_op:
                    # creation_op is "sum_<dim>": copy the gradient back
                    # along the summed dimension to the parent's shape.
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.expand(dim,
                                            self.creators[0].data.shape[dim]), self)

                elif "expand" in self.creation_op:
                    # creation_op is "expand_<dim>": the inverse of expand
                    # is summation along the dimension that was added.
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.sum(dim), self)

                elif self.creation_op == "neg":
                    # z = -x => grad for x is -self.grad
                    self.creators[0].backward(self.grad.__neg__(), self)

    # ----------------------
    # Overloaded Operators
    # ----------------------

    def __add__(self, other):
        """
        x + y. Tracked (creation_op="add") only when both operands have
        autograd enabled; otherwise returns a plain Tensor with the sum.
        """
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        """
        -x. Tracked (creation_op="neg") when autograd is enabled.
        """
        if self.autograd:
            return Tensor(self.data * -1,
                          autograd=True,
                          creators=[self],
                          creation_op="neg")
        return Tensor(self.data * -1)

    def __sub__(self, other):
        """
        x - y. Tracked (creation_op="sub") only when both operands have
        autograd enabled; otherwise returns a plain Tensor.
        """
        if self.autograd and other.autograd:
            return Tensor(self.data - other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        """
        Elementwise x * y. Tracked (creation_op="mul") only when both
        operands have autograd enabled; otherwise returns a plain Tensor.
        """
        if self.autograd and other.autograd:
            return Tensor(self.data * other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="mul")
        return Tensor(self.data * other.data)

    # ----------------------
    # Additional Methods
    # ----------------------

    def sum(self, dim):
        """
        Sum along dimension ``dim``.

        When tracked, creation_op is "sum_<dim>" so backward knows which
        dimension to expand the gradient along.
        """
        if self.autograd:
            return Tensor(self.data.sum(dim),
                          autograd=True,
                          creators=[self],
                          creation_op="sum_"+str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim, copies):
        """
        Insert a new axis at position ``dim`` and repeat the data
        ``copies`` times along it.

        Example: shape (2,3) with expand(0,4) gives shape (4,2,3).
        When tracked, creation_op is "expand_<dim>".
        """
        # Axis order that moves the (initially last) repeated axis to 'dim'.
        trans_cmd = list(range(0,len(self.data.shape)))
        trans_cmd.insert(dim, len(self.data.shape))

        # repeat() lays the copies out contiguously per element; reshape
        # turns them into a trailing axis, which transpose moves to 'dim'.
        new_data = self.data.repeat(copies).reshape(list(self.data.shape) + [copies]).transpose(trans_cmd)

        if self.autograd:
            return Tensor(new_data,
                          autograd=True,
                          creators=[self],
                          creation_op="expand_"+str(dim))
        return Tensor(new_data)

    def transpose(self):
        """
        Return the transpose (NumPy ``ndarray.transpose()`` with no
        arguments). Tracked with creation_op="transpose" when autograd
        is enabled.
        """
        if self.autograd:
            return Tensor(self.data.transpose(),
                          autograd=True,
                          creators=[self],
                          creation_op="transpose")

        return Tensor(self.data.transpose())

    def mm(self, x):
        """
        Matrix product ``self.data.dot(x.data)``.

        Tracked (creation_op="mm", creators=[self, x]) when *this* tensor
        has autograd enabled; ``x``'s autograd flag is not consulted here.
        """
        if self.autograd:
            return Tensor(self.data.dot(x.data),
                          autograd=True,
                          creators=[self, x],
                          creation_op="mm")
        return Tensor(self.data.dot(x.data))

    # ----------------------
    # Utility / Display
    # ----------------------

    def __repr__(self):
        # Debug form: NumPy's repr of the wrapped array.
        return str(self.data.__repr__())

    def __str__(self):
        # Print form: NumPy's str of the wrapped array.
        return str(self.data.__str__())


# ------------------------------------------------------------------------------
# DEMONSTRATION OF BASIC USAGE
# ------------------------------------------------------------------------------
# Three leaf Tensors with gradient tracking switched on.
a = Tensor([1,2,3,4,5], autograd=True)
b = Tensor([2,2,2,2,2], autograd=True)
c = Tensor([5,4,3,2,1], autograd=True)

# b is used twice, so it has two children (d and e).
d = a + b
e = b + c
f = d + e  # effectively: (a+b) + (b+c) = a + 2b + c

# Start backprop at f with an explicit gradient of ones.
f.backward(Tensor(np.array([1,1,1,1,1])))

# b accumulates one gradient from d and one from e => [2,2,2,2,2].
# The comparison is elementwise, so an array of booleans is printed.
print(b.grad.data == np.array([2,2,2,2,2]))


[ True  True  True  True  True]


# A Few Notes on Sum and Expand

In [9]:
# A 2x3 example tensor. autograd keeps its default (False), so the sum and
# expand calls in the following cells return plain, untracked Tensors.
x = Tensor(np.array([[1,2,3],
                     [4,5,6]]))

In [10]:
# Sum over axis 0 (down the rows): the 2x3 input yields one total per column.
x.sum(0)

array([5, 7, 9])

In [11]:
# Sum over axis 1 (across the columns): one total per row.
x.sum(1)

array([ 6, 15])

In [12]:
# Add a new axis at position 2 and repeat every element 4 times along it:
# shape (2, 3) becomes (2, 3, 4).
x.expand(dim=2, copies=4)

array([[[1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3]],

       [[4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6]]])

# Part 7: Use Autograd to Train a Neural Network

#### Previously we would train a model like this

In [13]:
import numpy
# Everything below refers to NumPy as `np`; bind the alias in this cell so it
# does not depend on an earlier cell having imported it under that name.
import numpy as np
np.random.seed(0)

# Four 2-feature samples and their 1-value targets.
data = np.array([[0,0],[0,1],[1,0],[1,1]])
target = np.array([[0],[1],[0],[1]])

# Two linear layers without nonlinearity: (2 -> 3) and (3 -> 1).
weights_0_1 = np.random.rand(2,3)
weights_1_2 = np.random.rand(3,1)

for i in range(10):

    # Predict
    layer_1 = data.dot(weights_0_1)
    layer_2 = layer_1.dot(weights_1_2)

    # Compare
    diff = (layer_2 - target)
    sqdiff = (diff * diff)
    loss = sqdiff.sum(0) # sum of squared errors over the batch (not averaged)

    # Learn: this is the backpropagation piece, written out by hand.
    # The constant factor 2 from d(diff^2)/d(diff) is left out of the updates.
    layer_1_grad = diff.dot(weights_1_2.transpose())
    weight_1_2_update = layer_1.transpose().dot(diff)
    weight_0_1_update = data.transpose().dot(layer_1_grad)

    # Gradient-descent step with a fixed step size of 0.1.
    weights_1_2 -= weight_1_2_update * 0.1
    weights_0_1 -= weight_0_1_update * 0.1
    print(loss[0])

5.066439994622396
0.4959907791902341
0.4180671892167177
0.35298133007809646
0.2972549636567376
0.24923260381633278
0.20785392075862477
0.17231260916265181
0.14193744536652994
0.11613979792168387


![](./images/image%20copy%209.png)

In [14]:
import numpy as np
np.random.seed(0)

# --- Data -------------------------------------------------------------------
# Inputs: the four binary pairs, one per row (4x2).
# Targets: one value per row (4x1); here it equals the second input feature.
data = Tensor(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), autograd=True)
target = Tensor(np.array([[0], [1], [0], [1]]), autograd=True)

# --- Weights ----------------------------------------------------------------
# w[0] maps input -> hidden (2x3), w[1] maps hidden -> output (3x1).
w = [
    Tensor(np.random.rand(2, 3), autograd=True),
    Tensor(np.random.rand(3, 1), autograd=True),
]

# --- Training: 10 full-batch gradient-descent steps --------------------------
for i in range(10):

    # Forward: (4x2)(2x3) -> (4x3), then (4x3)(3x1) -> (4x1) predictions.
    hidden = data.mm(w[0])
    pred = hidden.mm(w[1])

    # Loss: elementwise squared error, summed over dimension 0 (the samples).
    loss = ((pred - target) * (pred - target)).sum(0)

    # Backward: seed the pass with a gradient of ones shaped like the loss.
    loss.backward(Tensor(np.ones_like(loss.data)))

    # Update: step each weight against its gradient (learning rate 0.1),
    # then zero the gradient in place so the next iteration starts clean.
    for w_ in w:
        w_.data -= w_.grad.data * 0.1
        w_.grad.data *= 0

    # Report the loss computed before this update.
    print(loss)


[0.58128304]
[0.48988149]
[0.41375111]
[0.34489412]
[0.28210124]
[0.2254484]
[0.17538853]
[0.1324231]
[0.09682769]
[0.06849361]


# Part 8: Adding Automatic Optimization

In [15]:
class SGD(object):
    """
    Plain stochastic gradient descent over a collection of parameters.

    Each parameter is expected to expose a NumPy array as `.data` and,
    once a backward pass has reached it, a gradient object whose `.data`
    is a NumPy array of the same shape. Parameters whose `.grad` is still
    None (no gradient has reached them yet) are skipped.
    """

    def __init__(self, parameters, alpha=0.1):
        """
        Parameters
        ----------
        parameters : iterable of parameter objects
            Stored by reference; it is iterated on every zero()/step().
        alpha : float
            Learning rate multiplied into each gradient.
        """
        self.parameters = parameters
        self.alpha = alpha

    def zero(self):
        """Reset every existing gradient to zero, in place."""
        for p in self.parameters:
            # A parameter that has never received a gradient has grad=None;
            # there is nothing to reset for it.
            if p.grad is not None:
                p.grad.data *= 0

    def step(self, zero=True):
        """
        Apply one update `data -= alpha * grad` to every parameter.

        Parameters
        ----------
        zero : bool
            If True (default), zero each gradient right after using it.
        """
        for p in self.parameters:
            if p.grad is None:
                # No gradient reached this parameter; leave it untouched.
                continue

            p.data -= p.grad.data * self.alpha

            if zero:
                p.grad.data *= 0

In [16]:
# Bind the alias this cell actually uses ('np'); a bare 'import numpy' only
# worked when an earlier cell had already created 'np'.
import numpy as np
np.random.seed(0)

# Toy dataset: four 2-feature inputs and one target value per input.
data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

# Two weight matrices: input (2) -> hidden (3) -> output (1).
w = list()
w.append(Tensor(np.random.rand(2,3), autograd=True))
w.append(Tensor(np.random.rand(3,1), autograd=True))

# The optimizer takes over the "update, then zero the gradient" bookkeeping.
optim = SGD(parameters=w, alpha=0.1)

for i in range(10):

    # Predict
    pred = data.mm(w[0]).mm(w[1])
    
    # Compare: squared error summed over dimension 0 (the samples)
    loss = ((pred - target)*(pred - target)).sum(0)
    
    # Learn: backpropagate from the loss, then let SGD apply the update
    loss.backward(Tensor(np.ones_like(loss.data)))
    optim.step()

    print(loss)

[0.58128304]
[0.48988149]
[0.41375111]
[0.34489412]
[0.28210124]
[0.2254484]
[0.17538853]
[0.1324231]
[0.09682769]
[0.06849361]


# Part 9: Adding Support for Layer Types

![](./images/image%20copy%208.png)

In [17]:
class Layer(object):
    """
    Common base for all layers.

    A layer owns a flat list of parameters (`self.parameters`); subclasses
    append to it, and optimizers read it through get_parameters().
    """

    def __init__(self):
        # Starts empty; subclasses register their own parameters here.
        self.parameters = []

    def get_parameters(self):
        """
        Return this layer's parameter list (the list object itself,
        not a copy).
        """
        return self.parameters


class Linear(Layer):
    """
    Fully connected (dense) layer computing  y = x W + b.

    Attributes
    ----------
    weight : Tensor, shape (n_inputs, n_outputs)
    bias   : Tensor, shape (n_outputs,)
    Both are created with autograd=True and registered in self.parameters.
    """

    def __init__(self, n_inputs, n_outputs):
        """
        Parameters
        ----------
        n_inputs : int
            Number of input features.
        n_outputs : int
            Number of output features.
        """
        super().__init__()

        # Standard-normal draws scaled by sqrt(2 / n_inputs).
        initial_weight = np.random.randn(n_inputs, n_outputs)
        initial_weight = initial_weight * np.sqrt(2.0 / n_inputs)

        # The bias starts at zero.
        self.weight = Tensor(initial_weight, autograd=True)
        self.bias = Tensor(np.zeros(n_outputs), autograd=True)

        # Expose both tensors to optimizers, weight first.
        self.parameters.extend([self.weight, self.bias])

    def forward(self, input):
        """
        Apply the layer to `input`.

        The bias is expanded along dimension 0 with one copy per row of
        `input.data`, so it lines up with the matrix product before the
        elementwise add.
        """
        n_rows = len(input.data)
        projected = input.mm(self.weight)
        return projected + self.bias.expand(0, n_rows)


# Part 10: Layers Which Contain Layers

![](./images/image%20copy%207.png)

In [18]:
class Sequential(Layer):
    """
    A container layer that chains other layers: forward() feeds the input
    through each contained layer in order.
    """

    def __init__(self, layers=None):
        """
        Parameters
        ----------
        layers : list of Layer or None
            Layers to run in order. The list is stored by reference.
            When omitted (None), a fresh empty list is created for this
            instance. A mutable default (`layers=list()`) would be one
            shared list for every default-constructed Sequential, so add()
            on one model would silently change the others.
        """
        super().__init__()
        self.layers = [] if layers is None else layers

    def add(self, layer):
        """
        Append another layer to the end of the sequence.
        """
        self.layers.append(layer)

    def forward(self, input):
        """
        Pass `input` through every layer in order; the output of each
        layer is the input of the next. Returns the last layer's output
        (or `input` unchanged when there are no layers).
        """
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def get_parameters(self):
        """
        Collect the parameters of every contained layer into one new flat
        list, in layer order, so a single optimizer can update them all.
        """
        params = []
        for layer in self.layers:
            params += layer.get_parameters()
        return params
    

# ------------------------
# DEMO: TRAINING A Sequential MODEL
# ------------------------
import numpy as np
np.random.seed(0)

# Dataset: four 2-feature inputs (4x2) with one target value each (4x1).
data = Tensor(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), autograd=True)
target = Tensor(np.array([[0], [1], [0], [1]]), autograd=True)

# Model: two stacked Linear layers, 2 -> 3 -> 1.
model = Sequential([Linear(2, 3), Linear(3, 1)])

# Optimizer: SGD over every weight and bias the model exposes.
optim = SGD(parameters=model.get_parameters(), alpha=0.05)

# Ten full-batch training steps.
for i in range(10):

    # Forward pass through the whole model.
    pred = model.forward(data)

    # Squared error, summed over dimension 0 (the samples).
    loss = ((pred - target) * (pred - target)).sum(0)

    # Backward pass seeded with ones, followed by the parameter update.
    loss.backward(Tensor(np.ones_like(loss.data)))
    optim.step()

    # Loss measured before this step's update.
    print(loss)


[2.33428272]
[0.06743796]
[0.0521849]
[0.04079507]
[0.03184365]
[0.02479336]
[0.01925443]
[0.01491699]
[0.01153118]
[0.00889602]


# Part 11: Loss Function Layers

![](./images/image%20copy%206.png)

In [19]:
class MSELoss(Layer):
    """
    Squared-error loss layer.

    Despite the name, forward() returns the *sum* of squared errors along
    dimension 0; it does not divide by the number of samples.
    """

    def __init__(self):
        # No trainable parameters; only the base-class setup is needed.
        super().__init__()

    def forward(self, pred, target):
        """
        Return ((pred - target) * (pred - target)).sum(0).

        `pred` and `target` must have matching shapes, e.g.
        (batch_size, num_outputs); the reduction runs over dimension 0.
        """
        # The difference is deliberately computed twice, exactly as in the
        # one-line form, so the recorded computation graph is unchanged.
        left = pred - target
        right = pred - target
        squared = left * right
        return squared.sum(0)


import numpy as np
np.random.seed(0)

# Dataset: the four binary input pairs (4x2) and one target per row (4x1).
# The target equals the second input feature.
data = Tensor(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), autograd=True)
target = Tensor(np.array([[0], [1], [0], [1]]), autograd=True)

# Model: Linear(2 -> 3) followed by Linear(3 -> 1).
model = Sequential([Linear(2, 3), Linear(3, 1)])

# Loss layer and optimizer.
criterion = MSELoss()
optim = SGD(parameters=model.get_parameters(), alpha=0.05)

# Ten full-batch training steps.
for i in range(10):

    # Forward pass: model predictions for all four samples.
    pred = model.forward(data)

    # Loss now comes from the loss layer instead of an inline expression.
    loss = criterion.forward(pred, target)

    # Backward pass seeded with ones, followed by the parameter update.
    loss.backward(Tensor(np.ones_like(loss.data)))
    optim.step()

    # Loss measured before this step's update.
    print(loss)


[2.33428272]
[0.06743796]
[0.0521849]
[0.04079507]
[0.03184365]
[0.02479336]
[0.01925443]
[0.01491699]
[0.01153118]
[0.00889602]


# Part 12: Non-linearity Layers

![](./images/image%20copy%205.png)

In [20]:
import numpy as np

class Tensor(object):
    """
    A NumPy-backed tensor with a small reverse-mode autograd engine.

    Differentiable operations: +, -, *, unary negation, matrix multiply
    (mm), sum/expand along one dimension, transpose, and the
    nonlinearities sigmoid and tanh.

    Every Tensor produced by an operation remembers its parents
    ('creators') and the operation name ('creation_op'). Each parent keeps
    a per-child counter in 'children', so that during backward() it only
    forwards its accumulated gradient to its own parents once every child
    has reported in.
    """

    def __init__(self, data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 id=None):
        """
        Parameters
        ----------
        data : array-like
            Values to store; converted with np.array(data).
        autograd : bool
            If True, operations on this Tensor are recorded and backward()
            accumulates a gradient in self.grad.
        creators : list of Tensor or None
            Parent Tensors this one was computed from (None for a leaf).
        creation_op : str or None
            Name of the operation that produced this Tensor
            (e.g. 'add', 'mm', 'sum_0', 'sigmoid').
        id : int or None
            Key used in the parents' 'children' dicts. If None, one is
            drawn with np.random.randint(0, 100000).
        """
        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None

        # NOTE: the id is drawn from NumPy's global RNG, so constructing a
        # Tensor advances that random stream, and ids are not guaranteed
        # to be unique.
        if id is None:
            self.id = np.random.randint(0, 100000)
        else:
            self.id = id

        self.creators = creators
        self.creation_op = creation_op
        # child id -> number of gradients still expected from that child
        self.children = {}

        # Register this Tensor as a child of each of its parents. A parent
        # listed twice (e.g. x * x) is counted twice.
        if creators is not None:
            for c in creators:
                c.children[self.id] = c.children.get(self.id, 0) + 1

    def all_children_grads_accounted_for(self):
        """
        Return True when no child still owes this Tensor a gradient,
        i.e. every counter in self.children is zero.
        """
        return all(cnt == 0 for cnt in self.children.values())

    def backward(self, grad=None, grad_origin=None):
        """
        Accumulate `grad` into self.grad and, when ready, propagate the
        accumulated gradient to the parents.

        Parameters
        ----------
        grad : Tensor or None
            Incoming gradient; must have autograd=False. If None, a Tensor
            of ones shaped like self.data is used.
        grad_origin : Tensor or None
            The child sending this gradient. None marks a direct
            (user-initiated) call, which always propagates onward.

        Raises
        ------
        Exception
            If `grad_origin` has already delivered all of its gradients.

        Notes
        -----
        Every branch passes `self` as grad_origin to its parents. A parent
        that feeds several operations therefore waits for all of them and
        propagates its summed gradient exactly once; otherwise it would
        re-propagate on every arrival and double-count. As a consequence,
        running backward() twice through the same graph raises, for every
        operation type.

        Gradients handed to parents are freshly built, non-autograd Tensors
        computed from the raw arrays. This keeps the backward pass from
        registering new children on graph nodes and from letting two
        parents share one gradient object.
        """
        if not self.autograd:
            return

        # If no incoming gradient, assume ones for each element.
        if grad is None:
            grad = Tensor(np.ones_like(self.data))

        # Tick off the child that is delivering this gradient.
        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                raise Exception("Cannot backprop more than once from the same child.")
            self.children[grad_origin.id] -= 1

        # Accumulate. '+=' goes through __add__, so self.grad is rebound to
        # a new Tensor rather than mutated in place.
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad

        # The incoming gradient must not itself be tracked by autograd.
        assert grad.autograd == False

        # Leaves have nowhere to propagate to.
        if self.creators is None:
            return
        # Wait until every child has reported, unless this is a direct call.
        if grad_origin is not None and not self.all_children_grads_accounted_for():
            return

        op = self.creation_op
        g = self.grad.data

        if op == "add":
            # z = x + y  =>  dz/dx = dz/dy = 1
            # Each parent gets its own copy so that zeroing one parent's
            # gradient in place cannot wipe out the other's.
            self.creators[0].backward(Tensor(g), self)
            self.creators[1].backward(Tensor(g), self)

        elif op == "sub":
            # z = x - y  =>  dz/dx = +1, dz/dy = -1
            self.creators[0].backward(Tensor(g), self)
            self.creators[1].backward(Tensor(-g), self)

        elif op == "mul":
            # z = x * y  =>  dz/dx = y, dz/dy = x
            c0 = self.creators[0]
            c1 = self.creators[1]
            c0.backward(Tensor(g * c1.data), self)
            c1.backward(Tensor(g * c0.data), self)

        elif op == "mm":
            # z = x.mm(y)  =>  grad_x = g . y^T,  grad_y = (g^T . x)^T
            c0 = self.creators[0]
            c1 = self.creators[1]
            c0.backward(Tensor(g.dot(c1.data.transpose())), self)
            c1.backward(Tensor(g.transpose().dot(c0.data).transpose()), self)

        elif op == "transpose":
            # z = x^T  =>  grad_x = g^T
            self.creators[0].backward(Tensor(g.transpose()), self)

        elif op.startswith("sum"):
            # 'sum_<dim>': expand the gradient back along the summed dimension.
            dim = int(op.split("_")[1])
            shape_dim = self.creators[0].data.shape[dim]
            self.creators[0].backward(self.grad.expand(dim, shape_dim), self)

        elif op.startswith("expand"):
            # 'expand_<dim>': the inverse of expanding is summing that dimension.
            dim = int(op.split("_")[1])
            self.creators[0].backward(self.grad.sum(dim), self)

        elif op == "neg":
            # z = -x  =>  dz/dx = -1
            self.creators[0].backward(Tensor(-g), self)

        elif op == "sigmoid":
            # z = sigmoid(x)  =>  dz/dx = z * (1 - z)
            z = self.data
            self.creators[0].backward(Tensor(g * (z * (1 - z))), self)

        elif op == "tanh":
            # z = tanh(x)  =>  dz/dx = 1 - z^2
            z = self.data
            self.creators[0].backward(Tensor(g * (1 - (z * z))), self)

    # ----------------------
    # Overloaded Operators
    # ----------------------

    def __add__(self, other):
        """Elementwise sum; recorded as 'add' only if both operands track gradients."""
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        """Elementwise negation; recorded as 'neg' if this Tensor tracks gradients."""
        if self.autograd:
            return Tensor(-self.data,
                          autograd=True,
                          creators=[self],
                          creation_op="neg")
        return Tensor(-self.data)

    def __sub__(self, other):
        """Elementwise difference; recorded as 'sub' only if both operands track gradients."""
        if self.autograd and other.autograd:
            return Tensor(self.data - other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        """Elementwise product; recorded as 'mul' only if both operands track gradients."""
        if self.autograd and other.autograd:
            return Tensor(self.data * other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="mul")
        return Tensor(self.data * other.data)

    # -----------
    # Sum, Expand
    # -----------

    def sum(self, dim):
        """
        Sum along dimension `dim`. The op is recorded as 'sum_<dim>' so the
        backward pass knows which dimension to expand again.
        """
        if self.autograd:
            return Tensor(self.data.sum(dim),
                          autograd=True,
                          creators=[self],
                          creation_op="sum_" + str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim, copies):
        """
        Insert a new dimension at index `dim` holding `copies` copies of
        the data. The op is recorded as 'expand_<dim>'.
        """
        n_dims = len(self.data.shape)

        # Axis order that moves the freshly appended last axis to 'dim'.
        trans_cmd = list(range(0, n_dims))
        trans_cmd.insert(dim, n_dims)

        # Repeat each element 'copies' times along a new trailing axis,
        # then move that axis into place.
        new_data = self.data.repeat(copies).reshape(list(self.data.shape) + [copies])
        new_data = new_data.transpose(trans_cmd)

        if self.autograd:
            return Tensor(new_data,
                          autograd=True,
                          creators=[self],
                          creation_op="expand_" + str(dim))
        return Tensor(new_data)

    # -----------
    # Transpose, Matrix Multiply
    # -----------

    def transpose(self):
        """Return the transpose (NumPy's default axis reversal); recorded as 'transpose'."""
        if self.autograd:
            return Tensor(self.data.transpose(),
                          autograd=True,
                          creators=[self],
                          creation_op="transpose")
        return Tensor(self.data.transpose())

    def mm(self, x):
        """
        Matrix multiply: self.data.dot(x.data).
        Recorded as 'mm' with both operands as parents whenever *this*
        Tensor tracks gradients (x's autograd flag is not consulted).
        """
        if self.autograd:
            return Tensor(self.data.dot(x.data),
                          autograd=True,
                          creators=[self, x],
                          creation_op="mm")
        return Tensor(self.data.dot(x.data))

    # -----------
    # Nonlinearities
    # -----------

    def sigmoid(self):
        """Elementwise 1 / (1 + e^-x); recorded as 'sigmoid'."""
        if self.autograd:
            return Tensor(1 / (1 + np.exp(-self.data)),
                          autograd=True,
                          creators=[self],
                          creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        """Elementwise tanh(x); recorded as 'tanh'."""
        if self.autograd:
            return Tensor(np.tanh(self.data),
                          autograd=True,
                          creators=[self],
                          creation_op="tanh")
        return Tensor(np.tanh(self.data))

    # -----------
    # Printing / Debug
    # -----------

    def __repr__(self):
        """Same as the repr of the underlying NumPy array."""
        return repr(self.data)

    def __str__(self):
        """Same as the str of the underlying NumPy array."""
        return str(self.data)


# -----------
# Nonlinearity Layers
# -----------
class Tanh(Layer):
    """
    Parameter-free layer whose forward pass is `input.tanh()`.
    """

    def __init__(self):
        # Nothing to configure; only the base-class setup is needed.
        super().__init__()

    def forward(self, input):
        """Return the elementwise tanh of `input`."""
        activated = input.tanh()
        return activated
    
class Sigmoid(Layer):
    """
    Parameter-free layer whose forward pass is `input.sigmoid()`.
    """

    def __init__(self):
        # Nothing to configure; only the base-class setup is needed.
        super().__init__()

    def forward(self, input):
        """Return the elementwise sigmoid of `input`."""
        activated = input.sigmoid()
        return activated


In [21]:
# Bind the alias this cell actually uses ('np'); a bare 'import numpy' only
# worked when an earlier cell had already created 'np'.
import numpy as np
np.random.seed(0)

# Toy dataset: four 2-feature inputs and one target value per input.
data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

# Same 2 -> 3 -> 1 network as before, now with a nonlinearity after each
# Linear layer.
model = Sequential([Linear(2,3), Tanh(), Linear(3,1), Sigmoid()])
criterion = MSELoss()

optim = SGD(parameters=model.get_parameters(), alpha=1)

for i in range(10):
    
    # Predict
    pred = model.forward(data)
    
    # Compare
    loss = criterion.forward(pred, target)
    
    # Learn: backpropagate from the loss, then let SGD apply the update
    loss.backward(Tensor(np.ones_like(loss.data)))
    optim.step()
    print(loss)

[1.06372865]
[0.75148144]
[0.57384259]
[0.39574294]
[0.2482279]
[0.15515294]
[0.10423398]
[0.07571169]
[0.05837623]
[0.04700013]


# Part 13: The Embedding Layer

In [22]:
class Embedding(Layer):
    """
    First sketch of an embedding layer: it only allocates the weight
    matrix (a plain NumPy array of shape (vocab_size, dim)); it has no
    forward pass and registers no parameters.
    """

    def __init__(self, vocab_size, dim):
        super().__init__()

        self.vocab_size = vocab_size
        self.dim = dim

        # Initialization convention borrowed from word2vec:
        # uniform values in [-0.5, 0.5), divided by the embedding size.
        uniform = np.random.rand(vocab_size, dim)
        self.weight = (uniform - 0.5) / dim
        

# Part 14: Add Indexing to Autograd

![](./images/image%20copy%204.png)

In [23]:
import numpy as np

class Tensor(object):
    """
    A NumPy-backed tensor with a small reverse-mode autograd engine.

    Differentiable operations: +, -, *, unary negation, matrix multiply
    (mm), sum/expand along one dimension, transpose, sigmoid, tanh, and
    index_select (row lookup by integer indices).

    Every Tensor produced by an operation remembers its parents
    ('creators') and the operation name ('creation_op'). Each parent keeps
    a per-child counter in 'children', so that during backward() it only
    forwards its accumulated gradient to its own parents once every child
    has reported in.
    """

    def __init__(self, data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 id=None):
        """
        Parameters
        ----------
        data : array-like
            Values to store; converted with np.array(data).
        autograd : bool
            If True, operations on this Tensor are recorded and backward()
            accumulates a gradient in self.grad.
        creators : list of Tensor or None
            Parent Tensors this one was computed from (None for a leaf).
        creation_op : str or None
            Name of the operation that produced this Tensor
            (e.g. 'add', 'mm', 'sum_0', 'index_select').
        id : int or None
            Key used in the parents' 'children' dicts. If None, one is
            drawn with np.random.randint(0, 100000).
        """
        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None

        # NOTE: the id is drawn from NumPy's global RNG, so constructing a
        # Tensor advances that random stream, and ids are not guaranteed
        # to be unique.
        if id is None:
            self.id = np.random.randint(0, 100000)
        else:
            self.id = id

        self.creators = creators
        self.creation_op = creation_op
        # child id -> number of gradients still expected from that child
        self.children = {}

        # Register this Tensor as a child of each of its parents. A parent
        # listed twice (e.g. x * x) is counted twice.
        if creators is not None:
            for c in creators:
                c.children[self.id] = c.children.get(self.id, 0) + 1

    def all_children_grads_accounted_for(self):
        """
        Return True when no child still owes this Tensor a gradient,
        i.e. every counter in self.children is zero.
        """
        return all(cnt == 0 for cnt in self.children.values())

    def backward(self, grad=None, grad_origin=None):
        """
        Accumulate `grad` into self.grad and, when ready, propagate the
        accumulated gradient to the parents.

        Parameters
        ----------
        grad : Tensor or None
            Incoming gradient; must have autograd=False. If None, a Tensor
            of ones shaped like self.data is used.
        grad_origin : Tensor or None
            The child sending this gradient. None marks a direct
            (user-initiated) call, which always propagates onward.

        Raises
        ------
        Exception
            If `grad_origin` has already delivered all of its gradients.

        Notes
        -----
        Every branch passes `self` as grad_origin to its parents. A parent
        that feeds several operations therefore waits for all of them and
        propagates its summed gradient exactly once; otherwise it would
        re-propagate on every arrival and double-count. As a consequence,
        running backward() twice through the same graph raises, for every
        operation type.

        Gradients handed to parents are freshly built, non-autograd Tensors
        computed from the raw arrays. This keeps the backward pass from
        registering new children on graph nodes and from letting two
        parents share one gradient object.
        """
        if not self.autograd:
            return

        # If no incoming gradient, assume ones for each element.
        if grad is None:
            grad = Tensor(np.ones_like(self.data))

        # Tick off the child that is delivering this gradient.
        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                raise Exception("cannot backprop more than once from the same child")
            self.children[grad_origin.id] -= 1

        # Accumulate. '+=' goes through __add__, so self.grad is rebound to
        # a new Tensor rather than mutated in place.
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad

        # The incoming gradient must not itself be tracked by autograd.
        assert grad.autograd == False

        # Leaves have nowhere to propagate to.
        if self.creators is None:
            return
        # Wait until every child has reported, unless this is a direct call.
        if grad_origin is not None and not self.all_children_grads_accounted_for():
            return

        op = self.creation_op
        g = self.grad.data

        if op == "add":
            # z = x + y  =>  dz/dx = dz/dy = 1
            # Each parent gets its own copy so that zeroing one parent's
            # gradient in place cannot wipe out the other's.
            self.creators[0].backward(Tensor(g), self)
            self.creators[1].backward(Tensor(g), self)

        elif op == "sub":
            # z = x - y  =>  dz/dx = +1, dz/dy = -1
            self.creators[0].backward(Tensor(g), self)
            self.creators[1].backward(Tensor(-g), self)

        elif op == "mul":
            # z = x * y  =>  dz/dx = y, dz/dy = x
            c0 = self.creators[0]
            c1 = self.creators[1]
            c0.backward(Tensor(g * c1.data), self)
            c1.backward(Tensor(g * c0.data), self)

        elif op == "mm":
            # z = x.mm(y)  =>  grad_x = g . y^T,  grad_y = (g^T . x)^T
            c0 = self.creators[0]
            c1 = self.creators[1]
            c0.backward(Tensor(g.dot(c1.data.transpose())), self)
            c1.backward(Tensor(g.transpose().dot(c0.data).transpose()), self)

        elif op == "transpose":
            # z = x^T  =>  grad_x = g^T
            self.creators[0].backward(Tensor(g.transpose()), self)

        elif op.startswith("sum"):
            # 'sum_<dim>': expand the gradient back along the summed dimension.
            dim = int(op.split("_")[1])
            shape_dim = self.creators[0].data.shape[dim]
            self.creators[0].backward(self.grad.expand(dim, shape_dim), self)

        elif op.startswith("expand"):
            # 'expand_<dim>': the inverse of expanding is summing that dimension.
            dim = int(op.split("_")[1])
            self.creators[0].backward(self.grad.sum(dim), self)

        elif op == "neg":
            # z = -x  =>  dz/dx = -1
            self.creators[0].backward(Tensor(-g), self)

        elif op == "sigmoid":
            # z = sigmoid(x)  =>  dz/dx = z * (1 - z)
            z = self.data
            self.creators[0].backward(Tensor(g * (z * (1 - z))), self)

        elif op == "tanh":
            # z = tanh(x)  =>  dz/dx = 1 - z^2
            z = self.data
            self.creators[0].backward(Tensor(g * (1 - (z * z))), self)

        elif op == "index_select":
            # y = x[indices]: scatter the *accumulated* gradient back into
            # the selected rows of x. The per-call 'grad' would drop
            # earlier contributions when y has several children.
            source = self.creators[0]
            new_grad = np.zeros_like(source.data)
            indices_ = self.index_select_indices.data.flatten()
            # One gradient slice per looked-up index, shaped like one row of x.
            grad_ = g.reshape((len(indices_),) + new_grad.shape[1:])
            # np.add.at accumulates correctly when an index occurs more than once.
            np.add.at(new_grad, indices_, grad_)
            source.backward(Tensor(new_grad), self)

    # ----------------------
    # Overloaded operators
    # ----------------------
    def __add__(self, other):
        """Elementwise sum; recorded as 'add' only if both operands track gradients."""
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        """Elementwise negation; recorded as 'neg' if this Tensor tracks gradients."""
        if self.autograd:
            return Tensor(-self.data,
                          autograd=True,
                          creators=[self],
                          creation_op="neg")
        return Tensor(-self.data)

    def __sub__(self, other):
        """Elementwise difference; recorded as 'sub' only if both operands track gradients."""
        if self.autograd and other.autograd:
            return Tensor(self.data - other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        """Elementwise product; recorded as 'mul' only if both operands track gradients."""
        if self.autograd and other.autograd:
            return Tensor(self.data * other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="mul")
        return Tensor(self.data * other.data)

    # ----------------------
    # Summation / Expand
    # ----------------------
    def sum(self, dim):
        """
        Sum along dimension `dim`. The op is recorded as 'sum_<dim>' so the
        backward pass knows which dimension to expand again.
        """
        if self.autograd:
            return Tensor(self.data.sum(dim),
                          autograd=True,
                          creators=[self],
                          creation_op="sum_" + str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim, copies):
        """
        Insert a new dimension at index `dim` holding `copies` copies of
        the data. The op is recorded as 'expand_<dim>'.
        """
        n_dims = len(self.data.shape)

        # Axis order that moves the freshly appended last axis to 'dim'.
        trans_cmd = list(range(n_dims))
        trans_cmd.insert(dim, n_dims)

        # Repeat each element 'copies' times along a new trailing axis,
        # then move that axis into place.
        new_data = self.data.repeat(copies).reshape(list(self.data.shape) + [copies])
        new_data = new_data.transpose(trans_cmd)

        if self.autograd:
            return Tensor(new_data,
                          autograd=True,
                          creators=[self],
                          creation_op="expand_" + str(dim))
        return Tensor(new_data)

    # ----------------------
    # Transpose / MatMul
    # ----------------------
    def transpose(self):
        """Return the transpose (NumPy's default axis reversal); recorded as 'transpose'."""
        if self.autograd:
            return Tensor(self.data.transpose(),
                          autograd=True,
                          creators=[self],
                          creation_op="transpose")
        return Tensor(self.data.transpose())

    def mm(self, x):
        """
        Matrix multiply: self.data.dot(x.data).
        Recorded as 'mm' with both operands as parents whenever *this*
        Tensor tracks gradients (x's autograd flag is not consulted).
        """
        if self.autograd:
            return Tensor(self.data.dot(x.data),
                          autograd=True,
                          creators=[self, x],
                          creation_op="mm")
        return Tensor(self.data.dot(x.data))

    # ----------------------
    # Nonlinearities
    # ----------------------
    def sigmoid(self):
        """Elementwise 1 / (1 + e^-x); recorded as 'sigmoid'."""
        if self.autograd:
            return Tensor(1 / (1 + np.exp(-self.data)),
                          autograd=True,
                          creators=[self],
                          creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        """Elementwise tanh(x); recorded as 'tanh'."""
        if self.autograd:
            return Tensor(np.tanh(self.data),
                          autograd=True,
                          creators=[self],
                          creation_op="tanh")
        return Tensor(np.tanh(self.data))

    # ----------------------
    # Indexing
    # ----------------------
    def index_select(self, indices):
        """
        Look up entries along the first dimension: returns a Tensor holding
        self.data[indices.data].

        When this Tensor tracks gradients the result is recorded as
        'index_select' and keeps a reference to `indices` (as
        `index_select_indices`) so backward() can route gradients to the
        selected positions.
        """
        if self.autograd:
            new = Tensor(self.data[indices.data],
                         autograd=True,
                         creators=[self],
                         creation_op="index_select")
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])

    # ----------------------
    # String/Debug
    # ----------------------
    def __repr__(self):
        """Same as the repr of the underlying NumPy array."""
        return repr(self.data)

    def __str__(self):
        """Same as the str of the underlying NumPy array."""
        return str(self.data)


# Example Nonlinear Layers (for completeness)
class Tanh(Layer):
    """
    Layer wrapper around Tensor.tanh(); it registers no parameters of its own.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input):
        # Delegate to the input's own tanh() so the op is recorded by the Tensor.
        activated = input.tanh()
        return activated
    
class Sigmoid(Layer):
    """
    Layer wrapper around Tensor.sigmoid(); it registers no parameters of its own.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input):
        # Delegate to the input's own sigmoid() so the op is recorded by the Tensor.
        activated = input.sigmoid()
        return activated


In [24]:
# backward() with no argument seeds a gradient of ones, so row r of x.grad
# ends up holding how many times index r was selected.
x = Tensor(np.eye(5), autograd=True)
picked_rows = Tensor([[1, 2, 3], [2, 3, 4]])
x.index_select(picked_rows).backward()
print(x.grad)

[[0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1.]
 [2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2.]
 [1. 1. 1. 1. 1.]]


# Part 15: The Embedding Layer (revisited)

In [25]:
class Embedding(Layer):
    """
    Lookup-table layer: one learnable row of `dim` values per vocabulary index.
    """

    def __init__(self, vocab_size, dim):
        super().__init__()

        self.vocab_size = vocab_size
        self.dim = dim

        # Initialization convention borrowed from word2vec: uniform values
        # in [-0.5, 0.5) scaled down by the embedding dimension.
        initial_rows = (np.random.rand(vocab_size, dim) - 0.5) / dim
        self.weight = Tensor(initial_rows, autograd=True)

        # Expose the table to optimizers through the layer's parameter list.
        self.parameters.append(self.weight)

    def forward(self, input):
        # `input` holds row indices; index_select gathers the matching rows.
        rows = self.weight.index_select(input)
        return rows

In [26]:
import numpy
# Fix the global RNG so the run is repeatable.
np.random.seed(0)

# Inputs are vocabulary indices; targets are one value per example.
data = Tensor(np.array([1, 2, 1, 2]), autograd=True)
target = Tensor(np.array([[0], [1], [0], [1]]), autograd=True)

embed = Embedding(5, 3)
layers = [embed, Tanh(), Linear(3, 1), Sigmoid()]
model = Sequential(layers)
criterion = MSELoss()

optim = SGD(parameters=model.get_parameters(), alpha=0.5)

for i in range(10):

    # Predict
    pred = model.forward(data)

    # Compare
    loss = criterion.forward(pred, target)

    # Learn: seed backprop with ones shaped like the loss, then update.
    seed_grad = Tensor(np.ones_like(loss.data))
    loss.backward(seed_grad)
    optim.step()
    print(loss)

[0.98874126]
[0.6658868]
[0.45639889]
[0.31608168]
[0.2260925]
[0.16877423]
[0.13120515]
[0.10555487]
[0.08731868]
[0.07387834]


# Part 16: The Cross Entropy Layer

![](./images/image%20copy%203.png)

In [27]:
import numpy as np

class Tensor(object):
    """
    NumPy-backed tensor with a small reverse-mode autograd engine.

    Differentiable operations: +, -, *, unary -, matrix multiply (mm),
    transpose, sum, expand, sigmoid, tanh, index_select and cross_entropy
    (softmax + negative log-likelihood for classification tasks).
    """

    def __init__(self, data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 id=None):
        """
        Parameters
        ----------
        data : array-like
            Values to wrap; always converted with np.array().
        autograd : bool
            If True, backward() accumulates and propagates gradients.
        creators : list of Tensor or None
            Parent Tensors involved in creating this one.
        creation_op : str or None
            Name of the producing operation (e.g. "add", "cross_entropy").
        id : int or None
            Key used in the parents' `children` dicts. When None, one is
            drawn with np.random.randint(0, 100000).
        """
        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None

        # NOTE: the id comes from NumPy's global RNG, so creating a Tensor
        # advances that RNG; seeded runs depend on the creation order.
        if id is None:
            self.id = np.random.randint(0, 100000)
        else:
            self.id = id

        self.creators = creators
        self.creation_op = creation_op
        # children[child_id] = how many gradients we still expect from that child
        self.children = {}

        # Register this Tensor as a child of each of its parents.
        if creators is not None:
            for c in creators:
                c.children[self.id] = c.children.get(self.id, 0) + 1

    def all_children_grads_accounted_for(self):
        """
        Return True when no child still owes this Tensor a gradient,
        i.e. every counter in `self.children` is zero.
        """
        return all(cnt == 0 for cnt in self.children.values())

    def backward(self, grad=None, grad_origin=None):
        """
        Accumulate an incoming gradient and propagate it to the creators.

        Parameters
        ----------
        grad : Tensor or None
            Gradient flowing into this Tensor. Defaults to ones shaped like
            `self.data`. Must not itself have autograd enabled.
        grad_origin : Tensor or None
            The child that sent `grad`. When given, that child's pending
            counter is decremented; when None the call is treated as a
            direct call and propagation happens immediately.

        Raises
        ------
        Exception
            If `grad_origin` has already delivered all gradients it owed.

        Does nothing when autograd is disabled on this Tensor.
        """
        if self.autograd:

            # Default to a Tensor of ones if no grad is provided
            if grad is None:
                grad = Tensor(np.ones_like(self.data))

            # Decrement the sending child's expected gradient count
            if grad_origin is not None:
                if self.children[grad_origin.id] == 0:
                    raise Exception("cannot backprop more than once from the same child")
                else:
                    self.children[grad_origin.id] -= 1

            # Accumulate gradient in self.grad. Tensor defines no __iadd__,
            # so '+=' rebinds self.grad to a new Tensor instead of mutating
            # a gradient object that may be shared with another parent.
            if self.grad is None:
                self.grad = grad
            else:
                self.grad += grad

            # The incoming gradient should not require grad
            assert grad.autograd == False

            # Propagate once all child gradients are in, or on a direct call.
            # NOTE(review): only the add/sub/mul branches pass `self` as
            # grad_origin; the other branches trigger immediate propagation
            # in the parent without decrementing its child counter.
            if (self.creators is not None
                    and (self.all_children_grads_accounted_for() or grad_origin is None)):

                if self.creation_op == "add":
                    # grad goes to both parents unchanged
                    self.creators[0].backward(self.grad, self)
                    self.creators[1].backward(self.grad, self)

                elif self.creation_op == "sub":
                    # x - y => partial wrt x is +grad, wrt y is -grad
                    self.creators[0].backward(Tensor(self.grad.data), self)
                    self.creators[1].backward(Tensor((-self.grad).data), self)

                elif self.creation_op == "mul":
                    # x * y => partial wrt x is grad * y, wrt y is grad * x
                    new = self.grad * self.creators[1]
                    self.creators[0].backward(new, self)
                    new = self.grad * self.creators[0]
                    self.creators[1].backward(new, self)

                elif self.creation_op == "mm":
                    # z = x.mm(y)
                    # partial wrt x => grad.mm(y^T), wrt y => x^T.mm(grad)
                    c0 = self.creators[0]
                    c1 = self.creators[1]

                    new = self.grad.mm(c1.transpose())
                    c0.backward(new)
                    new = self.grad.transpose().mm(c0).transpose()
                    c1.backward(new)

                elif self.creation_op == "transpose":
                    # partial wrt x => grad.transpose()
                    self.creators[0].backward(self.grad.transpose())

                elif ("sum" in self.creation_op):
                    # creation_op is "sum_<dim>": re-expand grad along <dim>
                    dim = int(self.creation_op.split("_")[1])
                    expanded = self.grad.expand(dim, self.creators[0].data.shape[dim])
                    self.creators[0].backward(expanded)

                elif ("expand" in self.creation_op):
                    # creation_op is "expand_<dim>": partial wrt x => grad.sum(dim)
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.sum(dim))

                elif self.creation_op == "neg":
                    # partial wrt x => -grad
                    self.creators[0].backward(self.grad.__neg__())

                elif self.creation_op == "sigmoid":
                    # z = sigmoid(x), partial wrt x => grad * z*(1-z)
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (self * (ones - self)))

                elif self.creation_op == "tanh":
                    # z = tanh(x), partial wrt x => grad * (1 - z^2)
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (ones - (self * self)))

                elif self.creation_op == "index_select":
                    # Route each gradient row back to the row it was read
                    # from; repeated indices accumulate.
                    new_grad = np.zeros_like(self.creators[0].data)
                    indices_ = self.index_select_indices.data.flatten()
                    grad_ = self.grad.data.reshape(len(indices_), -1)
                    for i in range(len(indices_)):
                        new_grad[indices_[i]] += grad_[i]
                    self.creators[0].backward(Tensor(new_grad))

                elif self.creation_op == "cross_entropy":
                    # Cross entropy with softmax:
                    # partial wrt input => (softmax_output - target_dist).
                    # NOTE(review): the incoming grad is not applied here and
                    # dx is not divided by the batch size, although the
                    # forward loss is a batch mean.
                    dx = self.softmax_output - self.target_dist
                    self.creators[0].backward(Tensor(dx))

    # ----------------------
    # Overloaded Operators
    # ----------------------

    def __add__(self, other):
        """Elementwise sum; tracked as "add" only if both operands use autograd."""
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        """Elementwise negation; tracked as "neg" when autograd is enabled."""
        if self.autograd:
            return Tensor(-self.data,
                          autograd=True,
                          creators=[self],
                          creation_op="neg")
        return Tensor(-self.data)

    def __sub__(self, other):
        """Elementwise difference; tracked as "sub" only if both operands use autograd."""
        if self.autograd and other.autograd:
            return Tensor(self.data - other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        """Elementwise product; tracked as "mul" only if both operands use autograd."""
        if self.autograd and other.autograd:
            return Tensor(self.data * other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="mul")
        return Tensor(self.data * other.data)

    # ----------------------
    # Summation / Expand
    # ----------------------
    def sum(self, dim):
        """
        Sum along dimension `dim`. The op is recorded as 'sum_<dim>' so that
        backprop knows along which axis to re-expand the gradient.
        """
        if self.autograd:
            return Tensor(self.data.sum(dim),
                          autograd=True,
                          creators=[self],
                          creation_op="sum_" + str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim, copies):
        """
        Insert a new axis at position `dim` holding `copies` repetitions of
        the data. The op is recorded as 'expand_<dim>'.
        """
        # Repeat every element `copies` times on a new trailing axis, then
        # move that axis to position `dim`.
        trans_cmd = list(range(len(self.data.shape)))
        trans_cmd.insert(dim, len(self.data.shape))
        new_data = self.data.repeat(copies).reshape(list(self.data.shape) + [copies])
        new_data = new_data.transpose(trans_cmd)

        if self.autograd:
            return Tensor(new_data,
                          autograd=True,
                          creators=[self],
                          creation_op="expand_" + str(dim))
        return Tensor(new_data)

    # ----------------------
    # Transpose / MatMul
    # ----------------------
    def transpose(self):
        """
        Return a Tensor wrapping the transpose of this Tensor's data.
        """
        if self.autograd:
            return Tensor(self.data.transpose(),
                          autograd=True,
                          creators=[self],
                          creation_op="transpose")
        return Tensor(self.data.transpose())

    def mm(self, x):
        """
        Matrix multiply: self.data.dot(x.data). Tracked as "mm" whenever
        this (left) operand has autograd enabled.
        """
        if self.autograd:
            return Tensor(self.data.dot(x.data),
                          autograd=True,
                          creators=[self, x],
                          creation_op="mm")
        return Tensor(self.data.dot(x.data))

    # ----------------------
    # Nonlinearities
    # ----------------------
    def sigmoid(self):
        """Elementwise 1 / (1 + e^-x); tracked as "sigmoid" under autograd."""
        if self.autograd:
            return Tensor(1 / (1 + np.exp(-self.data)),
                          autograd=True,
                          creators=[self],
                          creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        """Elementwise np.tanh; tracked as "tanh" under autograd."""
        if self.autograd:
            return Tensor(np.tanh(self.data),
                          autograd=True,
                          creators=[self],
                          creation_op="tanh")
        return Tensor(np.tanh(self.data))

    # ----------------------
    # Indexing
    # ----------------------
    def index_select(self, indices):
        """
        Return a new Tensor by indexing self at positions given by
        `indices.data`. Under autograd the indices are kept on the result
        (as `index_select_indices`) for use in the backward pass.
        """
        if self.autograd:
            new = Tensor(self.data[indices.data],
                         autograd=True,
                         creators=[self],
                         creation_op="index_select")
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])

    # ----------------------
    # Cross Entropy
    # ----------------------
    def cross_entropy(self, target_indices):
        """
        Softmax cross-entropy loss, treating self.data as unnormalized scores.

        1) Softmax over the last axis of self.data
        2) Flatten `target_indices.data` into one class index per row
        3) loss = mean over rows of -log(prob_of_true_class)
        4) In backprop the gradient is (softmax_output - one_hot_targets)

        The softmax is computed on scores shifted by their row maximum, and
        the loss reads the true-class log-probability directly. Very large
        scores therefore no longer overflow exp(), and a probability that
        underflows to 0 for a non-target class no longer turns the loss
        into nan through 0 * log(0).

        Parameters
        ----------
        target_indices : Tensor
            Integer class indices; flattened, one per row of scores.

        Returns
        -------
        Tensor
            Scalar loss. Under autograd it carries `softmax_output` and
            `target_dist` for the backward pass.
        """
        logits = self.data
        # softmax(x) == softmax(x - c); choosing c = row max keeps exp() <= 1.
        shifted = logits - logits.max(axis=-1, keepdims=True)
        exp_shifted = np.exp(shifted)
        normalizer = exp_shifted.sum(axis=-1, keepdims=True)
        softmax_output = exp_shifted / normalizer

        # Flattening target indices for simple indexing
        t = target_indices.data.flatten()
        # Log-probabilities reshaped to (number_of_targets, num_classes)
        log_p = (shifted - np.log(normalizer)).reshape(len(t), -1)
        # Build one-hot target distribution (needed for the gradient)
        target_dist = np.eye(log_p.shape[1])[t]

        # Negative log-likelihood of the true class, averaged over rows
        loss = -(log_p[np.arange(len(t)), t]).mean()

        if self.autograd:
            out = Tensor(loss,
                         autograd=True,
                         creators=[self],
                         creation_op="cross_entropy")
            # Store the softmax distribution and the target distribution
            # so we can do backprop: grad = (softmax_output - target_dist)
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out

        # If not autograd, return a plain Tensor with no backprop references
        return Tensor(loss)

    # ----------------------
    # String / Debug
    # ----------------------
    def __repr__(self):
        """Return the repr() of the wrapped NumPy array."""
        return str(self.data.__repr__())

    def __str__(self):
        """Return the str() of the wrapped NumPy array (used by print)."""
        return str(self.data.__str__())


# Optional Nonlinear Layers (if needed)
class Tanh(Layer):
    """
    Layer wrapper around Tensor.tanh(); it registers no parameters of its own.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input):
        # Delegate to the input's own tanh() so the op is recorded by the Tensor.
        activated = input.tanh()
        return activated
    
class Sigmoid(Layer):
    """
    Layer wrapper around Tensor.sigmoid(); it registers no parameters of its own.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input):
        # Delegate to the input's own sigmoid() so the op is recorded by the Tensor.
        activated = input.sigmoid()
        return activated


In [28]:
class CrossEntropyLoss:
    """
    Loss object exposing a layer-style forward(input, target) call that
    forwards to `input.cross_entropy(target)`.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        # The actual computation lives on the input object.
        loss = input.cross_entropy(target)
        return loss

In [29]:
import numpy
# Fix the global RNG so the run is repeatable.
np.random.seed(0)

# data indices
data = Tensor(np.array([1, 2, 1, 2]), autograd=True)

# target indices (one class index per example)
target = Tensor(np.array([0, 1, 0, 1]), autograd=True)

layers = [Embedding(3, 3), Tanh(), Linear(3, 4)]
model = Sequential(layers)
criterion = CrossEntropyLoss()

optim = SGD(parameters=model.get_parameters(), alpha=0.1)

for i in range(10):

    # Predict
    pred = model.forward(data)

    # Compare
    loss = criterion.forward(pred, target)

    # Learn: seed backprop with ones shaped like the loss, then update.
    seed_grad = Tensor(np.ones_like(loss.data))
    loss.backward(seed_grad)
    optim.step()
    print(loss)

1.3885032434928422
0.9558181509266036
0.6823083585795604
0.509525996749312
0.39574491472895856
0.3175252728534828
0.26172228619642157
0.22061283923954234
0.18946427334830074
0.16527389263866676


# Part 17: The Recurrent Neural Network Layer

![](./images/image%20copy%202.png)

In [30]:
class RNNCell(Layer):
    """
    One time step of a simple recurrent network.

    Given the current input and the previous hidden state it computes
      hidden_t = activation( w_ih(input_t) + w_hh(hidden_(t-1)) )
      output_t = w_ho(hidden_t)
    where w_ih, w_hh and w_ho are Linear layers.
    """

    def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
        """
        Parameters
        ----------
        n_inputs : int
            Dimensionality of each input vector.
        n_hidden : int
            Size of the hidden state (number of hidden units).
        n_output : int
            Size of the output vector for each time step.
        activation : str
            Nonlinearity of the hidden update: 'sigmoid' or 'tanh'.

        Raises
        ------
        Exception
            If `activation` is neither 'sigmoid' nor 'tanh'.
        """
        super().__init__()

        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output

        # Resolve the activation name to its layer class, then instantiate it.
        if activation == 'sigmoid':
            activation_layer = Sigmoid
        elif activation == 'tanh':
            activation_layer = Tanh
        else:
            raise Exception("Non-linearity not found. Use 'sigmoid' or 'tanh'.")
        self.activation = activation_layer()

        # input -> hidden, hidden -> hidden, hidden -> output
        self.w_ih = Linear(n_inputs, n_hidden)
        self.w_hh = Linear(n_hidden, n_hidden)
        self.w_ho = Linear(n_hidden, n_output)

        # Collect the sub-layers' parameters so one optimizer sees them all.
        for sublayer in (self.w_ih, self.w_hh, self.w_ho):
            self.parameters += sublayer.get_parameters()

    def forward(self, input, hidden):
        """
        Run the cell for one time step.

        Parameters
        ----------
        input : Tensor
            The input vector for this time step (batch_size, n_inputs).
        hidden : Tensor
            The previous hidden state (batch_size, n_hidden).

        Returns
        -------
        output : Tensor
            The output vector for this time step (batch_size, n_output).
        new_hidden : Tensor
            The new hidden state (batch_size, n_hidden).
        """
        # Contribution of the previous hidden state, then of the input.
        hidden_term = self.w_hh.forward(hidden)
        input_term = self.w_ih.forward(input)

        # Sum both contributions and squash with the chosen activation.
        new_hidden = self.activation.forward(input_term + hidden_term)

        # Project the fresh hidden state to the output dimension.
        return self.w_ho.forward(new_hidden), new_hidden

    def init_hidden(self, batch_size=1):
        """
        Build the initial hidden state.

        Returns
        -------
        hidden : Tensor
            A (batch_size, n_hidden)-shaped Tensor of zeros with autograd on.
        """
        shape = (batch_size, self.n_hidden)
        return Tensor(np.zeros(shape), autograd=True)


![](./images/image.png)

![](./images/image%20copy.png)

In [31]:
import sys, random, math
from collections import Counter
import numpy as np

# --------------------------------------------------------------------
# 1) DATA LOADING AND PREPARATION
# --------------------------------------------------------------------
# Read the whole bAbI training file. The context manager closes the handle
# even if reading raises, which the manual open()/close() pair did not.
with open('tasksv11/en/qa1_single-supporting-fact_train.txt', 'r') as f:
    raw = f.readlines()

# 'raw' contains lines of text in a bAbI format.
# We take only the first 1000 lines for demonstration.
# Each line is lowercased, stripped of its newline and split on spaces;
# [1:] drops the first token (the bAbI line number).
tokens = [line.lower().replace("\n", "").split(" ")[1:] for line in raw[0:1000]]

# 'tokens' is now a list of lists of words.

# Left-pad every line with '-' up to length 6. Lines already longer than 6
# are left unchanged, because a negative repeat count yields an empty list.
tokens = [['-'] * (6 - len(line)) + line for line in tokens]

# 2) BUILD VOCABULARY
vocab = set()
for sent in tokens:
    vocab.update(sent)
vocab = list(vocab)

# Build a mapping from word -> integer index (its position in 'vocab')
word2index = {word: i for i, word in enumerate(vocab)}

# A small helper function to convert words to indices
def words2indices(sentence):
    """Map each word of `sentence` to its index in the global `word2index`."""
    return list(map(word2index.__getitem__, sentence))

# Convert every line in 'tokens' to a list of indices
# One list of vocabulary indices per line, via the helper defined above.
indices = [words2indices(line) for line in tokens]

# Stack into a NumPy array so columns can be sliced per time step.
data = np.array(indices)

# --------------------------------------------------------------------
# 3) BUILD MODEL COMPONENTS (Embedding + RNN + Criterion)
# --------------------------------------------------------------------
# We'll embed each token index into a 16-dimensional vector
vocab_size = len(vocab)

# Each token index is embedded into a 16-dimensional vector.
embed = Embedding(vocab_size=vocab_size, dim=16)

# RNNCell: 16-d input, 16-d hidden state, one output score per vocabulary word.
model = RNNCell(n_inputs=16, n_hidden=16, n_output=vocab_size)

# Cross-entropy loss for classification
criterion = CrossEntropyLoss()

# SGD over the RNN parameters followed by the embedding parameters.
trainable = model.get_parameters() + embed.get_parameters()
optim = SGD(parameters=trainable, alpha=0.05)

# --------------------------------------------------------------------
# 4) TRAINING LOOP
# --------------------------------------------------------------------
for iter in range(1000):
    batch_size = 100
    total_loss = 0

    # The same first `batch_size` lines are used on every iteration.
    batch = data[0:batch_size]

    # Start each pass from an all-zero hidden state.
    hidden = model.init_hidden(batch_size=batch_size)

    # Feed token columns 0..4 one step at a time.
    for t in range(5):
        # Column t of the batch => shape (batch_size,)
        input = Tensor(batch[:, t], autograd=True)

        # Look up the embeddings for those indices.
        rnn_input = embed.forward(input=input)

        # Advance the RNN; only the last step's output is used below.
        output, hidden = model.forward(input=rnn_input, hidden=hidden)

    # 't' still holds its final loop value (4), so the target is column 5.
    target = Tensor(batch[:, t + 1], autograd=True)

    # Cross-entropy (softmax is applied inside cross_entropy)
    loss = criterion.forward(output, target)

    # Backprop through the RNN steps and the embedding, then update.
    loss.backward()
    optim.step()

    total_loss += loss.data

    # Report only every 200th iteration.
    if iter % 200 != 0:
        continue

    # Fraction of rows whose highest-scoring word equals the target.
    predicted = np.argmax(output.data, axis=1)
    p_correct = (target.data == predicted).mean()
    # total_loss is reset at the top of every iteration, so the printed
    # value is this iteration's loss divided by len(data)/batch_size.
    print("Loss:", total_loss / (len(data)/batch_size), "% Correct:", p_correct)

# --------------------------------------------------------------------
# 5) TEST / DEMO
# --------------------------------------------------------------------
batch_size = 1
hidden = model.init_hidden(batch_size=batch_size)

# Only the first line of the data set is used for the demo.
sample = data[0:batch_size]

# Feed its first 5 tokens one by one.
for t in range(5):
    input = Tensor(sample[:, t], autograd=True)
    rnn_input = embed.forward(input=input)
    output, hidden = model.forward(input=rnn_input, hidden=hidden)

# 't' is still 4 here, so the target is the last (6th) token of the line.
target = Tensor(sample[:, t + 1], autograd=True)
loss = criterion.forward(output, target)

# Context = every token of the line except the last, each followed by a space.
context_words = [vocab[idx] for idx in sample[0][0:-1]]
ctx = "".join(word + " " for word in context_words)

print("Context:", ctx)
print("True:", vocab[target.data[0]])
print("Pred:", vocab[output.data.argmax()])


Loss: 0.4767529043186908 % Correct: 0.0
Loss: 0.17431028405025537 % Correct: 0.27
Loss: 0.1614921436370119 % Correct: 0.33
Loss: 0.14318897302930267 % Correct: 0.34
Loss: 0.1368414224697062 % Correct: 0.37
Context: - mary moved to the 
True: bathroom.
Pred: garden.
