# Generating Data w/ Numpy

In [1]:
import numpy as np

### np.empty(), np.zeros(), np.ones(), np.full()

1. `np.empty(shape)`: creates a new array with a specified shape and data type, without initializing elements. 

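    (+) skipping the initialization step makes the function fast
    (-) the array simply exposes whatever values already sit in the allocated memory - often zeros, but this is not guaranteed - so the output is inconsistent between runs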

    It is mainly used when performance is critical, and the array will be fully overwritten later.


    When consistent output is more important than performance:

2. `np.zeros(shape)`: Creates an array of specified shape filled with zeros.

3. `np.ones(shape)`: Creates an array of specified shape filled with ones. 

4. `np.full(shape, fill_value)`: Creates an array of specified shape filled with a custom constant value. 

In [8]:
# np.empty only reserves memory for a 2 x 3 float array; the values displayed are
# whatever was already stored there, so they may differ from run to run.
array_empty = np.empty(shape=(2, 3))
array_empty

array([[0., 0., 0.],
       [0., 0., 0.]])

In [7]:
# Same 2 x 3 shape as before, but every element is explicitly initialised to 0.0.
array_zeros = np.zeros(shape=(2, 3))
array_zeros

array([[0., 0., 0.],
       [0., 0., 0.]])

In [13]:
# The optional dtype argument overrides the default float type, giving integer ones.
array_ones = np.ones(shape=(2, 3), dtype="int")
array_ones

array([[1, 1, 1],
       [1, 1, 1]])

In [11]:
# np.full needs one more mandatory argument than np.zeros / np.ones:
# fill_value, the constant (here a scalar) written into every element.
array_full = np.full(shape=(2, 3), fill_value=3)
array_full

array([[3, 3, 3],
       [3, 3, 3]])

In [14]:
# The fill value does not have to be numeric: a string produces a unicode array
# whose dtype is sized to the length of the text.
array_full = np.full(shape=(2, 3), fill_value="Hello")
array_full

array([['Hello', 'Hello', 'Hello'],
       ['Hello', 'Hello', 'Hello']], dtype='<U5')

### "_like" functions

The "_like" functions in NumPy (e.g., np.empty_like, np.zeros_like, np.ones_like, np.full_like) create new arrays based on the shape and data type of an existing array, while optionally customizing the values.

In [15]:
# A 3 x 5 integer matrix used as the template for the "_like" functions below.
matrix_A = np.array([
    [1, 9, 8, 5, 0],
    [2, 23, 56, 8, 90],
    [12, 8, 67, 0, 4],
])
matrix_A

array([[ 1,  9,  8,  5,  0],
       [ 2, 23, 56,  8, 90],
       [12,  8, 67,  0,  4]])

In [16]:
# Uninitialised array with the shape and dtype of matrix_A; the displayed values are
# arbitrary memory contents and are not guaranteed to be zeros.
array_empty_like = np.empty_like(matrix_A)
array_empty_like

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [18]:
# Zeros with the same shape and dtype as matrix_A.
array_zeros_like = np.zeros_like(matrix_A)
array_zeros_like

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [17]:
# Ones with the same shape and dtype as matrix_A.
array_ones_like = np.ones_like(matrix_A)
array_ones_like

array([[1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [19]:
# Same shape and dtype as matrix_A, with every element set to the constant 9.
array_full_like = np.full_like(matrix_A, fill_value=9)
array_full_like

array([[9, 9, 9, 9, 9],
       [9, 9, 9, 9, 9],
       [9, 9, 9, 9, 9]])

### np.arange()

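Creates a sequence of evenly spaced values within the half-open interval [start, stop). With the default step of 1 these are consecutive integers.

Similar to the built-in `range` function in Python, except that it returns an array instead of a range object and also accepts non-integer steps.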

In [20]:
range(30)

# range(30) evaluates to a lazy range object, not to the numbers themselves.
# list(range(30)) materialises every value of that range into a list.

range(0, 30)

In [21]:
list(range(30))  # materialise the range: the integers 0 through 29

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29]

In [22]:
## Creates an ndarray with the values in this range.
## With a single argument, np.arange counts from 0 up to, but not including, that value.
array_rng = np.arange(30)
array_rng

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [24]:
# An explicit start shifts the lower bound; stop is still excluded, so this gives 7..29.
array_rng = np.arange(start=7, stop=30)
array_rng

array([ 7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
       24, 25, 26, 27, 28, 29])

In [28]:
# A non-integer step is allowed and yields a float array.
# (30 - 0) / 2.5 = 12 elements: 0.0, 2.5, ..., 27.5 (the stop value 30 is excluded).
array_rng = np.arange(start=0, stop=30, step=2.5)
array_rng

array([ 0. ,  2.5,  5. ,  7.5, 10. , 12.5, 15. , 17.5, 20. , 22.5, 25. ,
       27.5])

In [27]:
# Caution: combining a non-integer step with an integer dtype gives surprising results.
# The length is still ceil((stop - start) / step) = 12, as in the float version,
# but the values advance by the integer-cast increment int(0 + 2.5) - int(0) = 2,
# so the array ends at 22 instead of covering the range up to 30.
array_rng = np.arange(start=0, stop=30, step=2.5, dtype=np.int32)
array_rng

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22], dtype=int32)

## Random Generators

### Defining Random Generators

In [29]:
## We load two classes from the numpy.random module.
from numpy.random import PCG64 as pcg # A bit generator produces a sequence of random bits, such as 101011
from numpy.random import Generator as gen # These bits are taken as input by a generator and converted to a random number

# example: 
# bit generator --> Random bits: 101011 (binary)
# Generator --> Converted integer: (1 × 2**5) + (0 × 2**4) + (1 × 2**3) + (0 × 2**2) + (1 × 2**1) + (1 × 2**0) = 43 (decimal)


In [30]:
# RG is short for Random Generator
# No seed is passed, so the bit generator starts from fresh, unpredictable entropy
# and the displayed number changes on every run.
array_RG = gen(pcg())
array_RG.normal()  # one draw from the standard normal distribution (mean 0, std 1)

-0.4725734908192582

In [32]:
# The size argument turns the single draw into a 2 x 4 array of independent draws.
array_RG = gen(pcg())
array_RG.normal(size=(2, 4))

array([[ 1.22151491, -1.20447523, -0.93347016, -0.39527021],
       [-0.1601056 ,  0.95612318,  0.40794609,  0.25033737]])

In [34]:
# A fixed seed makes the "random" sequence reproducible (pseudo-random data):
# every generator built with seed 365 produces exactly the same numbers.
array_RG = gen(pcg(seed=365))
array_RG.normal(size=(2, 4))

array([[-0.13640899,  0.09414431, -0.06300442,  1.05391641],
       [-0.6866818 , -0.50922173, -0.7999526 ,  0.73041825]])

In [35]:
# Rebuilding the generator with the same seed replays the sequence from the start,
# so the first draw is replicated exactly.
array_RG = gen(pcg(seed=365))
array_RG.normal(size=(2, 4))

array([[-0.13640899,  0.09414431, -0.06300442,  1.05391641],
       [-0.6866818 , -0.50922173, -0.7999526 ,  0.73041825]])

In [36]:
array_RG.normal(size = (2,4)) # the seed is NOT reset here: each call advances the generator's internal state
# so this call returns the NEXT 8 values of the same seeded sequence, which differ from the first 8 drawn above

array([[ 0.08825439, -2.1177576 ,  0.65526774, -0.48095012],
       [-0.5519114 , -0.58578662, -0.98257896,  1.12378166]])

In [37]:
# Creating a new generator with seed 365 restarts the sequence from its beginning,
# which is why the very first set of numbers appears again.
array_RG = gen(pcg(seed=365))
array_RG.normal(size=(2, 4))

array([[-0.13640899,  0.09414431, -0.06300442,  1.05391641],
       [-0.6866818 , -0.50922173, -0.7999526 ,  0.73041825]])

### Generating Integers, Probabilities and Random Choices

In [None]:
# Re-initialize the generator object at seed 365.
# Run this cell before the cells below; otherwise they continue from whatever
# state the previous generator was left in and produce different numbers.
array_RG = gen(pcg(seed=365))

In [38]:
# Generator method that draws random integers into an array of the requested shape.
# With a single bound the range starts at 0 and is half-open: 0 <= value < 10.
array_RG.integers(10, size=(2, 4))

array([[1, 8, 7, 4],
       [4, 8, 6, 4]])

In [39]:
# With an explicit lower boundary: 10 <= value < 99 (high is still excluded).
array_RG.integers(low=10, high=99, size=(2, 4))

array([[91, 16, 79, 71],
       [19, 42, 23, 36]])

In [40]:
# Generator method that draws uniform random floats in [0, 1), usable as random
# probabilities, into an array of the requested shape.
array_RG.random(size=(2, 4))

array([[0.4996155 , 0.4865245 , 0.62740703, 0.54952637],
       [0.64894629, 0.04411757, 0.7206516 , 0.84594003]])

In [41]:
# Every element is picked at random from the given list, each value being equally likely.
array_RG.choice([1, 2, 3, 4, 5], size=(2, 4))

array([[2, 1, 5, 4],
       [5, 2, 4, 3]])

In [43]:
array_RG.choice([1,2,3,4,5], p = [0.6, 0.1, 0.1, 0.1, 0.1], size = (2,4)) # p holds one probability per value; make sure they add up to 1!
# without p every value is equally likely (here 1/5 = 0.2 each); p lets us weight the choices instead

array([[1, 3, 1, 1],
       [4, 1, 1, 1]])

### Generating Arrays From Known Distributions

In [44]:
# The default Poisson distribution (lam is left at its default of 1).
array_RG = gen(pcg(seed=365))
array_RG.poisson(size=(5, 5))

array([[2, 0, 1, 1, 2],
       [1, 1, 0, 1, 1],
       [1, 2, 1, 1, 0],
       [0, 1, 0, 2, 1],
       [0, 1, 0, 0, 2]])

In [45]:
# Specifying lambda: the draws now centre around the expected value 10.
array_RG = gen(pcg(seed=365))
array_RG.poisson(lam=10, size=(5, 5))

array([[11, 12, 12, 14, 13],
       [ 9, 10, 11, 11,  8],
       [11,  8, 10,  9, 14],
       [ 7,  8,  9, 15, 15],
       [13,  8,  8,  7,  9]])

In [47]:
# A binomial distribution with 100 trials and success probability p = 0.4;
# each element is the number of successes observed in those 100 trials.
array_RG = gen(pcg(seed=365))
array_RG.binomial(n=100, p=0.4, size=(5, 5))

array([[42, 44, 30, 36, 45],
       [36, 41, 38, 42, 41],
       [35, 31, 35, 46, 29],
       [41, 41, 46, 34, 48],
       [45, 45, 45, 40, 43]])

In [48]:
# A logistic distribution with location (centre) 9 and scale 1.2.
array_RG = gen(pcg(seed=365))
array_RG.logistic(loc=9, scale=1.2, size=(5, 5))

array([[10.37767822, 10.42451863,  9.63404367,  7.36153427,  9.82286787],
       [ 5.81223125, 10.09354231,  6.46790532, 11.38740256,  8.97147918],
       [10.85844698,  8.79081317,  5.962079  ,  9.99560681,  8.34539118],
       [ 7.97105522,  8.9981544 ,  8.93530194,  9.6253307 ,  9.23850869],
       [ 9.73729284,  5.3090678 , 10.13723528, 11.04372782,  7.11078651]])

The full list of distributions available on a Generator is in the NumPy documentation: https://numpy.org/doc/stable/reference/random/generator.html

### Applications of Random Generators

In [49]:
# Test creation: create pseudo-random data in CSV to test how well program performs

# Number of observations (rows) generated for every column. Defined once so that
# all columns are guaranteed to have the same length.
N_ROWS = 1000

# Fixed seed so that the synthetic dataset can be regenerated exactly.
array_RG = gen(pcg(seed=365))

# Create the individual columns of the dataset we're creating.
# The columns are drawn one after another from the same generator, so their
# order matters for reproducing the exact values.
array_column_1 = array_RG.normal(loc=2, scale=3, size=N_ROWS)      # normal: mean 2, standard deviation 3
array_column_2 = array_RG.normal(loc=7, scale=2, size=N_ROWS)      # normal: mean 7, standard deviation 2
array_column_3 = array_RG.logistic(loc=11, scale=3, size=N_ROWS)   # logistic: location 11, scale 3
array_column_4 = array_RG.exponential(scale=4, size=N_ROWS)        # exponential: scale (mean) 4
array_column_5 = array_RG.geometric(p=0.7, size=N_ROWS)            # geometric: success probability 0.7, integer values >= 1

In [50]:
# Combine the five 1-D arrays we created earlier into a single 2-D dataset.
# np.column_stack places each array in its own column, so the result has one row
# per observation and one column per variable, with no separate transpose step.
# The integer column is upcast to float so that all columns share one dtype.
random_test_data = np.column_stack(
    [array_column_1, array_column_2, array_column_3, array_column_4, array_column_5]
)
random_test_data

array([[ 1.59077303,  6.42174295, 10.14698427,  6.91500737,  1.        ],
       [ 2.28243293,  8.57902322, 15.93309953,  6.243605  ,  1.        ],
       [ 1.81098674,  5.17270135, -0.46878789,  2.44997251,  1.        ],
       ...,
       [ 0.1973629 ,  4.3465854 ,  2.66485989,  0.80935387,  1.        ],
       [-2.21015722,  8.2176402 , 12.69328115,  0.50644607,  2.        ],
       [ 2.91161235,  7.90337695, 11.79840961,  4.86816939,  1.        ]])

In [51]:
random_test_data.shape  # (rows, columns) of the assembled dataset

(1000, 5)

In [52]:
# Saving the array to an external file we're creating.
#
# Arguments, in order:
#   "Random-Test-from-NumPy.csv" -> name of the file that is written
#   random_test_data             -> the data we're exporting (saving to the external file)
#   fmt="%s"                     -> every value is written using its string representation
#   delimiter=","                -> values within a row are separated by commas
#
# We'll talk more about these in just a bit.
np.savetxt("Random-Test-from-NumPy.csv", random_test_data, fmt="%s", delimiter=",")

In [53]:
# Importing the data from the file we just created.
# The delimiter has to match the one used when the file was written.
np.genfromtxt("Random-Test-from-NumPy.csv", delimiter=",")

array([[ 1.59077303,  6.42174295, 10.14698427,  6.91500737,  1.        ],
       [ 2.28243293,  8.57902322, 15.93309953,  6.243605  ,  1.        ],
       [ 1.81098674,  5.17270135, -0.46878789,  2.44997251,  1.        ],
       ...,
       [ 0.1973629 ,  4.3465854 ,  2.66485989,  0.80935387,  1.        ],
       [-2.21015722,  8.2176402 , 12.69328115,  0.50644607,  2.        ],
       [ 2.91161235,  7.90337695, 11.79840961,  4.86816939,  1.        ]])

In [54]:
# Load the file again, this time keeping the result in a variable,
# and show it in plain-text form with print().
rand_test_data = np.genfromtxt("Random-Test-from-NumPy.csv", delimiter=",")
print(rand_test_data)

[[ 1.59077303  6.42174295 10.14698427  6.91500737  1.        ]
 [ 2.28243293  8.57902322 15.93309953  6.243605    1.        ]
 [ 1.81098674  5.17270135 -0.46878789  2.44997251  1.        ]
 ...
 [ 0.1973629   4.3465854   2.66485989  0.80935387  1.        ]
 [-2.21015722  8.2176402  12.69328115  0.50644607  2.        ]
 [ 2.91161235  7.90337695 11.79840961  4.86816939  1.        ]]
