# 1️⃣ What NumPy actually is (not marketing fluff)
NumPy = fast, memory-efficient, vectorized numerical computation in Python.

In [None]:
# Installation
pip install numpy

In [1]:
# If you write Python loops over large data instead of using NumPy operations → you are doing it wrong, and it will be slow.

import time

import numpy as np  # import

# 2️⃣ Creating arrays (the only data structure that matters here)

In [2]:
# A flat list becomes a 1-D array; a list of equal-length rows becomes a 2-D array.
a = np.array([1, 2, 3])
b = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
# Both arrays go through a single print() call, separated by one space.
print(a, b)

[1 2 3] [[1 2 3]
 [4 5 6]]


In [3]:
# Built-ins you must know. All arrays are built first, in the same order as
# they are printed, so the random draw happens at the same point as before.
demo_arrays = (
    np.zeros((2, 3)),        # shape is passed as a tuple
    np.ones((3, 3)),
    np.eye(3),               # identity matrix
    np.arange(0, 10, 2),     # start, stop (exclusive), step
    np.linspace(0, 1, 5),    # 5 evenly spaced points, both endpoints included
    np.random.rand(3, 3),    # no seed is set in this cell, so values vary per run
)
for demo in demo_arrays:
    print(demo)

[[0. 0. 0.]
 [0. 0. 0.]]
[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
[0 2 4 6 8]
[0.   0.25 0.5  0.75 1.  ]
[[0.79111472 0.32034126 0.01047303]
 [0.22684277 0.327331   0.89862449]
 [0.8568238  0.72096029 0.63163826]]


# 3️⃣ Shape, type, memory (fundamentals people skip)

In [4]:
# Metadata of the 1-D array `a`, printed one attribute per line:
# shape (tuple of axis lengths), ndim (number of axes),
# size (total element count), dtype (element type).
for attribute in ("shape", "ndim", "size", "dtype"):
    print(getattr(a, attribute))

(3,)
1
3
int64


In [5]:
# Changing shape of the 1-D array `a` (shape (3,)).
as_column = a.reshape(3, 1)   # 3 rows x 1 column
print(as_column)
flattened = a.flatten()       # 1-D result
print(flattened)
# Transpose reverses the axes. `a` has only one axis, so a.T prints the same
# values as `a`; reshape first if you need an actual column vector.
print(a.T)

[[1]
 [2]
 [3]]
[1 2 3]
[1 2 3]


# 4️⃣ Indexing & slicing (where most bugs come from)

In [6]:
# Rebind `b` to a 3x3 matrix for the indexing examples in this section.
b = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
first_element = a[0]
print(first_element)
# The slice end (4) is past the end of the 3-element `a`; slicing clips it
# instead of raising, so only two elements come back.
print(a[1:4])
second_column = b[:, 1]   # every row, column index 1
print(second_column)
corner = b[0, 2]          # row 0, column 2
print(corner)

1
[2 3]
[2 5 8]
3


In [7]:
# Boolean indexing (critical): a mask with the same shape as `b` selects the
# elements where it is True and returns them as a flat 1-D array.
greater_than_five = b > 5
print(b[greater_than_five])

[6 7 8 9]


In [8]:
# Fancy indexing: a list of row indices picks whole rows in the listed order.
row_order = [0, 2, 1]
print(b[row_order])

[[1 2 3]
 [7 8 9]
 [4 5 6]]


# 5️⃣ Vectorized operations (why NumPy exists)

In [9]:
# Comparison: an element-wise test yields a boolean array of the same shape.
for values in (a, b):
    print(values > 5)
# Summing a boolean array counts its True entries.
for values in (a, b):
    print((values > 5).sum())

[False False False]
[[False False False]
 [False False  True]
 [ True  True  True]]
0
4


In [10]:
# Reductions: collapse the whole array `a` into a single scalar.
print(a.sum())
print(a.mean())
print(a.min())
print(a.max())
print(a.std())     # population standard deviation (NumPy's default, ddof=0)
# Universal functions: applied element-wise, returning a new array.
print(np.sqrt(a))
print(np.log(a))   # natural logarithm

6
2.0
1
3
0.816496580927726
[1.         1.41421356 1.73205081]
[0.         0.69314718 1.09861229]


# 7️⃣ Broadcasting (this is where you level up)

In [11]:
# Broadcasting examples
# Fresh names are used on purpose: earlier cells bind `a` and `b` to arrays of
# different shapes, and rebinding them here would make those cells give
# different results if they were re-run after this one.
matrix = np.array([[1, 2, 3], [4, 5, 6]])   # shape (2, 3)
row = np.array([10, 20, 30])                # shape (3,)
# (3,) lines up with the last axis of (2, 3): `row` is added to every row.
print(matrix + row)
scale = np.array([[1], [10]])               # shape (2, 1)
# (2, 1) is stretched along the columns: row i is multiplied by scale[i].
print(matrix * scale)

[[11 22 33]
 [14 25 36]]
[[ 1  2  3]
 [40 50 60]]


# 8️⃣ Combine & split arrays quickly

In [12]:
# Two 2x2 blocks to glue together.
c1 = np.array([[1, 2], [3, 4]])
c2 = np.array([[5, 6], [7, 8]])
blocks = [c1, c2]

print(np.concatenate(blocks, axis=0))   # axis=0: c2's rows go below c1's
print(np.concatenate(blocks, axis=1))   # axis=1: c2's columns go to the right
print(np.vstack(blocks))                # prints the same result as axis=0 above
print(np.hstack(blocks))                # prints the same result as axis=1 above

# Splitting 6 elements into 3 parts gives a list of three equal-length arrays.
sequence = np.arange(6)
print(np.split(sequence, 3))

[[1 2]
 [3 4]
 [5 6]
 [7 8]]
[[1 2 5 6]
 [3 4 7 8]]
[[1 2]
 [3 4]
 [5 6]
 [7 8]]
[[1 2 5 6]
 [3 4 7 8]]
[array([0, 1]), array([2, 3]), array([4, 5])]


# 9️⃣ Sorting, uniques, and args

In [13]:
x = np.array([3, 1, 2, 2, 5])

# np.sort returns a sorted copy; `x` keeps its original order.
print(np.sort(x))

# With return_counts=True the result is a (unique values, counts) pair.
values_and_counts = np.unique(x, return_counts=True)
print(values_and_counts)

# argsort gives the indices that would sort `x`; indexing with them sorts it.
order = np.argsort(x)
print(order)
print(x[order])

[1 2 2 3 5]
(array([1, 2, 3, 5]), array([1, 2, 1, 1]))
[1 2 3 0 4]
[1 2 2 3 5]


# 🔟 Quick workflow: standardize columns

In [14]:
def standardize_columns(values):
    """Z-score every column: subtract the column mean, divide by the column std.

    Parameters
    ----------
    values : array-like, 2-D
        Rows are observations, columns are features.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``values``. A constant column
        (standard deviation 0) comes back as all zeros instead of NaN.
    """
    values = np.asarray(values, dtype=float)
    col_means = values.mean(axis=0)
    col_std = values.std(axis=0)   # population std (ddof=0)
    # A constant column would compute 0 / 0 -> NaN plus a RuntimeWarning.
    # Dividing by 1 instead leaves the already-zero centered values as 0.
    safe_std = np.where(col_std == 0, 1.0, col_std)
    return (values - col_means) / safe_std


data = np.arange(12).reshape(3, 4)
standardized = standardize_columns(data)
print(standardized)

[[-1.22474487 -1.22474487 -1.22474487 -1.22474487]
 [ 0.          0.          0.          0.        ]
 [ 1.22474487  1.22474487  1.22474487  1.22474487]]


In [15]:
# Bonus: random seeds for reproducibility
# Seeding NumPy's global generator makes the draws below come out the same on
# every run of this cell.
np.random.seed(42)
uniform_sample = np.random.rand(2, 3)            # uniform draws, shape (2, 3)
print(uniform_sample)
normal_sample = np.random.normal(0, 1, (2, 3))   # loc=0, scale=1, size=(2, 3)
print(normal_sample)

[[0.37454012 0.95071431 0.73199394]
 [0.59865848 0.15601864 0.15599452]]
[[ 1.57921282  0.76743473 -0.46947439]
 [ 0.54256004 -0.46341769 -0.46572975]]


# 1️⃣1️⃣ Save and load arrays quickly

In [16]:
arr = np.arange(6).reshape(2, 3)

# Binary .npy round trip: fast, and the array comes back with its dtype and shape.
npy_path = 'tmp_array.npy'
np.save(npy_path, arr)
loaded = np.load(npy_path)
print(loaded)

# Text/CSV round trip: human-readable, but the values are read back as floats.
csv_path = 'tmp_array.csv'
np.savetxt(csv_path, arr, delimiter=',')
loaded_txt = np.loadtxt(csv_path, delimiter=',')
print(loaded_txt)

[[0 1 2]
 [3 4 5]]
[[0. 1. 2.]
 [3. 4. 5.]]


# 1️⃣2️⃣ Small performance checklist

- Prefer vectorized ops over Python loops
- Preallocate arrays instead of appending in a loop
- Use `axis` arguments for reductions; avoid manual reshape + sum
- For large random draws, set `np.random.seed` once for reproducibility
- Profile with small slices first; then scale up

# 1️⃣3️⃣ Views vs. copies (bug finder)

In [17]:
base = np.arange(5)
window = slice(1, 4)

view = base[window]          # basic slice: shares memory with `base`
copy = base[window].copy()   # explicit copy: owns its own data

view[0] = 99                 # also changes base[1]
copy[0] = 42                 # `base` is not affected

print('base after view change:', base)
print('view:', view)
print('copy:', copy)

base after view change: [ 0 99  2  3  4]
view: [99  2  3]
copy: [42  2  3]


# 1️⃣4️⃣ NaN-aware stats

In [18]:
# One missing value (NaN) in an otherwise numeric array.
data = np.array([1, 2, np.nan, 4])

# The nan* reductions skip NaN entries.
nan_free_mean = np.nanmean(data)
print('mean ignoring NaN:', nan_free_mean)
nan_free_sum = np.nansum(data)
print('sum ignoring NaN:', nan_free_sum)

# Boolean mask marking where the NaNs are.
nan_mask = np.isnan(data)
print('isnan mask:', nan_mask)

# Replace NaN with a chosen fill value.
filled = np.nan_to_num(data, nan=0.0)
print('filled:', filled)

mean ignoring NaN: 2.3333333333333335
sum ignoring NaN: 7.0
isnan mask: [False False  True False]
filled: [1. 2. 0. 4.]


# 1️⃣5️⃣ Matrix ops you actually need

In [19]:
m1 = np.array([[1, 2], [3, 4]])
m2 = np.array([[5, 6], [7, 8]])

# Matrix product, spelled two ways; for 2-D inputs both print the same result.
product = m1 @ m2
print('matmul:', product)
print('dot:', m1.dot(m2))

print('transpose:', m1.transpose())

# Linear-algebra routines live in np.linalg.
inverse = np.linalg.inv(m1)
print('inverse:', inverse)
eigenvalues = np.linalg.eigvals(m1)
print('eigenvalues:', eigenvalues)
# With no extra arguments, norm() of a 2-D array is the Frobenius norm.
frobenius = np.linalg.norm(m1)
print('norm:', frobenius)

matmul: [[19 22]
 [43 50]]
dot: [[19 22]
 [43 50]]
transpose: [[1 3]
 [2 4]]
inverse: [[-2.   1. ]
 [ 1.5 -0.5]]
eigenvalues: [-0.37228132  5.37228132]
norm: 5.477225575051661


# 1️⃣6️⃣ When to reach for pandas instead

- Tabular data with mixed dtypes, missing values, joins, and group-bys → use pandas
- NumPy shines for dense numeric arrays and math-heavy pipelines
- Converting: `df.to_numpy()` to drop into NumPy; `pd.DataFrame(arr)` to wrap an array

# 1️⃣7️⃣ Debugging shape errors fast

In [20]:
x = np.random.rand(2, 3)
y = np.random.rand(3)
print('x shape', x.shape)
print('y shape', y.shape)

# (2, 3) + (3,): the trailing axes match, so `y` is added to every row of `x`.
try:
    xy_sum = x + y
except ValueError as err:
    print('error:', err)
else:
    print(xy_sum)

z = np.random.rand(2)
print('z shape', z.shape)

# (2, 3) + (2,): the trailing axes are 3 vs 2, so broadcasting raises ValueError.
try:
    xz_sum = x + z
except ValueError as err:
    print('error:', err)
else:
    print(xz_sum)

# Turning `z` into a (2, 1) column lets it broadcast across the columns of `x`.
z_column = z[:, np.newaxis]
print('fix with new axis:', x + z_column)

x shape (2, 3)
y shape (3,)
[[0.59638689 0.89111827 0.888015  ]
 [0.58337379 0.97821474 0.59556384]]
z shape (2,)
error: operands could not be broadcast together with shapes (2,3) (2,) 
fix with new axis: [[1.0894182  1.30993239 1.21712098]
 [0.49090292 0.81152668 0.33916764]]


# 1️⃣8️⃣ Dtype & precision gotchas

In [21]:
ints = np.array([1, 2, 3])
floats = np.array([0.1, 0.2, 0.3], dtype=np.float32)
for label, array in (('ints dtype:', ints), ('floats dtype:', floats)):
    print(label, array.dtype)

# Mixing an integer array with a float32 array upcasts the result
# (float64 in the recorded output below).
mixed = ints + floats
print('mixed dtype (upcast):', mixed.dtype)

# float32 combined with float64 takes the wider type.
wide = np.array([1.0], dtype=np.float64)
narrow = np.array([1.0], dtype=np.float32)
combined = narrow + wide
print('narrow + wide dtype:', combined.dtype)

# NOTE: the recorded output for this product is 1.0, so this particular
# example does not make float32's rounding error visible.
print('float32 precision example:', np.float32(0.1) * 10)

ints dtype: int64
floats dtype: float32
mixed dtype (upcast): float64
narrow + wide dtype: float64
float32 precision example: 1.0


# 1️⃣9️⃣ Quick timing tip (baseline)

In [22]:
# Quick baseline: NumPy's vectorized sum vs. Python's built-in sum on the same
# one million floats. (`time` is imported in the notebook's import cell.)
def best_of_ms(func, repeats=5):
    """Call ``func`` ``repeats`` times; return (last result, fastest time in ms).

    A single perf_counter sample is noisy (cold caches, scheduler jitter), so
    the minimum over a few runs is reported. For serious measurements prefer
    the ``%timeit`` magic.

    Raises
    ------
    ValueError
        If ``repeats`` is smaller than 1.
    """
    if repeats < 1:
        raise ValueError('repeats must be >= 1')
    best = float('inf')
    result = None
    for _ in range(repeats):
        start = time.perf_counter()
        result = func()
        best = min(best, time.perf_counter() - start)
    return result, best * 1000


arr = np.random.rand(1_000_000)
arr_sum, numpy_ms = best_of_ms(arr.sum)
print('np.sum on 1M floats took', round(numpy_ms, 3), 'ms')

lst = arr.tolist()   # conversion happens outside the timed region
manual, python_ms = best_of_ms(lambda: sum(lst))
print('Python sum on list took', round(python_ms, 3), 'ms')

np.sum on 1M floats took 0.513 ms
Python sum on list took 7.936 ms


# 2️⃣0️⃣ Quick recap & next steps

- Shapes first: print `.shape` when ops fail to broadcast
- Mind views vs copies when mutating slices
- Use `axis` for reductions and `np.newaxis`/`None` to align shapes
- Set seeds for reproducibility; save to `.npy` when you care about dtype/shape
- Move to pandas for mixed-type tabular wrangling

# 2️⃣1️⃣ Conditional ops: where, clip, select

In [23]:
arr = np.array([-2, -1, 0, 1, 2])

# where(condition, if_true, if_false) chooses element by element.
is_positive = arr > 0
print('where positive -> 1 else 0:', np.where(is_positive, 1, 0))

# clip limits every value to the closed interval [-1, 1].
print('clipped to [-1,1]:', arr.clip(-1, 1))

# select walks the conditions in order and uses the first one that holds:
# 92 satisfies both tests but is graded 'A'; 67 satisfies neither and gets the default.
grades = np.array([92, 67, 81])
conds = [grades >= 90, grades >= 70]
choices = ['A', 'B']
letters = np.select(conds, choices, default='C')
print('letter grades:', letters)

where positive -> 1 else 0: [0 0 0 1 1]
clipped to [-1,1]: [-1 -1  0  1  1]
letter grades: ['A' 'C' 'B']


# 2️⃣2️⃣ Reshape & flatten cheat sheet

In [24]:
r = np.arange(6)

grid = r.reshape((2, 3))
print('reshape to 2x3:', grid)

# Both calls give a 1-D result; the printed labels state how they differ.
print('ravel (view when possible):', np.ravel(r))
print('flatten (always copy):', r.flatten())

# Indexing with np.newaxis inserts a length-1 axis: (6,) -> (6, 1).
column_shape = r[:, np.newaxis].shape
print('add axis as column:', column_shape)

# squeeze drops every length-1 axis: (1, 3, 1) -> (3,).
squeezed_shape = np.squeeze(np.zeros((1, 3, 1))).shape
print('squeeze removes length-1 axes:', squeezed_shape)

reshape to 2x3: [[0 1 2]
 [3 4 5]]
ravel (view when possible): [0 1 2 3 4 5]
flatten (always copy): [0 1 2 3 4 5]
add axis as column: (6, 1)
squeeze removes length-1 axes: (3,)


# 2️⃣3️⃣ Memory footprint quick check

In [25]:
# Same 100x100 shape, three element types: nbytes = element count x bytes per
# element (4 for float32, 8 for float64, 1 for int8).
grid_shape = (100, 100)
small = np.ones(grid_shape, dtype=np.float32)
big = np.ones(grid_shape, dtype=np.float64)
ints8 = np.ones(grid_shape, dtype=np.int8)

for label, array in (
    ('float32 bytes:', small),
    ('float64 bytes:', big),
    ('int8 bytes:', ints8),
):
    print(label, array.nbytes)

float32 bytes: 40000
float64 bytes: 80000
int8 bytes: 10000
