In [2]:
import numpy as np

# ✅ **NumPy Cheatsheet for ML Engineers**

### **1. Setup**

```python
import numpy as np
```

---

### **2. Array Creation**

```python
np.array([1, 2, 3])            # From list or tuple
np.zeros((3, 4))               # 3x4 array of zeros
np.ones((2, 2))                # 2x2 array of ones
np.eye(3)                      # Identity matrix (3x3)
np.full((2, 3), 7)             # Filled with a constant
np.arange(0, 10, 2)            # [0, 2, 4, 6, 8]
np.linspace(0, 1, 5)           # [0. , 0.25, ..., 1.]
np.random.rand(2, 3)           # Uniform [0, 1)
np.random.randn(2, 3)          # Standard normal
np.random.randint(0, 10, (2,3))# Random ints in range
```

---

### **3. Array Properties**

```python
a.shape        # Shape (rows, cols)
a.ndim         # Number of dimensions
a.size         # Total number of elements
a.dtype        # Data type
a.itemsize     # Size in bytes of each element
```

---

### **4. Indexing & Slicing**

```python
a[0]           # First element
a[-1]          # Last element
a[1:4]         # Slice elements 1 to 3
a[:, 0]        # All rows, 1st column
a[1, :]        # 2nd row, all columns
a[::2]         # Step slicing
```

---

### **5. Boolean & Fancy Indexing**

```python
a[a > 0]               # Filter positive values
a[(a > 1) & (a < 5)]   # Combine conditions
np.where(a > 0, 1, 0)  # Ternary condition
```

---

### **6. Reshaping & Transforming**

```python
a.reshape(2, 3)        # New shape (same size)
a.ravel()              # Flatten
a.flatten()            # Copy as 1D array
a.T                    # Transpose
a.squeeze()            # Remove singleton dims
np.expand_dims(a, axis=0) # Add a dimension (function, not an ndarray method)
```

---

### **7. Math & Stats**

```python
np.mean(a)             # Mean
np.median(a)           # Median
np.std(a)              # Standard deviation
np.var(a)              # Variance
np.sum(a)              # Sum
np.min(a), np.max(a)   # Min, Max
np.argmax(a), np.argmin(a)  # Index of max/min
np.cumsum(a)           # Cumulative sum
np.diff(a)             # Discrete difference
```

---

### **8. Arithmetic & Broadcasting**

```python
a + b                  # Elementwise addition
a - b
a * b
a / b
a ** 2                 # Square elements
np.exp(a)              # e^a
np.log(a)              # Natural log
```

---

### **9. Linear Algebra**

```python
np.dot(a, b)           # Dot product
a @ b                  # Matrix multiplication
np.matmul(a, b)        # Same as @
np.linalg.inv(a)       # Inverse
np.linalg.det(a)       # Determinant
np.linalg.eig(a)       # Eigenvalues/vectors
np.trace(a)            # Sum of diagonal
np.linalg.norm(a)      # Vector norm
```

---

### **10. Combining & Splitting**

```python
np.concatenate([a, b], axis=0)
np.vstack([a, b])      # Stack vertically
np.hstack([a, b])      # Stack horizontally
np.split(a, 2)
np.array_split(a, 3)
```

---

### **11. Random Sampling**

```python
np.random.seed(42)     # Reproducibility
np.random.choice(a)    # Random element
np.random.shuffle(a)   # Shuffle in place along the first axis (returns None)
```

---

### **12. Missing Values & NaNs**

```python
np.isnan(a)            # Check for NaNs
np.nan_to_num(a)       # Replace NaN with 0 and ±inf with large finite values
np.nanmean(a)          # Mean ignoring NaNs
```

---

### **13. Useful Utilities**

```python
np.unique(a)           # Unique values
np.sort(a)             # Sort array
np.argsort(a)          # Indices for sort
np.clip(a, min, max)   # Limit values
np.allclose(a, b)      # Compare arrays
```

---


# ✅ **NumPy Interview Problems**

### **1. Find the Missing Number**

Given a NumPy array with `n` distinct integers from `1` to `n+1` with one number missing, find the missing number.

In [None]:
import numpy as np
def find_missing(nums: list):
    """Return the one integer missing from ``nums``.

    ``nums`` is expected to hold ``n`` distinct integers taken from
    ``1..n+1``. The gap is recovered as the difference between the sum of
    the complete range and the sum of the values actually present.
    """
    upper = len(nums) + 1  # largest value of the complete range
    full_range = np.arange(1, upper + 1)
    return int(np.sum(full_range) - np.sum(nums))

# Doesn't handle edge cases (e.g., empty list) gracefully.
# np.sum() used on Python list — efficient, but a bit overkill unless you're
# working with large arrays or already using NumPy elsewhere.

# Improved code:
def find_missing(nums):
    """Return the single integer missing from ``1..n+1``.

    Args:
        nums: Sequence (list, tuple or 1-D NumPy array) of ``n`` distinct
            integers drawn from ``1..n+1``.

    Returns:
        The missing value as a built-in ``int``, or ``None`` when ``nums``
        contains no elements.
    """
    count = len(nums)
    # ``not nums`` is ambiguous for NumPy arrays with more than one element
    # and raises ValueError, so emptiness is tested through the length.
    if count == 0:
        return None
    n = count + 1
    expected_sum = n * (n + 1) // 2  # closed form of 1 + 2 + ... + n
    # int() keeps the result a plain Python int even for NumPy inputs.
    return int(expected_sum - sum(nums))

print(find_missing([1, 2, 3, 4, 6, 7, 8]))
print(find_missing([]))


### **2. Normalize a Feature Vector**

Given a NumPy array of shape `(n,)`, normalize it to have a mean of 0 and standard deviation of 1.


In [None]:
def normalize(input_arr:list):
    """Return the truncated z-score of every element as a list of ints.

    Each value is centred on the mean and divided by the population
    standard deviation. ``int()`` then truncates the result toward zero,
    so the fractional part of every z-score is discarded.
    """
    center = np.mean(input_arr)
    spread = np.sqrt(np.var(input_arr))
    return [int((value - center) / spread) for value in input_arr]

# int() in normalization step leads to loss of precision.
# Should use vectorized NumPy operations for cleaner code.

# Improved code:
def normalize(input_arr):
    """Standardize a sequence to zero mean and unit standard deviation.

    Args:
        input_arr: Array-like of numbers.

    Returns:
        A float ``ndarray`` of the same shape. An empty input yields an
        empty array, and a constant input (standard deviation 0) yields all
        zeros instead of NaN/inf.
    """
    # Work in float so both return paths share one dtype; np.zeros_like on
    # an int array would otherwise hand back integer zeros.
    arr = np.asarray(input_arr, dtype=float)
    if arr.size == 0:
        return arr
    std = arr.std()  # computed once and reused for the test and the division
    if std == 0:
        return np.zeros_like(arr)
    return (arr - arr.mean()) / std

print(normalize([5, 10, 3]))


### **3. Matrix Row with Max Sum**

Given a 2D array, return the index of the row with the maximum sum.



In [None]:
# Assumption: matrix is rectangular (every row is as long as the first row)
def max_index(matrix):
    """Describe the row with the largest sum as a formatted string.

    The column count is taken from the first row, and each row total is
    converted with ``int()`` before the maximum is looked up.
    """
    width = len(matrix[0])
    totals = [
        int(sum(matrix[r][c] for c in range(width)))
        for r in range(len(matrix))
    ]
    best_row = int(np.argmax(totals))
    best_sum = int(np.max(totals))
    return f"max index is: {best_row} and the max value is {best_sum}"

# Input is a list of lists, not a NumPy array.
# Not efficient: np.sum(matrix, axis=1) would be better.
# Returns a formatted string instead of index/value separately.

# Improved code:
def max_index(matrix):
    """Return ``(row_index, row_sum)`` for the row with the largest sum.

    Args:
        matrix: 2-D array-like of numbers.

    Returns:
        A tuple of built-in Python scalars: the index of the winning row
        and its sum. Ties resolve to the first such row.

    Raises:
        ValueError: If ``matrix`` is not 2-D or contains no elements.
    """
    matrix = np.asarray(matrix)
    if matrix.ndim != 2 or matrix.size == 0:
        raise ValueError("matrix must be a non-empty 2-D array")
    row_sums = matrix.sum(axis=1)
    idx = int(row_sums.argmax())
    # .item() unwraps the NumPy scalar so the tuple prints as (2, 24)
    # rather than (np.int64(2), np.int64(24)) under NumPy 2.
    return idx, row_sums[idx].item()

matrix = [[2, 3, 4], [4, 5, 6], [7, 8, 9]]
print(max_index(matrix))

matrix = [[2, 3, 4, 5], [3, 6, 7, 8], [9, 3, 2, 1], [3, 5, 2, 7], [2, 1, 2, 1]]
print(max_index(matrix))

### **4. One-Hot Encoding**

Given an array of class labels like `[0, 2, 1, 3]`, convert to one-hot encoding using NumPy.



In [None]:
def one_hot_encoding(labels: list):
    """Build a float one-hot matrix by visiting the labels one at a time.

    The matrix has one row per label and ``max(labels) + 1`` columns; row
    ``i`` holds a 1 in the column given by ``labels[i]``.
    """
    n_samples, n_classes = len(labels), max(labels) + 1
    encoded = np.zeros((n_samples, n_classes))
    for sample_idx, label in enumerate(labels):
        encoded[sample_idx][label] = 1
    return encoded

# Improved code:
def one_hot_encoding(labels, n_classes=None):
    """One-hot encode a 1-D sequence of non-negative integer class labels.

    Args:
        labels: 1-D array-like of integer class ids.
        n_classes: Number of columns in the result. Defaults to
            ``max(labels) + 1``; pass it explicitly when a batch may not
            contain the highest class id.

    Returns:
        An ``int`` array of shape ``(len(labels), n_classes)``. Empty
        ``labels`` yield an array with zero rows.

    Raises:
        TypeError: If the labels are not integers.
        ValueError: If ``labels`` is not 1-D, holds a negative id, or holds
            an id that does not fit in ``n_classes``.
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        # max() of an empty array raises, so answer the empty case directly.
        return np.zeros((0, n_classes or 0), dtype=int)
    if labels.ndim != 1:
        raise ValueError("labels must be one-dimensional")
    if not np.issubdtype(labels.dtype, np.integer):
        raise TypeError("labels must be integers")
    # A negative id would silently wrap around and mark a column counted
    # from the end, producing a wrong encoding without any error.
    if labels.min() < 0:
        raise ValueError("labels must be non-negative")
    if n_classes is None:
        n_classes = int(labels.max()) + 1
    elif labels.max() >= n_classes:
        raise ValueError("labels contain an id >= n_classes")
    one_hot = np.zeros((labels.size, n_classes), dtype=int)
    # Row i / column labels[i] pairs are addressed in one vectorized step.
    one_hot[np.arange(labels.size), labels] = 1
    return one_hot

print(one_hot_encoding(labels=[0, 2, 1, 3]))
print(one_hot_encoding(labels=[1, 1, 2, 5, 7, 4]))

### **5. Detect Outliers Using IQR**

Given a 1D array, detect all elements that are outliers using the IQR method.


**The Interquartile Range (IQR)** method is a statistical technique used to identify outliers in a dataset. It involves calculating the IQR, which is the range between the first quartile (Q1) and the third quartile (Q3), and then defining upper and lower fences based on 1.5 times the IQR. Data points falling outside these fences are considered outliers.

Here's a step-by-step breakdown:

1. Calculate the quartiles:
- Sort the data in ascending order.
- Find the median (Q2), which is the middle value.
- Q1 is the median of the lower half of the data (excluding Q2).
- Q3 is the median of the upper half of the data (excluding Q2).
2. Calculate the IQR:
- IQR = Q3 - Q1.
3. Determine the outlier fences:
- Lower Fence: Q1 - (1.5 * IQR).
- Upper Fence: Q3 + (1.5 * IQR).
4. Identify outliers:
Any data point below the lower fence or above the upper fence is considered an outlier.

In [None]:
def find_outliers(arr, k=1.5):
    """Return the elements of ``arr`` that fall outside the IQR fences.

    Args:
        arr: 1-D array-like of numbers.
        k: Fence multiplier applied to the interquartile range. The
            default of 1.5 is the conventional value used in the text.

    Returns:
        A list of the outlying values in their original order; an empty
        list when ``arr`` is empty.
    """
    values = np.asarray(arr)
    if values.size == 0:
        return []
    q1, q3 = np.quantile(values, [0.25, 0.75])
    iqr = q3 - q1
    lower_fence = q1 - k * iqr
    upper_fence = q3 + k * iqr
    # A boolean mask keeps the input order and avoids a Python-level loop.
    outside = (values < lower_fence) | (values > upper_fence)
    return values[outside].tolist()

arr = [1, 5, 7, 8, 10, 12, 15, 18, 20, 22, 25, 28, 45]
print(find_outliers(arr))

arr = [-11, 3, 5, 7, 9, 3, 7, 4, 90]
print(find_outliers(arr))

### **6. Implement a Softmax Function**

Implement the softmax activation function using NumPy.




In [None]:
def softmax(arr):
    """Return softmax probabilities as a list of Python floats.

    Every element is exponentiated on its own and divided by the sum of
    all exponentials. The inputs are not shifted first, so large values
    are passed to ``np.exp`` unchanged.
    """
    exponentials = [np.exp(value) for value in arr]
    total = np.sum(exponentials)
    return [float(term / total) for term in exponentials]

# Uses np.exp directly, which can cause overflow for large numbers.
# Not vectorized.

def softmax(arr, axis=None):
    """Numerically stable softmax.

    Args:
        arr: Array-like of scores.
        axis: Axis along which probabilities sum to 1. The default
            ``None`` normalizes over every element, which is the usual
            softmax for a 1-D vector; pass ``axis=-1`` for a batch of
            score rows.

    Returns:
        A float ``ndarray`` with the same shape as ``arr``. Empty input
        is returned as an empty array.
    """
    arr = np.asarray(arr, dtype=float)
    if arr.size == 0:
        # np.max has no identity for an empty array and would raise.
        return arr
    # Subtracting the maximum makes the largest exponent 0, so exp() cannot
    # overflow; the shift cancels out in the ratio.
    shifted = arr - np.max(arr, axis=axis, keepdims=True)
    exp_values = np.exp(shifted)
    return exp_values / exp_values.sum(axis=axis, keepdims=True)

print(softmax([5, 7, 10]))

### **7. Flatten and Reconstruct a Matrix**

Flatten a 2D matrix into a 1D vector and reshape it back to original shape. Verify that reconstruction is correct.


In [None]:
def reshape_array(arr):
    """Flatten a 2-D input and rebuild it from its row and column counts.

    Returns ``(flat, rebuilt)``. The target shape comes from ``len()`` of
    the array and of its first row, so only 2-D inputs are handled.
    """
    grid = np.array(arr)
    n_rows, n_cols = len(grid), len(grid[0])
    flat = grid.ravel()
    return flat, flat.reshape(n_rows, n_cols)

# Improved code:
def reshape_array(arr):
    """Flatten ``arr`` to 1-D and restore it to its original shape.

    Returns ``(flattened, restored)``. The shape is read from the array
    itself, so inputs of any dimensionality round-trip.
    """
    source = np.array(arr)
    original_shape = source.shape
    flattened = np.ravel(source)
    restored = np.reshape(flattened, original_shape)
    return flattened, restored

print(reshape_array(arr=[[1, 2, 3], [6, 7, 8]]))


### **8. Compute Pairwise Euclidean Distance**

Given two sets of vectors `A` and `B`, compute the full pairwise distance matrix without using loops.

In [None]:
def compute_distance(vec_a, vec_b):
    """Return the square root of the squared difference, element by element.

    No summation is performed, so the result has one entry per component
    (the absolute difference) rather than a single distance.
    """
    delta = vec_b - vec_a
    squared = np.power(delta, 2)
    return np.sqrt(squared)

a = np.array((1, 4, 5, 8))
b = np.array((3, 5, 7, 9))
print(compute_distance(a, b))

# Computes element-wise difference, not full Euclidean distance.
# Incorrect if goal is scalar distance between two vectors.

# Correct code:
def compute_distance(vec_a, vec_b):
    """Euclidean distance between vectors, or between two sets of vectors.

    Args:
        vec_a: A single vector of shape ``(d,)`` or a set of ``m`` vectors
            of shape ``(m, d)``.
        vec_b: A single vector of shape ``(d,)`` or a set of ``n`` vectors
            of shape ``(n, d)``.

    Returns:
        For two single vectors, the scalar distance between them.
        Otherwise the full pairwise matrix ``D`` of shape ``(m, n)`` with
        ``D[i, j] = ||vec_a[i] - vec_b[j]||``, computed without Python
        loops. A single vector paired with a set is treated as a set of one.
    """
    # asarray lets plain lists and tuples be subtracted like arrays.
    a = np.asarray(vec_a, dtype=float)
    b = np.asarray(vec_b, dtype=float)
    if a.ndim <= 1 and b.ndim <= 1:
        return np.linalg.norm(b - a)
    a, b = np.atleast_2d(a), np.atleast_2d(b)
    # (m, 1, d) - (1, n, d) broadcasts to every (i, j) difference vector.
    diff = a[:, np.newaxis, :] - b[np.newaxis, :, :]
    return np.linalg.norm(diff, axis=-1)

print(compute_distance(a, b))


### **9. Shuffle Rows Independently**

Given a 2D array, shuffle each row **independently**.


In [None]:
def shuffle_row_content(matrix):
    """First-attempt row shuffler, kept as the "before" example.

    Copies each row of ``matrix`` element by element into a list of ints
    and returns the rows as a list of lists. The column count is taken
    from the first row.

    NOTE(review): ``np.random.shuffle(row)`` sits inside the column loop,
    so the partially built row is reshuffled after every append, i.e. once
    per column instead of once per row.
    """
    rows = len(matrix)
    cols = len(matrix[0])
    output = []
    for r in range(rows):
        row = []
        for c in range(cols):
            row.append(int(matrix[r][c]))
            # Shuffles the elements collected so far, in place.
            np.random.shuffle(row)
        output.append(row)
    return output

# Logic is broken: it shuffles row incrementally inside a column loop.
# np.random.shuffle(row) should happen outside the inner loop.

# Improved code:
def shuffle_row_content(matrix):
    """Return a new array in which every row is permuted independently.

    Each row gets its own ``np.random.permutation`` draw from NumPy's
    global random state; the input is not modified.
    """
    rows = np.array(matrix)
    permuted_rows = list(map(np.random.permutation, rows))
    return np.array(permuted_rows)

arr_2 = np.array([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9]])

print(shuffle_row_content(arr_2))

### **10. Apply Custom Function Across Columns**

Given a 2D array, apply a custom function (e.g., `range = max - min`) to each column.

In [None]:
def custom_function(matrix):
    """Return the range (max - min) of every column as a list of floats.

    Columns are selected with ``matrix[:, c]``, so ``matrix`` has to
    support NumPy-style 2-D indexing. The column count is taken from the
    first row.
    """
    return [
        float(np.max(matrix[:, c]) - np.min(matrix[:, c]))
        for c in range(len(matrix[0]))
    ]

arr = np.array([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9]])

# Only works on NumPy arrays (it indexes with matrix[:, i]); a nested list raises TypeError.
# `rows` is computed but never used.
# Better to vectorize using axis=0.

# Improved code:
def custom_function(matrix, func=None):
    """Apply a reduction to every column of a 2-D array.

    Args:
        matrix: 2-D array-like.
        func: Callable that receives one column as a 1-D array and returns
            the value for that column. Defaults to the column range
            (``max - min``).

    Returns:
        A list with one result per column.
    """
    matrix = np.asarray(matrix)
    if func is None:
        # Default reduction, vectorized over all columns at once.
        return (matrix.max(axis=0) - matrix.min(axis=0)).tolist()
    # Axis 0 runs down the rows, so func sees one whole column per call.
    return np.apply_along_axis(func, 0, matrix).tolist()

print(custom_function(arr))

### **11. Sum the Alphabet Values**
Given a list of strings made up of lowercase or uppercase letters from a to z, return a list of the alphabet sum of each word.

The alphabet sum is defined as the sum of the ordinal position of each letter in the English alphabet.
For example, a = 1, b = 2, ..., z = 26.
So "sport" has an alphabet sum of 19 + 16 + 15 + 18 + 20 = 88.



In [None]:
def sum_alphabet(words):
    """Return the alphabet sum (a=1 ... z=26) of every word in ``words``.

    Letters are matched case-insensitively; any character that is not one
    of the 26 English letters contributes nothing.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    letter_value = {}
    for position, letter in enumerate(letters, start=1):
        letter_value[letter] = position

    totals = []
    for word in words:
        # The result of this call is discarded; ``word`` itself is unchanged.
        word.strip().lower()
        totals.append(
            sum(
                letter_value[ch.lower()]
                for ch in word
                if ch.lower() in letter_value
            )
        )
    return totals

# word.strip().lower() has no effect since it's not reassigned.
# The letter-to-value dictionary is built with a manual index loop; a dict comprehension with enumerate() is clearer.
# Minor inefficiencies in repeatedly calling letter.lower() inside the loop.

# Improved code:
def sum_alphabet(words):
    """Return the alphabet sum (a=1 ... z=26) of every word in ``words``.

    Each word is stripped and lower-cased first; characters outside
    ``a``-``z`` count as 0.
    """
    offset = ord('a') - 1  # so that ord('a') - offset == 1

    def word_total(word):
        cleaned = word.strip().lower()
        return sum(ord(ch) - offset for ch in cleaned if 'a' <= ch <= 'z')

    return [word_total(word) for word in words]

words = ["sport", "Good", "bAd", " ", "%cat"]
print(sum_alphabet(words))

# ✅ **Review: practical questions**


### **1. What’s the difference between `reshape()`, `ravel()`, and `flatten()`?**

* When do you use each?
* What’s the key difference in terms of **copy vs view**?

In [6]:
import numpy as np

arr = np.array([[1, 2, 3], [4, 5, 6]])

# reshape() – changes shape, returns a new view if possible
reshaped = arr.reshape(3, 2)

# flatten() – always returns a copy
flattened = arr.flatten()

# ravel() – returns a flattened view when possible (faster)
raveled = arr.ravel()

print("Reshaped:\n", reshaped)
print("Flattened:", flattened)
print("Raveled:", raveled)


Reshaped:
 [[1 2]
 [3 4]
 [5 6]]
Flattened: [1 2 3 4 5 6]
Raveled: [1 2 3 4 5 6]


###  **2. What is broadcasting in NumPy?**

* How does NumPy handle operations between arrays of different shapes?
* What’s the result of:

  ```python
  a = np.array([[1], [2], [3]])
  b = np.array([10, 20, 30])
  a + b
  ```

In [7]:
a = np.array([[1], [2], [3]])    # Shape: (3, 1)
b = np.array([10, 20, 30])       # Shape: (3,)

result = a + b

print("Broadcasted Result:\n", result)
# Output:
# [[11 21 31]
#  [12 22 32]
#  [13 23 33]]


Broadcasted Result:
 [[11 21 31]
 [12 22 32]
 [13 23 33]]


### **3. How can you filter elements from an array based on a condition?**

* Example:

  ```python
  arr = np.array([5, 10, 15, 20])
  # Keep only values > 10
  ```

In [9]:
arr = np.array([5, 10, 15, 20])

# Boolean indexing
filtered = arr[arr > 10]  # Keep only values > 10

# Using np.where for binary condition
binary_mask = np.where(arr > 10, 1, 0)

print("Filtered:", filtered)     # [15 20]
print("Binary mask:", binary_mask)  # [0 0 1 1]

Filtered: [15 20]
Binary mask: [0 0 1 1]


### **4. How do you compute row-wise and column-wise statistics in a matrix?**

* What's the difference between:

  ```python
  matrix.sum(axis=0) vs matrix.sum(axis=1)
  ```

In [10]:
matrix = np.array([[1, 2, 3],
                   [4, 5, 6]])

row_sums = matrix.sum(axis=1)    # Sums across columns → row totals
col_sums = matrix.sum(axis=0)    # Sums across rows → column totals

print("Row sums:", row_sums)     # [ 6 15]
print("Column sums:", col_sums)  # [5 7 9]

Row sums: [ 6 15]
Column sums: [5 7 9]


### **5. How do you generate reproducible random numbers in NumPy?**

* What is the purpose of `np.random.seed()`?
* What's the difference between `np.random.rand()`, `randn()`, and `choice()`?

In [11]:
np.random.seed(42)  # Ensures the same random output each time

# Random float values in [0, 1)
rand_vals = np.random.rand(3)      # Uniform

# Random normal distribution (mean 0, std 1)
randn_vals = np.random.randn(3)

# Random choice from a list without replacement
choices = np.random.choice([1, 2, 3, 4], size=2, replace=False)

print("rand:", rand_vals)
print("randn:", randn_vals)
print("choice:", choices)

rand: [0.37454012 0.95071431 0.73199394]
randn: [-1.11188012  0.31890218  0.27904129]
choice: [1 2]


### **6. What’s the difference between `np.array_split()` and `np.split()`?**

* When would `np.split()` raise an error?
* What’s the behavior of `np.array_split(np.array([1, 2, 3, 4, 5]), 3)`?

In [12]:
import numpy as np

arr = np.array([1, 2, 3, 4, 5])

# np.split() only works if the array can be evenly divided
# np.split(arr, 3)  # ❌ This will raise an error: ValueError

# np.array_split() allows unequal splits
splits = np.array_split(arr, 3)
print(splits)  # [array([1, 2]), array([3, 4]), array([5])]


[array([1, 2]), array([3, 4]), array([5])]


### **7. How do you compute the quantiles (like Q1 and Q3) of an array?**

* How would you use that to identify outliers using the IQR method?

In [14]:
arr = np.array([5, 7, 8, 10, 15, 18, 20, 22, 25, 28, 45, 87])

q1 = np.quantile(arr, 0.25)
q3 = np.quantile(arr, 0.75)
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = arr[(arr < lower_bound) | (arr > upper_bound)]

print("Q1:", q1)
print("Q3:", q3)
print("Outliers:", outliers)

Q1: 9.5
Q3: 25.75
Outliers: [87]


### **8. How do you perform matrix multiplication in NumPy?**

* What’s the difference between `@`, `np.dot()`, and `np.matmul()`?
* When should you prefer one over the other?

> Use @ for readability in ML models.

> All are functionally the same for 2D arrays. np.matmul() works with batches too.

In [15]:
A = np.array([[1, 2],
              [3, 4]])

B = np.array([[5, 6],
              [7, 8]])

# Three equivalent ways to do matrix multiplication
print("Using @:\n", A @ B)
print("Using dot():\n", np.dot(A, B))
print("Using matmul():\n", np.matmul(A, B))

Using @:
 [[19 22]
 [43 50]]
Using dot():
 [[19 22]
 [43 50]]
Using matmul():
 [[19 22]
 [43 50]]


### **9. How can you shuffle the rows of a 2D array?**

* And how would you shuffle each row **independently**?

In [16]:
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])

# Shuffle rows (in-place on a copy)
shuffled = np.copy(arr)
np.random.shuffle(shuffled)
print("Shuffled rows:\n", shuffled)

# Shuffle contents of each row independently
independently_shuffled = np.array([np.random.permutation(row) for row in arr])
print("Each row shuffled:\n", independently_shuffled)

Shuffled rows:
 [[7 8 9]
 [4 5 6]
 [1 2 3]]
Each row shuffled:
 [[1 3 2]
 [4 6 5]
 [8 9 7]]


### **10. How can you create a one-hot encoded matrix from a list of integer labels?**

* Input: `[0, 2, 1]`
* Output:

  ```
  [[1, 0, 0],
   [0, 0, 1],
   [0, 1, 0]]
  ```





In [None]:
labels = np.array([0, 2, 1])
n_classes = labels.max() + 1  # Automatically infers number of classes

one_hot = np.zeros((len(labels), n_classes), dtype=int) # Shape: (number of samples, number of classes)

#one_hot[rows, columns] = 1
one_hot[np.arange(len(labels)), labels] = 1
# np.arange(len(labels)) picks the row indices
# labels gives the column indices
# Together, they point to the "1" positions in the one-hot array.

print(one_hot)
# Output:
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]]


**What is `np.arange()`?**

`np.arange()` is a NumPy function that returns an array with evenly spaced values within a **given range** — just like Python’s built-in `range()`, but as a NumPy array.

**Syntax**

```python
np.arange([start,] stop[, step], dtype=None)
```

* `start`: Starting value (default = 0)
* `stop`: **End value (excluded)**
* `step`: Difference between consecutive values (default = 1)
* `dtype`: Optional, specify data type (e.g., `int`, `float`)

**Examples**

```python
np.arange(5)                    # [0 1 2 3 4]
np.arange(2, 7)                 # [2 3 4 5 6]
np.arange(1, 10, 2)             # [1 3 5 7 9]
np.arange(0.5, 2.5, 0.5)        # [0.5 1.  1.5 2. ]
np.arange(10, 0, -2)            # [10  8  6  4  2]
```

**Use Case in One-Hot Encoding**

Let’s say:

```python
labels = np.array([0, 2, 1])
```

We want to create this one-hot encoded matrix:

```
[[1, 0, 0],
 [0, 0, 1],
 [0, 1, 0]]
```

To fill the **rows 0, 1, 2**, we need the row indices:

```python
np.arange(len(labels))  # → [0, 1, 2]
```

These are just `[0, 1, 2]`, the **row numbers** in the output matrix.

Paired with:

```python
labels = [0, 2, 1]  # column numbers
```

We now assign the value `1` to each corresponding `[row, col]` position:

```python
one_hot[np.arange(len(labels)), labels] = 1
```
**Summary**

`np.arange()` is:

* Like Python’s `range()` but returns a NumPy array
* Used to generate **row indices**, **time steps**, **batch indices**, etc.
* Key for vectorized indexing patterns like one-hot encoding


# ✅ **Numpy for ML Engineering**


## **1. Vectorization and Broadcasting**

**Concept:**

Vectorization is replacing explicit Python loops with NumPy operations that run in compiled code for better performance. Broadcasting lets NumPy perform operations on arrays of different shapes.

**Why It Matters:**

* Speeds up code dramatically
* Reduces memory footprint
* Leads to cleaner, more readable code

**Usage Tips:**

* Use `np.where`, `np.sum`, `np.mean`, `np.dot`, etc. instead of manual loops.
* Understand shape rules of broadcasting.

**Example:**

```python
# BAD: Loop-based
output = []
for x in arr:
    output.append(x ** 2)

# GOOD: Vectorized
output = arr ** 2
```

**Broadcasting:**

```python
# Broadcasting a scalar
arr = np.array([1, 2, 3])
arr + 5  # [6, 7, 8]

# Broadcasting a column of per-row means, shape (2, 1), across the columns
matrix = np.array([[1, 2, 3], [4, 5, 6]])
row_mean = matrix.mean(axis=1, keepdims=True)
normalized = matrix - row_mean
```

## **2. Numerical Stability**

**Concept:**

Certain mathematical operations (like softmax) can cause **overflow or underflow**. Overflow means a number is too large to be represented (e.g., `np.exp(1000)` → `inf`). Underflow means a number is too small (close to zero), and might be rounded down to `0.0`.

**Why It Matters:**

Unstable code may give `inf`, `NaN`, or misleading values in real ML pipelines.

**Best Practices:**

* Subtract max before applying `exp` in softmax.
* Use log-space tricks (`logsumexp`, etc.)

**Example:**

```python
def stable_softmax(x):
    """Softmax of array ``x``, shifted by its maximum to avoid overflow."""
    shifted = x - np.max(x)  # largest exponent becomes 0
    weights = np.exp(shifted)
    return weights / weights.sum()
```

## **3. Defensive Programming**

**Concept:**

Write functions that don't crash on edge cases like empty arrays, NaNs, or zero-division.

**Why It Matters:**

In production or research, you can't afford silent failures.

**Best Practices:**

* Check array length before division.
* Use `np.isnan`, `np.isinf`, `np.any`, `np.all` to inspect array health.
* Return default values where appropriate.

**Example:**

```python
def normalize(x):
    """Return ``x`` scaled to zero mean and unit std, as a float array.

    Empty input is returned as an empty array, and constant input (std of
    0) becomes all zeros instead of NaN/inf.
    """
    x = np.asarray(x, dtype=float)  # float keeps both return paths one dtype
    if x.size == 0:
        return x
    std = x.std()
    if std == 0:
        return np.zeros_like(x)
    return (x - x.mean()) / std
```

## **4. Efficient Indexing and Masking**

**Concept:**

Use Boolean indexing to select or manipulate data without loops. Boolean indexing means using a boolean array (of `True`/`False` values) to select elements of another array. The elements corresponding to `True` are kept.

**Why It Matters:**

Masking makes filtering data simple and fast.

**Usage Tips:**

* Combine masks using `&`, `|` with parentheses.
* Use `np.where` for ternary conditions.

**Example:**

```python
arr = np.array([1, 5, 7, 10])
mask = arr > 5
arr[mask]  # [7, 10]

np.where(arr > 5, 1, 0)  # [0, 0, 1, 1]
```

## **5. Array Manipulation Mastery**

**Concept:**

Know when and how to reshape, flatten, expand or squeeze dimensions.

**Why It Matters:**

ML input pipelines often require reshaped or batched data.

**Tips:**

* Use `.reshape()`, `.flatten()`, `.squeeze()`, `np.expand_dims()`
* Avoid reshaping that breaks data order

**Example:**

```python
x = np.array([[1, 2], [3, 4]])
x.reshape(-1)         # [1 2 3 4]
x.reshape(1, 4)        # [[1 2 3 4]]
x.flatten()           # [1 2 3 4] (copy)
x.ravel()             # view (if possible)
```

## 6. Advanced Aggregations and Stats

**Concept:**

Use axis-based operations and aggregation functions effectively.

**Why It Matters:**

Summarizing data across rows or columns is key for ML feature engineering.

**Tips:**

* Learn `np.sum`, `np.mean`, `np.std`, `np.median`, `np.quantile`, `np.argmax`, `np.bincount`
* Use `axis=` parameter wisely

**Example:**

```python
matrix = np.array([[1, 2, 3], [4, 5, 6]])
matrix.mean(axis=0)  # mean per column
matrix.sum(axis=1)   # sum per row
```

## **7. Randomness and Reproducibility**

**Concept:**

Use NumPy's `random` module for consistent random sampling.

**Why It Matters:**

In experiments, reproducibility is essential.

**Tips:**

* Use `np.random.seed()` before randomness
* Use `np.random.permutation`, `shuffle`, `choice`, `randn`, `randint`

**Example:**

```python
np.random.seed(42)
np.random.choice([1, 2, 3], size=2, replace=False)
```

# ✅ **Unit Tests**

**1. Use `unittest` or `pytest`**

You can write tests using Python’s built-in `unittest`, or the more flexible `pytest`. Both support NumPy.


**2. Use NumPy's Assertion Helpers**

Instead of `assertEqual()` or `==`, use:

* `np.testing.assert_array_equal()` – for exact match
* `np.testing.assert_allclose()` – for floating-point arrays (with tolerance)
* `np.testing.assert_array_almost_equal()` – discouraged in favor of `assert_allclose()`, but still common in legacy tests

**Example 1: Using `unittest`**

```python
import unittest
import numpy as np
from your_module import normalize_features

class TestNormalizeFeatures(unittest.TestCase):
    def test_normalization(self):
        x = np.array([[1, 2, 3], [4, 5, 6]])
        normalized = normalize_features(x)

        # Check mean ≈ 0, std ≈ 1 per row
        means = normalized.mean(axis=1)
        stds = normalized.std(axis=1)
        
        np.testing.assert_allclose(means, 0, atol=1e-7)
        np.testing.assert_allclose(stds, 1, atol=1e-7)

if __name__ == '__main__':
    unittest.main()
```

**Example 2: Using `pytest`**

Create a file `test_processing.py`:

```python
import numpy as np
from your_module import sum_alphabet

def test_sum_alphabet():
    words = ["abc", "Good", "", "%a"]
    expected = [6, 41, 0, 1]
    result = sum_alphabet(words)

    np.testing.assert_array_equal(result, expected)
```

Then run:

```bash
pytest test_processing.py
```

**Summary of NumPy Assertion Functions**

| Function                          | Use When…                                  |
| --------------------------------- | ------------------------------------------ |
| `assert_array_equal(a, b)`        | Exact equality                             |
| `assert_allclose(a, b, atol=...)` | Floating-point match within tolerance      |
| `assert_array_less(a, b)`         | Each element in `a` < corresponding in `b` |
| `assert_raises`                   | Expecting an error                         |


**Tips**

* Always test **edge cases**: empty arrays, zeros, all identical values, etc.
* Use `keepdims=True` and `axis` carefully — test shape too!
* Add tests for invalid inputs (e.g., shape mismatch)


# ✅**Interview-style practice problem**

### **1. Normalize, Filter, and One-Hot Encode**

You are given a list of numeric feature vectors and a corresponding list of integer class labels.

**Task:**

Write a function `process_data(features, labels, threshold)` that:

1. **Normalizes** each feature vector to have zero mean and unit standard deviation.
2. **Filters out** any vectors whose mean (before normalization) is less than a given threshold.
3. Returns:

   * The **normalized and filtered feature matrix**
   * The **one-hot encoded labels** for the rows that passed the filter

Inputs

* `features`: A 2D list or NumPy array of shape `(n_samples, n_features)`
* `labels`: A 1D list or NumPy array of length `n_samples`
* `threshold`: A float indicating the minimum required mean value of the original feature vector

Example:

```python
features = [
    [1, 2, 3],
    [10, 10, 10],
    [0, 0, 1],
    [-5, -3, -1]
]

labels = [0, 2, 1, 1]
threshold = 3.0
```


In [None]:
import numpy as np

def process_data(features, labels, threshold=0):
    """Filter samples by their raw mean, normalize them, one-hot the labels.

    Args:
        features: 2-D array-like of shape ``(n_samples, n_features)``.
        labels: 1-D array-like of ``n_samples`` non-negative integer class
            ids.
        threshold: Minimum mean (computed before normalization) a row must
            reach to be kept.

    Returns:
        A tuple ``(normalized_features, one_hot)``:

        * ``normalized_features`` - float array holding the kept rows,
          each scaled to zero mean and unit standard deviation (constant
          rows become all zeros).
        * ``one_hot`` - int array of shape ``(n_kept, n_classes)``, where
          ``n_classes`` comes from *all* labels so the width does not
          depend on which rows survive the filter.

        Both arrays have zero rows when no sample passes the threshold.
    """
    features = np.asarray(features, dtype=float)
    labels = np.asarray(labels)

    # Step 1: filter on the mean of the original (un-normalized) rows.
    mask = features.mean(axis=1) >= threshold
    filtered_features = features[mask]
    filtered_labels = labels[mask]

    # Step 2: normalize each kept row independently; keepdims=True gives
    # (n_kept, 1) columns that broadcast across the features of each row.
    means = filtered_features.mean(axis=1, keepdims=True)
    stds = filtered_features.std(axis=1, keepdims=True)
    stds[stds == 0] = 1  # constant rows: divide by 1 so they become zeros
    normalized_features = (filtered_features - means) / stds

    # Step 3: one-hot encode. The class count is taken from the full label
    # set; using only the kept labels would change the width with the
    # filter and raise on max() when nothing is kept.
    n_classes = int(labels.max()) + 1 if labels.size else 0
    one_hot = np.zeros((len(filtered_labels), n_classes), dtype=int)
    one_hot[np.arange(len(filtered_labels)), filtered_labels] = 1

    return normalized_features, one_hot

features = [
    [1, 2, 3],        # mean = 2.0
    [10, 10, 10],     # mean = 10.0
    [0, 0, 1],        # mean ≈ 0.33
    [-5, -3, -1]      # mean = -3.0
]

labels = [0, 2, 1, 1]
threshold = 3.0

X_norm, y_onehot = process_data(features, labels, threshold)

print("Normalized Features:\n", X_norm)
print("One-Hot Labels:\n", y_onehot)


Normalized Features:
 [[0. 0. 0.]]
One-Hot Labels:
 [[0 0 1]]


| Use Case                                             | Normalize **Row-wise** | Normalize **Column-wise** |
| ---------------------------------------------------- | ---------------------- | ------------------------- |
| Each sample is independent and scaled differently    | ✅                      | ❌                         |
| You’re preparing features for ML model input         | ❌                      | ✅                         |
| You’re filtering based on a sample-level condition   | ✅                      | ❌                         |
| You want consistent feature influence across samples | ❌                      | ✅                         |


Let’s unpack why we used this:

```python
means = filtered_features.mean(axis=1, keepdims=True)
stds  = filtered_features.std(axis=1, keepdims=True)
```

instead of:

```python
means = np.mean(filtered_features)
stds  = np.std(filtered_features)
```

The Key Difference: *Axis Matters*

**1. `np.mean(filtered_features)`**

Computes **the mean over all elements** in the 2D array (scalar output).
This would normalize the entire dataset globally.

**2. `filtered_features.mean(axis=1, keepdims=True)`**

Computes **the mean across each row**, returning a 2D array shaped `(n_samples, 1)` — one mean per sample.


**Why `axis=1` and `keepdims=True`?**

| Parameter       | Purpose                                                                                             |
| --------------- | --------------------------------------------------------------------------------------------------- |
| `axis=1`        | Tells NumPy: “calculate along the columns of each row”                                              |
| `keepdims=True` | Keeps the result as a **column vector**, so it **broadcasts correctly** in subtraction and division |

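Without `keepdims=True`, the per-row means have shape `(n_samples,)`, which does not line up with the rows of the matrix:

```python
(2, 3) - (2,)    # ValueError: the shapes cannot be broadcast together
(3, 3) - (3,)    # Runs, but silently wrong: mean i is subtracted from column i, not row i
(3, 3) - (3, 1)  # Correct: each row's mean is subtracted from that row
```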

**Summary**

| Expression                      | Meaning                         | Shape                                     |
| ------------------------------- | ------------------------------- | ----------------------------------------- |
| `np.mean(x)`                    | Mean of **all elements**        | Scalar                                    |
| `x.mean(axis=0)`                | Mean of each column (feature)   | (n\_features,)                            |
| `x.mean(axis=1)`                | Mean of each row (sample)       | (n\_samples,)                             |
| `x.mean(axis=1, keepdims=True)` | Mean of each row, **as column** | (n\_samples, 1) ✅ useful for broadcasting |



**Why is this line used?**

```python
stds[stds == 0] = 1
```

Because in normalization:

```python
normalized = (x - mean) / std
```

If `std == 0`, you would get:

```python
(0 / 0) → NaN
```

or worse:

```python
x / 0 → ∞ or crash
```

**Why replace 0 with 1?**

It’s a **safe default**:

* If a sample has **no variation** (e.g. `[10, 10, 10]`), its std = 0.
* Subtracting the mean gives `[0, 0, 0]`
* Dividing by 1 still gives `[0, 0, 0]`

This preserves the fact that all elements are identical **without introducing NaN or Inf.**

Example:

```python
x = np.array([[10, 10, 10],
              [1, 2, 3]])

mean = x.mean(axis=1, keepdims=True)
std = x.std(axis=1, keepdims=True)

# Defensive patch
std[std == 0] = 1

z = (x - mean) / std
print(z)
```

Output:

```
[[ 0.          0.          0.        ]
 [-1.22474487  0.          1.22474487]]
```

---

**Summary:**

| Case                       | Why it happens                   | Fix                   |
| -------------------------- | -------------------------------- | --------------------- |
| Standard deviation is zero | All values are equal             | Replace std with 1    |
| Why 1?                     | Keeps normalized output as zeros | Avoids division error |