# **Python for Data Analysis**

## **NumPy Basics: Arrays and Vectorized Computation**

In [1]:
import numpy as np

In [6]:
# Draw a 2x3 sample from the standard normal distribution.
# No seed is set here, so the values differ on every kernel run.
data = np.random.standard_normal((2, 3))
print(data)

[[-0.06255002  0.70685834  0.57192944]
 [ 0.46268851  1.37521481 -0.49903599]]


In [7]:
# Scalar addition is applied to every element (same result as np.add(data, 10)).
data_10 = data + 10
print(data_10)

[[ 9.93744998 10.70685834 10.57192944]
 [10.46268851 11.37521481  9.50096401]]


In [8]:
# Scalar subtraction, element by element; data_10 is rebound to the new result.
data_10 = data - 10
print(data_10)

[[-10.06255002  -9.29314166  -9.42807056]
 [ -9.53731149  -8.62478519 -10.49903599]]


In [3]:
# Scalar multiplication, element by element.
# NOTE(review): this cell's execution count (In [3]) is older than the cell that
# currently defines `data` (In [6]), so the saved output below comes from a
# different random draw -- re-run the cell to refresh it.
data_10 = data * 10
print(data_10)

[[  8.48642568   3.51227859   6.8717405 ]
 [-13.47046034   1.89349987  15.56131513]]


In [9]:
# True (floating-point) division of every element by 10.
data_10 = data / 10
print(data_10)

[[-0.006255    0.07068583  0.05719294]
 [ 0.04626885  0.13752148 -0.0499036 ]]


In [10]:
# Floor division rounds toward negative infinity, so small negative
# values become -1.0 rather than 0.0 (see the output below).
data_10 = data // 10
print(data_10)

[[-1.  0.  0.]
 [ 0.  0. -1.]]


In [11]:
# Element-wise remainder; the result takes the sign of the divisor (10),
# which is why negative inputs map to values just below 10.
data_10 = data % 10
print(data_10)

[[9.93744998 0.70685834 0.57192944]
 [0.46268851 1.37521481 9.50096401]]


In [12]:
# Raise every element to the 10th power (even exponent -> all results non-negative).
data_10 = data ** 10
print(data_10)

[[9.16800220e-13 3.11403771e-02 3.74476466e-03]
 [4.49663041e-04 2.41938732e+01 9.57896675e-04]]


In [13]:
# Python type of the container object (not the element type -- see data.dtype).
type(data)

numpy.ndarray

In [15]:
# Tuple giving the size of each dimension: (rows, columns) for this 2-D array.
data.shape

(2, 3)

In [16]:
# Element data type shared by every value in the array.
data.dtype

dtype('float64')

In [17]:
# Creating ndarrays: build a 1-D array from a plain Python list.
# The list mixes ints with one float (7.5), so every element is stored as a float.
data1 = [6, 7.5, 8, 0, 1]
arr1 = np.asarray(data1)
print(arr1)

[6.  7.5 8.  0.  1. ]


In [18]:
# Equal-length nested lists become a 2-D array (one inner list per row).
data2 = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
]
arr2 = np.asarray(data2)
print(arr2)

[[1 2 3 4]
 [5 6 7 8]]


In [20]:
# Element types inferred by np.array(), one per line.
print(arr1.dtype, arr2.dtype, sep="\n")

float64
int64


In [21]:
# Number of dimensions (axes) of each array, one per line.
print(arr1.ndim, arr2.ndim, sep="\n")

1
2


In [22]:
# Size along every axis of each array, one per line.
print(arr1.shape, arr2.shape, sep="\n")

(5,)
(2, 4)


In [23]:
# zeros(): an int shape gives a 1-D array of that length, filled with 0.0.
arr3 = np.zeros(shape=10)
print(arr3)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
# zeros(): a tuple shape gives a multi-dimensional array (3 rows x 6 columns).
arr3 = np.zeros(shape=(3, 6))
print(arr3)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [26]:
# ones(): `(3)` is just the integer 3 (no trailing comma, so not a tuple),
# which requests a 1-D array of three 1.0 values.
arr4 = np.ones(3)
print(arr4)

[1. 1. 1.]


In [27]:
# ones(): 3x6 array filled with 1.0.
arr4 = np.ones(shape=(3, 6))
print(arr4)

[[1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1.]]


In [28]:
# ones_like() takes an array-like *prototype*, not a shape: it mirrors the
# prototype's shape and dtype. `(3)` is the scalar 3, so the result is a
# 0-dimensional integer 1 -- not an array of three ones (use np.ones(3) for that).
arr4 = np.ones_like(3)
print(arr4)

1


In [29]:
# ones_like() treats the sequence (3, 6) as data: a 1-D integer array with two
# elements, so the result is [1 1] -- not a 3x6 array (use np.ones((3, 6)) for that).
arr4 = np.ones_like([3, 6])
print(arr4)

[1 1]


In [33]:
# empty(): allocates without initialising, so the printed values are whatever
# happened to be in that memory (they are arbitrary, not guaranteed zeros or ones).
arr4 = np.empty(3)
print(arr4)

[1. 1. 1.]


In [34]:
# empty(): 3x7 block of uninitialised float64 values (contents are arbitrary).
arr4 = np.empty(shape=(3, 7))
print(arr4)

[[2.41907520e-312 1.10343781e-312 1.29441743e-312 2.33419537e-312
  9.76118064e-313 2.33419537e-312 2.44029516e-312]
 [2.29175545e-312 2.27053550e-312 8.48798317e-313 1.08221785e-312
  1.14587773e-312 8.70018274e-313 2.37663529e-312]
 [2.22809558e-312 2.46151512e-312 2.05833592e-312 2.41907520e-312
  8.70018275e-313 8.34443015e-308 3.91792476e-317]]


In [40]:
# array(): turn a list, tuple, or other sequence into a NumPy ndarray.
arr1 = np.array([1, 2, 3])                     # dtype inferred from the integers
arr2 = np.array((4, 5, 6), dtype=np.float64)   # tuple input, dtype forced to float64

print(arr1, arr1.dtype)
print(arr2, arr2.dtype)

[1 2 3] int64
[4. 5. 6.] float64


In [39]:
# asarray(): like array(), except an input that is already an ndarray is
# handed back as-is instead of being copied.
arr4 = np.asarray([[1, 2, 3], [4, 5, 6]])  # nested lists -> new 2-D ndarray
arr3 = np.asarray(arr1)                    # arr1 is already an ndarray -> no copy

print(arr3, arr3.dtype)
print(arr4, arr4.dtype)

[1 2 3] int64
[[1 2 3]
 [4 5 6]] int64


In [41]:
# arange(start, stop, step): evenly spaced values; `stop` itself is excluded.
arr5 = np.arange(0, 10, step=2)
print(arr5)

[0 2 4 6 8]


In [42]:
# ones(), ones_like(): arrays of 1s from a shape or from a prototype array.
arr6 = np.ones(shape=(2, 3))   # 2x3 float matrix of 1s
arr7 = np.ones_like(arr5)      # mirrors arr5's shape and dtype

print(arr6, arr7, sep="\n")

[[1. 1. 1.]
 [1. 1. 1.]]
[1 1 1 1 1]


In [44]:
# zeros(), zeros_like(): arrays of 0s from a shape or from a prototype array.
arr8 = np.zeros(shape=(3, 2))   # 3x2 float matrix of 0s
arr9 = np.zeros_like(arr5)      # mirrors arr5's shape and dtype

print(arr8, arr9, sep="\n")

[[0. 0.]
 [0. 0.]
 [0. 0.]]
[0 0 0 0 0]


In [45]:
# empty(), empty_like(): reserve memory without initialising it.
# The printed contents are arbitrary leftovers and must not be relied on.
arr10 = np.empty(shape=(2, 2))   # 2x2 floats, uninitialised
arr11 = np.empty_like(arr5)      # mirrors arr5's shape and dtype, uninitialised

print(arr10, arr11, sep="\n")

[[6.  7.5]
 [8.  1. ]]
[0 0 0 0 0]


In [46]:
# full(), full_like(): arrays pre-filled with one chosen value.
arr12 = np.full(shape=(2, 2), fill_value=7)   # 2x2 matrix of 7s
arr13 = np.full_like(arr5, fill_value=3)      # arr5's shape and dtype, all 3s

print(arr12, arr13, sep="\n")

[[7 7]
 [7 7]]
[3 3 3 3 3]


In [47]:
# eye(), identity(): square matrices with 1s on the main diagonal, 0s elsewhere.
arr14 = np.eye(N=3)        # 3x3
arr15 = np.identity(n=4)   # 4x4

print(arr14, arr15, sep="\n")

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


In [48]:
# dtype: the element type can be chosen explicitly at construction time.
arr1 = np.array([1, 2, 3], dtype="float64")  # stored as 64-bit floats
arr2 = np.array([1, 2, 3], dtype="int32")    # stored as 32-bit integers

print(arr1.dtype, arr2.dtype, sep="\n")  # float64, then int32


float64
int32


In [49]:
# astype(): returns a converted array; the source array keeps its own dtype.
arr = np.array([1, 2, 3, 4, 5])
print(arr.dtype)  # platform default integer (int64 in the saved output)

float_arr = arr.astype("float64")
print(float_arr.dtype)  # float64


int64
float64


In [56]:
import numpy as np

# Byte strings that each spell a number; the longest has 4 characters,
# hence the fixed-width dtype |S4 in the output.
numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.bytes_)

# astype() parses each byte string into a float64 value,
# so float_arr holds array([ 1.25, -9.6 , 42. ]).
float_arr = numeric_strings.astype(np.float64)

# Only the dtypes are printed here, not the array contents.
print(numeric_strings.dtype)
print(float_arr.dtype)


|S4
float64


In [58]:
# astype() returns a new array, so the result has to be assigned to be kept.
# Float -> int conversion drops the fractional part, and from here on the
# name float_arr refers to int32 data despite what the name suggests.
float_arr = float_arr.astype("int32")
print(float_arr.dtype)  # int32

int32


## **Basic Indexing and Slicing**

In [59]:
# Integers 0..9 to slice from.
arr = np.arange(0, 10)

# Indices 5, 6 and 7 -- the stop index 8 is not included.
print(arr[slice(5, 8)])

[5 6 7]


In [60]:
# Assigning a scalar to a slice writes it into every selected position
# (indices 5, 6, 7), modifying arr in place.
arr[5:8] = 12
print(arr)

[ 0  1  2  3  4 12 12 12  8  9]


In [61]:
# Basic slicing returns a view onto arr's data rather than a copy.
arr_1 = arr[5:8]
print(arr_1)

[12 12 12]


In [62]:
# Open-ended slice: overwrite everything from index 5 to the end, in place.
arr[5:] = 15
print(arr)

[ 0  1  2  3  4 15 15 15 15 15]


In [63]:
# Overwrite the first eight elements (indices 0..7), in place.
arr[:8] = 17
print(arr)

[17 17 17 17 17 17 17 17 15 15]


In [64]:
# A bare `:` selects every element, so the whole array is overwritten in place.
arr[:] = 5
print(arr)

[5 5 5 5 5 5 5 5 5 5]


In [65]:
# 3x3 integer matrix used for the 2-D indexing examples.
arr2d = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
print(arr2d)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [72]:
# A single index on a 2-D array selects a whole row.
print(arr2d[2])
# Chained indexing: take row 2 first, then element 1 of that row.
print(arr2d[2][1])

[7 8 9]
8


In [75]:
# A single index on a 2-D array selects a whole row.
print(arr2d[2])
# Comma-separated index (row 2, column 1): same element as arr2d[2][1].
print(arr2d[2,1])

[7 8 9]
8


In [82]:
# Two stacked 2x3 matrices -> a 3-D array of shape (2, 2, 3).
arr3d = np.array([
    [[1, 2, 3], [4, 5, 6]],
    [[7, 8, 9], [10, 11, 12]],
])

print(arr3d)

[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]]


In [96]:
# (number of 2-D blocks, rows per block, columns per row)
arr3d.shape

(2, 2, 3)

In [83]:
# Indexing the first axis of a 3-D array returns a 2-D array.
arr3d[0]  # Output: First 2x3 array

array([[1, 2, 3],
       [4, 5, 6]])

In [84]:
# Second 2x3 block along the first axis.
arr3d[1]

array([[ 7,  8,  9],
       [10, 11, 12]])

In [85]:
# Row 1 of block 0 -> a 1-D array (same element selection as arr3d[0, 1]).
arr3d[0][1]

array([4, 5, 6])

In [101]:
# Slices keep their axis even when they select one item, so the result
# is still 3-D with shape (1, 1, 3).
arr3d[0:1 , 1:2]

array([[[4, 5, 6]]])

In [106]:
# The integer index drops the first axis while the slice keeps the second,
# so the result is 2-D with shape (1, 3).
arr3d[0 , 1:2]

array([[4, 5, 6]])

In [81]:
# .copy() gives an independent array: later changes to arr3d_new do not
# touch arr3d (a plain arr3d[0] would be a view sharing its data).
arr3d_new = arr3d[0].copy()  # Creates a copy of the first 2x3 array
print(arr3d_new)

[[1 2 3]
 [4 5 6]]


## **1D, 2D, 3D array slicing and indexing**

- 1D Slicing: [start : end : step]
- 2D Slicing: [row_start : row_end, col_start : col_end]
- 3D Slicing: [depth_start : depth_end, row_start : row_end, col_start : col_end]

In [107]:
# 1D array slicing -- general form is arr[start:stop:step], stop excluded.

arr1d = np.arange(10, 60, 10)   # [10 20 30 40 50]

print(arr1d[1:4])   # indices 1..3        -> [20 30 40]
print(arr1d[:3])    # first three         -> [10 20 30]
print(arr1d[2:])    # index 2 to the end  -> [30 40 50]
print(arr1d[::2])   # every other element -> [10 30 50]
print(arr1d[1::2])  # every other, from 1 -> [20 40]

[20 30 40]
[10 20 30]
[30 40 50]
[10 30 50]
[20 40]


In [None]:
# 2D array slicing -- one slice per axis: arr[rows, columns].

arr2d = np.arange(1, 10).reshape(3, 3)   # [[1 2 3], [4 5 6], [7 8 9]]

print(arr2d[:2, 1:])   # rows 0-1, columns 1 onward

print(arr2d[1:, :2])   # rows 1 onward, columns 0-1

print(arr2d[:, ::2])   # every row, every second column


[[2 3]
 [5 6]]
[[4 5]
 [7 8]]
[[1 3]
 [4 6]
 [7 9]]


In [110]:
# 3D array slicing -- one index or slice per axis: arr[block, row, column].

arr3d = np.arange(1, 13).reshape(2, 2, 3)   # two 2x3 blocks holding 1..12

print(arr3d[0, :, :])   # the whole first block

print(arr3d[:, 1, :])   # row 1 taken from each block

print(arr3d[:, :, 1])   # column 1 taken from each block


[[1 2 3]
 [4 5 6]]
[[ 4  5  6]
 [10 11 12]]
[[ 2  5]
 [ 8 11]]


## **Boolean Indexing**

In [None]:
# 1D array boolean indexing

arr = np.arange(10, 60, 10)   # [10 20 30 40 50]

# The comparison is evaluated per element and yields a boolean array.
bool_mask = (arr > 25)
print(bool_mask)

# Indexing with the mask keeps only the positions that are True.
filtered_arr = arr[bool_mask]
print(filtered_arr)

[False False  True  True  True]
[30 40 50]


In [114]:
# 2D array boolean indexing

arr2d = np.arange(1, 10).reshape(3, 3)   # [[1 2 3], [4 5 6], [7 8 9]]

# A mask with the array's own shape picks individual elements, so the
# result is flattened to 1-D (in row-major order).
filtered_2d = arr2d[arr2d > 4]
print(filtered_2d)

[5 6 7 8 9]


In [None]:
# Using Boolean Indexing with Conditions

# Keep the elements whose remainder modulo 2 is zero (the even ones).
even_numbers = arr[(arr % 2) == 0]
print(even_numbers)

# Combine two conditions element-wise; Python's `and` does not work on arrays.
between_20_40 = arr[np.logical_and(arr >= 20, arr <= 40)]
print(between_20_40)

[10 20 30 40 50]
[20 30 40]


In [None]:
# Boolean Indexing with np.where()

# np.where(condition, x, y) builds a NEW array: 99 where the condition
# holds, the original value elsewhere. arr itself is left unchanged.
new_arr = np.where(arr > 25, 99, arr)
print(new_arr)  

[10 20 99 99 99]


In [124]:
# Row labels: one name for each row of the data matrix.
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])

# Seed the global RNG so the 7x4 standard-normal sample is repeatable.
np.random.seed(42)
data = np.random.randn(7, 4)

print(f'names.shape: {names.shape}')
print(f'data.shape: {data.shape}')

print("Names:\n", names)
print("\nData:\n", data)

# Comparing the label array with a string gives one boolean per row.
bob_mask = (names == 'Bob')
print("Bob Mask:\n", bob_mask)

# Using the mask on the row axis keeps the rows labelled 'Bob'.
bob_rows = data[bob_mask, :]
print("\nRows where name is 'Bob':\n", bob_rows)



names.shape: (7,)
data.shape: (7, 4)
Names:
 ['Bob' 'Joe' 'Will' 'Bob' 'Will' 'Joe' 'Joe']

Data:
 [[ 0.49671415 -0.1382643   0.64768854  1.52302986]
 [-0.23415337 -0.23413696  1.57921282  0.76743473]
 [-0.46947439  0.54256004 -0.46341769 -0.46572975]
 [ 0.24196227 -1.91328024 -1.72491783 -0.56228753]
 [-1.01283112  0.31424733 -0.90802408 -1.4123037 ]
 [ 1.46564877 -0.2257763   0.0675282  -1.42474819]
 [-0.54438272  0.11092259 -1.15099358  0.37569802]]
Bob Mask:
 [ True False False  True False False False]

Rows where name is 'Bob':
 [[ 0.49671415 -0.1382643   0.64768854  1.52302986]
 [ 0.24196227 -1.91328024 -1.72491783 -0.56228753]]


In [118]:
# Selecting only the last two columns for 'Bob'
# (boolean mask on the rows combined with a slice on the columns)
bob_columns = data[names == 'Bob', 2:]
print("\nBob's last two columns:\n", bob_columns)

# Selecting only column index 3 for 'Bob'
# NOTE: index 3 is the fourth (last) column, although the printed label says
# "3rd column"; the integer index also makes the result 1-D.
bob_col_3 = data[names == 'Bob', 3]
print("\nBob's 3rd column:\n", bob_col_3)



Bob's last two columns:
 [[ 0.64768854  1.52302986]
 [-1.72491783 -0.56228753]]

Bob's 3rd column:
 [ 1.52302986 -0.56228753]


In [119]:
# Using negation (~) to get all rows except 'Bob'
# ~ flips every boolean in the mask; names != 'Bob' selects the same rows.
not_bob_rows = data[~(names == 'Bob')]
print("\nRows where name is NOT 'Bob':\n", not_bob_rows)



Rows where name is NOT 'Bob':
 [[-0.23415337 -0.23413696  1.57921282  0.76743473]
 [-0.46947439  0.54256004 -0.46341769 -0.46572975]
 [-1.01283112  0.31424733 -0.90802408 -1.4123037 ]
 [ 1.46564877 -0.2257763   0.0675282  -1.42474819]
 [-0.54438272  0.11092259 -1.15099358  0.37569802]]


In [120]:
# Using OR (|) to select 'Bob' or 'Will'
# Each comparison must be parenthesised because | binds tighter than ==.
mask = (names == 'Bob') | (names == 'Will')
selected_rows = data[mask]
print("\nRows where name is 'Bob' OR 'Will':\n", selected_rows)


Rows where name is 'Bob' OR 'Will':
 [[ 0.49671415 -0.1382643   0.64768854  1.52302986]
 [-0.46947439  0.54256004 -0.46341769 -0.46572975]
 [ 0.24196227 -1.91328024 -1.72491783 -0.56228753]
 [-1.01283112  0.31424733 -0.90802408 -1.4123037 ]]


In [125]:
# Both assignments below modify `data` in place, so re-running this cell on its
# own will not reproduce the first printout; re-run the cell that creates
# `data` first.

# Setting all negative values to 0
data[data < 0] = 0
print("\nData after replacing negatives with 0:\n", data)

# Setting entire rows to 7 if name is NOT 'Joe'
# (a 1-D row mask on a 2-D array selects whole rows)
data[names != 'Joe'] = 7
print("\nData after setting rows (not 'Joe') to 7:\n", data)


Data after replacing negatives with 0:
 [[0.49671415 0.         0.64768854 1.52302986]
 [0.         0.         1.57921282 0.76743473]
 [0.         0.54256004 0.         0.        ]
 [0.24196227 0.         0.         0.        ]
 [0.         0.31424733 0.         0.        ]
 [1.46564877 0.         0.0675282  0.        ]
 [0.         0.11092259 0.         0.37569802]]

Data after setting rows (not 'Joe') to 7:
 [[7.         7.         7.         7.        ]
 [0.         0.         1.57921282 0.76743473]
 [7.         7.         7.         7.        ]
 [7.         7.         7.         7.        ]
 [7.         7.         7.         7.        ]
 [1.46564877 0.         0.0675282  0.        ]
 [0.         0.11092259 0.         0.37569802]]


In [126]:
# Seeded 5x5 matrix of random integers; `high` is exclusive, so values are 10..99.
np.random.seed(42)
data = np.random.randint(low=10, high=100, size=(5, 5))

print("Original Data:\n", data)

# Fancy indexing: a list of row positions returns those rows, in list order, as a copy.
row_indices = [0, 2, 4]
selected_rows = data[row_indices, :]

print("\nSelected Rows (indices 0, 2, 4):\n", selected_rows)

Original Data:
 [[61 24 81 70 30]
 [92 96 84 84 97]
 [33 12 31 62 11]
 [97 39 47 11 73]
 [69 30 42 85 67]]

Selected Rows (indices 0, 2, 4):
 [[61 24 81 70 30]
 [33 12 31 62 11]
 [69 30 42 85 67]]


In [127]:
# Selecting columns at index positions 1 and 3
# `:` keeps every row; the list picks the columns, in the order listed.
column_indices = [1, 3]
selected_columns = data[:, column_indices]

print("\nSelected Columns (indices 1, 3):\n", selected_columns)


Selected Columns (indices 1, 3):
 [[24 70]
 [96 84]
 [12 62]
 [39 11]
 [30 85]]


In [129]:
# Selecting elements at specific row-column pairs
# The two lists are paired position by position: (0, 3), (1, 0), (2, 4).
# The result is 1-D with one value per pair, not a 3x3 block.
selected_elements = data[[0, 1, 2], [3, 0, 4]]
print("\nSelected Elements:\n", selected_elements)


Selected Elements:
 [70 92 11]


In [128]:
# Selecting elements at specific row-column pairs
# Same selection as data[[0, 1, 2], [3, 0, 4]], with the index lists named.
row_indices = [0, 1, 2]
col_indices = [3, 0, 4]

# Pairs are formed position by position: (0, 3), (1, 0), (2, 4).
selected_elements = data[row_indices, col_indices]
print("\nSelected Elements:\n", selected_elements)


Selected Elements:
 [70 92 11]


In [130]:
# Selecting the last two rows
# Negative indices count from the end: -2 is the second-to-last row, -1 the last.
selected_last_rows = data[[-2, -1]]

print("\nSelected Last Two Rows:\n", selected_last_rows)


Selected Last Two Rows:
 [[97 39 47 11 73]
 [69 30 42 85 67]]


In [131]:
# Integers 0..31 laid out row by row in an 8x4 grid.
arr = np.arange(32).reshape(8, 4)
print(arr)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]
 [24 25 26 27]
 [28 29 30 31]]


In [132]:
# Rectangular selection: rows 1, 5, 7, 2 (in that order) with the columns
# reordered to 0, 3, 1, 2. np.ix_ builds the open mesh that makes the two
# index lists select a full block instead of element pairs.
arr[np.ix_([1, 5, 7, 2], [0, 3, 1, 2])]

array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

## **Transpose**

In [133]:
import numpy as np

# A 2x3 matrix to transpose.
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])

print("Original Array:\n", arr)

# transpose() with no arguments reverses the axes (same as arr.T):
# rows become columns, giving a 3x2 result.
arr_T = arr.transpose()
print("\nTransposed Array:\n", arr_T)

Original Array:
 [[1 2 3]
 [4 5 6]]

Transposed Array:
 [[1 4]
 [2 5]
 [3 6]]


In [137]:
# 0..11 arranged as 2 blocks x 3 rows x 2 columns.
arr_3d = np.arange(12).reshape((2, 3, 2))
print(arr_3d)
print("Original Shape:", arr_3d.shape)

# Reversing the axes turns (2, 3, 2) into (2, 3, 2) again: the shape looks
# unchanged only because the first and last axes have the same length --
# the elements themselves are rearranged.
arr_3d_T = arr_3d.transpose()
print(arr_3d_T)
print("Transposed Shape:", arr_3d_T.shape)

[[[ 0  1]
  [ 2  3]
  [ 4  5]]

 [[ 6  7]
  [ 8  9]
  [10 11]]]
Original Shape: (2, 3, 2)
[[[ 0  6]
  [ 2  8]
  [ 4 10]]

 [[ 1  7]
  [ 3  9]
  [ 5 11]]]
Transposed Shape: (2, 3, 2)


In [139]:
# swapaxes: exchange two named axes.
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])

print("Original Array:\n", arr)

# For a 2-D array, swapping axis 0 (rows) and axis 1 (columns) is a transpose.
swapped = np.swapaxes(arr, 0, 1)
print("\nSwapped Axes Array:\n", swapped)


Original Array:
 [[1 2 3]
 [4 5 6]]

Swapped Axes Array:
 [[1 4]
 [2 5]
 [3 6]]


In [147]:
# swapaxes in 3D
arr_3d = np.array([
    [[0, 1], [2, 3], [4, 5]],
    [[6, 7], [8, 9], [10, 11]],
])

# NOTE: the labels say "Shape" but the full arrays are printed.
# The shapes themselves are (2, 3, 2) before and (3, 2, 2) after the swap.
print("\nOriginal Shape:\n", arr_3d)

# Exchange axis 0 and axis 1; the last axis is untouched.
swapped = np.swapaxes(arr_3d, 0, 1)

print("\nSwapped Shape:\n", swapped)


Original Shape:
 [[[ 0  1]
  [ 2  3]
  [ 4  5]]

 [[ 6  7]
  [ 8  9]
  [10 11]]]

Swapped Shape:
 [[[ 0  1]
  [ 6  7]]

 [[ 2  3]
  [ 8  9]]

 [[ 4  5]
  [10 11]]]


In [148]:
# 6x3 standard-normal sample (unseeded, so values change on every run).
arr = np.random.randn(6, 3)
print(arr)
# Matrix product of the 3x6 transpose with the 6x3 original -> 3x3 symmetric matrix.
arr.T.dot(arr)

[[-2.61254901  0.95036968  0.81644508]
 [-1.523876   -0.42804606 -0.74240684]
 [-0.7033438  -2.13962066 -0.62947496]
 [ 0.59772047  2.55948803  0.39423302]
 [ 0.12221917 -0.51543566 -0.60025385]
 [ 0.94743982  0.291034   -0.63555974]]


array([[10.9121524 ,  1.41689014, -0.9988055 ],
       [ 1.41689014, 12.56575621,  3.57400405],
       [-0.9988055 ,  3.57400405,  2.53364975]])

In [153]:
arr = np.array([
    [[0, 1], [2, 3], [4, 5]],
    [[6, 7], [8, 9], [10, 11]],
])

# NOTE: the labels say "Shape" but the full arrays are printed.
print("Original Shape:\n", arr)  # shape is (2, 3, 2)

# The axes tuple lists the new order of the old axes: (1, 0, 2) swaps the
# first two axes and keeps the last, so the result has shape (3, 2, 2).
transposed = np.transpose(arr, axes=(1, 0, 2))
print("Transposed Shape:\n", transposed)

Original Shape:
 [[[ 0  1]
  [ 2  3]
  [ 4  5]]

 [[ 6  7]
  [ 8  9]
  [10 11]]]
Transposed Shape:
 [[[ 0  1]
  [ 6  7]]

 [[ 2  3]
  [ 8  9]]

 [[ 4  5]
  [10 11]]]


## **Universal Functions: Fast Element-Wise Array Functions**

In [163]:
# Operands for the universal-function examples: 0..9 and 10..19.
arr1 = np.arange(10)
arr2 = np.arange(10, 20)
# Print the arrays created in this cell. The original printed `arr`, a
# leftover from an earlier cell, so the output only matched by accident of
# kernel state.
print(arr1)
print(arr2)

[0 1 2 3 4 5 6 7 8 9]
[10 11 12 13 14 15 16 17 18 19]


In [155]:
# Element-wise square root of 0..9. Uses arr1 (defined in the ufunc setup cell)
# rather than the leftover `arr`, which in top-to-bottom order is a 3-D array.
np.sqrt(arr1)

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

In [156]:
# Element-wise square of 0..9. Uses arr1 (defined in the ufunc setup cell)
# rather than the leftover `arr`, which in top-to-bottom order is a 3-D array.
np.square(arr1)

array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81])

In [157]:
# Element-wise e**x for 0..9. Uses arr1 (defined in the ufunc setup cell)
# rather than the leftover `arr`, which in top-to-bottom order is a 3-D array.
np.exp(arr1)

array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
       5.45981500e+01, 1.48413159e+02, 4.03428793e+02, 1.09663316e+03,
       2.98095799e+03, 8.10308393e+03])

In [165]:
# For two 1-D arrays, dot() is the inner product: sum of element-wise products.
arr1.dot(arr2)

np.int64(735)

In [166]:
# Binary ufunc: compares the arrays position by position and keeps the larger value.
np.maximum(arr1, arr2)

array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

In [167]:
# Seven standard-normal draws scaled by 5 (unseeded, so values change per run).
arr = 5 * np.random.randn(7)
print(arr)

[-5.10776097 -0.80877693 -2.66824402 -0.02763931 -1.14725227  1.94674457
 -6.32559557]


In [None]:
# np.modf() splits floating-point numbers into their fractional and integer parts.
# It returns two arrays -- fractional part first, integral part second -- and
# both carry the sign of the input (note the -0. entries in the output).

remainder, whole_part = np.modf(arr)
print("Remainder:\n", remainder)
print("\nWhole Part:\n", whole_part)

Remainder:
 [-0.10776097 -0.80877693 -0.66824402 -0.02763931 -0.14725227  0.94674457
 -0.32559557]

Whole Part:
 [-5. -0. -2. -0. -1.  1. -6.]


In [1]:
import pandas as pd

# Read the JSON file (path is relative to the notebook's working directory)
# into a DataFrame.
df = pd.read_json(path_or_buf="processed_test.json")

# Write the frame to an Excel workbook; index=False leaves out the row-index column.
df.to_excel(excel_writer="processed_test.xlsx", index=False)