[Reference](https://blog.stackademic.com/numpy-in-real-world-data-science-projects-abfb517507e1)

# Handling Missing Data


In [2]:
import numpy as np

In [3]:
# Creating an array with missing values
data = np.array([1, 2, np.nan, 4, 5])

# Checking for missing values
has_missing = np.isnan(data)

# Filling missing values with a specific value or the mean
data[has_missing] = 0  # Replace with a specific value
# Alternatively:
# data[has_missing] = np.nanmean(data)

print(data)

[1. 2. 0. 4. 5.]


In [4]:
# create an array with missing data
arr = np.array([1, 2, np.nan, 4, 5])

# fill missing data with zero
arr_filled = np.nan_to_num(arr)

print(arr_filled)

[1. 2. 0. 4. 5.]


In [5]:
# create a 2D array with missing data
arr = np.array([[1, 2, np.nan], [4, np.nan, 6], [7, 8, 9]])

# delete rows with missing data
arr_no_missing = np.delete(arr, np.where(np.isnan(arr).any(axis=1)), axis=0)

print(arr_no_missing)

[[7. 8. 9.]]


In [6]:
# create an array with missing data
arr = np.array([1, 2, np.nan, 4, 5])

# impute missing data with mean
arr_imputed = np.where(np.isnan(arr), np.mean(arr[~np.isnan(arr)]), arr)

print(arr_imputed)

[1. 2. 3. 4. 5.]


In [7]:
# Creating an array with outliers
data = np.array([1, 2, 8, 4, 5, 15])

# Identifying outliers based on z-score
z_scores = np.abs((data - np.mean(data)) / np.std(data))
is_outlier = z_scores > 2  # Adjust the threshold as needed

# Removing outliers
cleaned_data = data[~is_outlier]

print(cleaned_data)

[ 1  2  8  4  5 15]


In [8]:
# create an array with outliers
arr = np.array([1, 2, 3, 100, 5, 6])

# clip values outside the range of the 1st and 99th percentiles
arr_clipped = np.clip(arr, np.percentile(arr, 1), np.percentile(arr, 99))

print(arr_clipped)

[ 1.05  2.    3.   95.3   5.    6.  ]


In [9]:
# create an array with outliers
arr = np.array([1, 2, 3, 100, 5, 6])

# calculate mean and standard deviation
mean = np.mean(arr)
std = np.std(arr)

# calculate z-scores
z_scores = np.abs((arr - mean) / std)

# remove values with z-scores greater than 3
arr_filtered = arr[z_scores <= 3]

print(arr_filtered)

[  1   2   3 100   5   6]


In [10]:
# create an array with outliers
arr = np.array([1, 2, 3, 100, 5, 6])

# calculate median
median = np.median(arr)

# replace values outside the range of the 1st and 99th percentiles with the median
arr_filtered = np.where(np.logical_or(arr < np.percentile(arr, 1), arr > np.percentile(arr, 99)), median, arr)

print(arr_filtered)

[4. 2. 3. 4. 5. 6.]


# NumPy for Machine Learning

In [11]:
from sklearn.linear_model import LinearRegression

# create a NumPy array
arr = np.array([[1], [2], [3], [4], [5]])

# create a linear regression model
model = LinearRegression()

# train the model
model.fit(arr, np.array([2, 4, 6, 8, 10]))

# make a prediction
prediction = model.predict(np.array([[6]]))

print(prediction)

[12.]


In [13]:
import tensorflow as tf

# create a NumPy array
arr = np.array([[1, 2, 3], [4, 5, 6]])

# create a TensorFlow tensor
tensor = tf.constant(arr)

# perform matrix multiplication
result = tf.matmul(tensor, tensor)

# print the result
print(result)

In [15]:
import keras.backend as K

# create a NumPy array
arr = np.array([[1, 2, 3], [4, 5, 6]])

# create a Keras tensor
tensor = K.constant(arr)

# perform matrix multiplication
result = K.dot(tensor, tensor)

# print the result
print(result)