In [1]:
#Data Generation and Processing:
 #  - Generate a 2D data array `data` of shape (1000, 800), where the values follow a normal distribution with a mean of -1 and a standard deviation of 1.5.
 #  - Randomly set approximately 10% of the data points to NaN.
# - Print the shape of the original data `data` to confirm the dimensions are correct.
#  - Print an overview of the data after setting NaNs (e.g., use `np.isnan(data).sum()` to print the number of NaNs).\

import numpy as np

mu, sigma = -1, 1.5 # mean and standard deviation
data = np.random.normal(mu, sigma, 800000)
data = data.reshape((1000,800))

# choosing random indexes to put NaN
index_nan = np.random.choice(data.size, int(((1000*800)/10)), replace=False)
 
# adding nan to the data.
data.ravel()[index_nan] = np.nan

print("Original Data Shape: " + str(data.shape))
print("Number of Nan: " + str(np.isnan(data).sum()))





Original Data Shape: (1000, 800)
Number of Nan: 80000


In [2]:
'''2. Data Imputation and Standardization:
   - Impute all NaN values with the mean of the non-NaN values in the dataset.
   - Standardize the imputed data so that the new dataset has a mean close to 0 and a standard deviation close to 1.
   - Print the new mean and standard deviation of the dataset after imputation.
   - Print the mean and standard deviation of the standardized dataset `data_normalized` to confirm it is close to 0 and 1.'''
mean = np.nanmean(data)
data[np.isnan(data)] = mean
print("Mean after imputation: " + str(np.mean(data)))
print("STD after imputation: " +str(np.std(data)))
print()

data -= (np.mean(data,axis=0)-0.001)
data /= np.std(data, axis=0)
print("Mean after standardization: " + str(np.mean(data)))
print("STD afte standardization: " +str(np.std(data)))

Mean after imputation: -1.0035378272649023
STD after imputation: 1.4249103734125055

Mean after standardization: 0.0007027572194882629
STD afte standardization: 1.0000000001394165


In [3]:
'''3. Masking Operation and Data Selection:
   - Create a mask `mask` to select all elements greater than the mean in the standardized data.
   - Use the mask `mask` to select and store all elements greater than the mean from `data_normalized` into `filtered_data`.
   - Print the number of elements selected by the mask.
   - Print the first 10 elements of `filtered_data` to confirm the masking operation is correct.'''

mask = (data > np.mean(data))
filtered_data = data[mask]
print("\nFiltered_data size: " +str(filtered_data.size))
print("\nFirst 10 elemetent: "+str(filtered_data[:10]))



Filtered_data size: 400825

First 10 elemetent: [0.04910563 0.82705679 0.08924769 0.61052292 0.29972401 1.87997847
 0.6111632  1.75240069 0.43215538 1.40466572]


In [4]:
'''4. Data Modification and Shape Change:
   - Multiply all values in `filtered_data` by 2.
   - Set all elements in `filtered_data` that are less than 1 to 0.
   - Get the first 320,000 number, than change the shape of `filtered_data` to (200, 1600).
   - After modifying `filtered_data` (such as multiplying by 2 and setting values less than 1 to 0), print the first 10 modified elements.
   - Print the new shape of `filtered_data` and confirm that it is (200, 1600).'''

filtered_data = filtered_data *2
filtered_data[filtered_data < 1] = 0

filtered_data = filtered_data[0:320000]
filter_data = np.reshape(filtered_data,(200,1600))

a  = np.ravel(filter_data)
print("First 10 elements: \n" +str(a[:10]))
print("\nshape: " +str(filter_data.shape))


First 10 elements: 
[0.         1.65411359 0.         1.22104585 0.         3.75995694
 1.2223264  3.50480137 0.         2.80933145]

shape: (200, 1600)
