#Data Operations
- Here is a collection of the different data operations that we perform on the different kinds of datasets using List, Pandas, Numpy & Scikit-learn.

## Data preparation and cleaning
- Here we are cleaning the unwanted None values and then applying min - max scaling on the data to normalize the data values between 0 to 1.

- Min - Max Scaling:

  Normal_Value = (value - min(value)) / (max(value) - min(value))

In [7]:
## Implementation using a List
# Defining a dataset using a vanilla list
list_data = [
    [5.0, 3.2, None],
    [4.8, None, 2.1],
    [None, 3.6, 2.4]
]

# This is a complex list comphension where it is excluding the None values, sums the values column wise and finds the average of the each row
means = [
    sum(filter(None, [row[i] for row in list_data])) / len([row[i] for row in list_data if row[i] is not None])
    for i in range(len(list_data[0]))
]

cleaned_data = [
    [row[i] if row[i] is not None else means[i] for i in range(len(row))]
    for row in list_data
]

#Here traversing column wise on the transpose of the cleaned_data to pull out the minimun and maximun for each row
min_values = [min(col) for col in zip(*cleaned_data)]
max_values = [max(col) for col in zip(*cleaned_data)]

# Normalizing the dataset for optimized operations
normalized_data = [
    [(val - min_values[i]) / (max_values[i] - min_values[i]) for i, val in enumerate(row)]
    for row in cleaned_data
]

print("Normalized Data:", normalized_data)

Normalized Data: [[1.0, 0.0, 0.5], [0.0, 0.5000000000000006, 0.0], [0.5000000000000022, 1.0, 1.0]]


In [8]:
## Implementation using a Numpy array
# Defining a dataset using a Numpy array
import numpy as np

np_data = np.array([
    [5.0, 3.2, np.nan],
    [4.8, np.nan, 2.1],
    [np.nan, 3.6, 2.4]
])

# Replace NaNs with column means
col_means = np.nanmean(np_data, axis=0)
data = np.where(np.isnan(np_data), col_means, np_data)

# Normalize using Min-Max scaling
data_min = data.min(axis=0)
data_max = data.max(axis=0)
normalized_data = (data - data_min) / (data_max - data_min)

print("Normalized Data:", normalized_data)

Normalized Data: [[1.  0.  0.5]
 [0.  0.5 0. ]
 [0.5 1.  1. ]]


In [9]:
## Implementation using a Pandas dataframe
# Defining a dataset using a Pandas dataframe
import pandas as pd

# Convert data to Pandas DataFrame
pd_data = pd.DataFrame({
    "Feature1": [5.0, 4.8, None],
    "Feature2": [3.2, None, 3.6],
    "Feature3": [None, 2.1, 2.4]
})

# Fill missing values with column means
pd_data.fillna(pd_data.mean(), inplace=True)

# Normalize using Min-Max scaling
normalized_data = (data - data.min()) / (data.max() - data.min())

print("Normalized Data:\n", normalized_data)

Normalized Data:
 [[1.         0.37931034 0.05172414]
 [0.93103448 0.44827586 0.        ]
 [0.96551724 0.51724138 0.10344828]]


## Feature Engineering
- Encode categorical data into numerical data. Using Encoders
- Scale numerical features into managable values. Using Scaler

In [10]:
## Implementing using a numpy array
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Categorical data
categories = np.array(["low", "medium", "high", "medium", "low"])

# Encode categories to integers
le = LabelEncoder()
encoded_categories = le.fit_transform(categories)

# Numerical data
numerical_data = np.array([[5.5, 2.3], [6.1, 3.4], [4.8, 1.2]])

# Standardize features
scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(numerical_data)

print("Encoded Categories:", encoded_categories)
print("Scaled Numerical Data:\n", scaled_numerical)

Encoded Categories: [1 2 0 2 1]
Scaled Numerical Data:
 [[ 0.06274558  0.        ]
 [ 1.19216603  1.22474487]
 [-1.25491161 -1.22474487]]


In [11]:
# Categorical and numerical data
pdf_data = pd.DataFrame({
    "Category": ["low", "medium", "high", "medium", "low"],
    "Feature1": [5.5, 6.1, 4.8, 7.0, 5.2],
    "Feature2": [2.3, 3.4, 1.2, 2.8, 2.5]
})

# Encode categories
pdf_data["Category"] = pdf_data["Category"].astype("category").cat.codes

# Scale numerical features
pdf_data[["Feature1", "Feature2"]] = (pdf_data[["Feature1", "Feature2"]] - pdf_data[["Feature1", "Feature2"]].mean()) / pdf_data[["Feature1", "Feature2"]].std()

print("Transformed Data:\n", pdf_data)


Transformed Data:
    Category  Feature1  Feature2
0         1 -0.256265 -0.173249
1         2  0.442639  1.187995
2         0 -1.071653 -1.534494
3         2  1.490995  0.445498
4         1 -0.605717  0.074250


## Sparse Data handling

In [12]:
from scipy.sparse import csr_matrix

# Create a sparse matrix
dense_matrix = np.array([
    [0, 0, 3],
    [0, 0, 0],
    [7, 0, 0]
])
sparse_matrix = csr_matrix(dense_matrix)

# Operations on sparse matrix
print("Sparse Matrix:\n", sparse_matrix)
print("Dense Representation:\n", sparse_matrix.toarray())


Sparse Matrix:
   (0, 2)	3
  (2, 0)	7
Dense Representation:
 [[0 0 3]
 [0 0 0]
 [7 0 0]]


## Model input preparation

In [13]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("Train Features Shape:", X_train.shape)
print("Test Labels Shape:", y_test.shape)


Train Features Shape: (120, 4)
Test Labels Shape: (30,)


## Handling Multi-Dimensional Data with Tensors

In [None]:
import torch

# Simulating random image data: Batch of 3 images, RGB channels, 64x64 resolution
images = torch.rand(3, 3, 64, 64)

# Checking dimensions
print("Shape of Images Tensor:", images.shape)

# Normalize pixel values (0-1 range)
# Min - Max Scaling
normalized_images = (images - images.min()) / (images.max() - images.min())

# Reshape for model input (if needed)
reshaped_images = normalized_images.view(-1, 3, 64, 64)


## Data Pipeline with Pandas and NumPy
-> Load Data -> Handle missing values -> Enocode catagorical features -> Scale numerical features -> Split Dataset

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
import numpy as np


# Load dataset
pipe_data = pd.DataFrame({
    "Category": ["low", "medium", "high", "medium", "low", "high"],
    "Feature1": [5.5, 6.1, np.nan, 7.0, 5.2, 6.8],
    "Feature2": [2.3, 3.4, 1.2, np.nan, 2.5, 3.1],
    "Target": [0, 1, 0, 1, 0, 1]
})

# 1. Handle missing values
pipe_data.fillna(pipe_data.mean(), inplace=True)

# 2. Encode categorical features
le = LabelEncoder()
pipe_data["Category"] = le.fit_transform(pipe_data["Category"])

# 3. Scale numerical features
scaler = StandardScaler()
data[["Feature1", "Feature2"]] = scaler.fit_transform(data[["Feature1", "Feature2"]])

# 4. Prepare features and labels
X = data[["Category", "Feature1", "Feature2"]].values
y = data["Target"].values

# 5. Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("Training Features:\n", X_train)
print("Training Labels:\n", y_train)