# This is a systematic guide to data preprocessing for machine learning.

## Load the Data

In [None]:
import pandas as pd
import numpy as np

# Load the dataset into a DataFrame.
file_path = "D:\\nur\\Machine Learning\\Codes\\demo_data\\titanic\\train.csv"  # Replace with actual file path
data = pd.read_csv(file_path)

# Display basic info
print(data.head())  # Show the first few rows
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
data.info()  # Check for missing values and data types

## Handle Missing Data

In [None]:
# The strategies below are alternatives: once rows with missing values have
# been dropped there is nothing left to impute. Pick one with this flag.
drop_missing_rows = True  # Set to False to impute (Options 2 and 3) instead

if drop_missing_rows:
    # Option 1: Drop rows with missing values (only if few rows are affected)
    data = data.dropna()
else:
    # Option 2: Fill missing values in numerical columns with the column mean
    # (swap .mean for .median or .max to impute with a different statistic).
    # numeric_only=True is needed because pandas 2.0+ raises TypeError when
    # mean() is asked to average non-numeric (e.g. string) columns.
    data = data.fillna(data.mean(numeric_only=True))

    # Option 3: Fill missing values in categorical columns with the mode.
    # The result is assigned back to the column: calling fillna(inplace=True)
    # on a column selection is chained assignment, which recent pandas
    # versions deprecate and which does not update the frame under
    # Copy-on-Write.
    for col in data.select_dtypes(include=['object']).columns:
        modes = data[col].mode()
        if not modes.empty:  # A column that is entirely missing has no mode
            data[col] = data[col].fillna(modes.iloc[0])


## Handle Duplicates

In [None]:
# Keep only the first occurrence of each row; later exact repeats are removed.
data = data[~data.duplicated(keep="first")]

## Feature Engineering

In [None]:
# Reorder the columns so that the label (ground-truth) column comes last.
label_col = "target"  # Change to your actual label column name
cols = list(filter(lambda name: name != label_col, data.columns))
cols.append(label_col)
data = data[cols]

In [None]:
# Remove features that should not be fed to the model.
drop_columns = ["unnecessary_column1", "unnecessary_column2"]  # Replace with actual column names
data = data.drop(drop_columns, axis="columns")

In [None]:
# One-Hot Encoding for Categorical Variables
categorical_columns = ["category1", "category2"]  # Replace with actual categorical columns

# dtype="uint8": pandas 2.0+ creates bool dummy columns by default, and a frame
# that mixes bool and numeric columns converts to an object-dtype NumPy array,
# which torch.tensor cannot consume. Integer dummies keep the frame numeric.
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True, dtype="uint8")

# get_dummies appends the encoded columns after the remaining ones, which
# pushes the label column out of last position. Move it back so that the
# "last column is the target" convention used by the split still holds.
if label_col in data.columns:
    data = data[[col for col in data.columns if col != label_col] + [label_col]]

In [None]:
# Feature Scaling (Normalization or Standardization)
from sklearn.preprocessing import MinMaxScaler, StandardScaler

numerical_columns = ["feature1", "feature2"]  # Replace with actual numerical feature names

# The two scalers are alternatives, not successive steps. Standardizing
# min-max-scaled data gives the same result as standardizing the raw data,
# so applying both only wastes the first pass. Choose one:
#   "minmax"   -> Normalization (Min-Max Scaling, range [0, 1])
#   "standard" -> Standardization (Mean = 0, Std = 1)
scaling_method = "standard"
scaler_classes = {"minmax": MinMaxScaler, "standard": StandardScaler}
scaler = scaler_classes[scaling_method]()

# NOTE: the scaler is fitted on the full dataset, i.e. before the train/test
# split, so statistics of the future test rows influence the scaling. For a
# strict evaluation, fit on the training rows only and reuse the fitted
# scaler's transform() on the test rows.
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

## Split Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the last one; the target is the last column.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Hold out 20% of the rows for testing (fixed seed for reproducibility) and
# convert each part to a NumPy array (optional, convenient for PyTorch).
X_train, X_test, y_train, y_test = (
    np.array(part)
    for part in train_test_split(X, y, test_size=0.2, random_state=42)
)

# X_train can be split further into X_train and X_val if a validation set is needed.

## Convert Data to PyTorch Dataset (for Neural Networks)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Custom PyTorch dataset class
class CustomDataset(Dataset):
    """In-memory dataset that pairs feature rows with targets as float32 tensors.

    Args:
        X: Array-like of features, one sample per entry along the first axis.
        y: Array-like of targets with the same number of samples as ``X``.
            A 1-D input is stored as an ``(N, 1)`` column; input that already
            has more than one dimension is stored with its shape unchanged.

    Raises:
        ValueError: If ``X`` and ``y`` hold a different number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        targets = torch.tensor(y, dtype=torch.float32)
        # Only add the trailing axis when it is missing; unsqueezing a target
        # that is already 2-D would produce an unintended 3-D tensor.
        if targets.ndim == 1:
            targets = targets.unsqueeze(1)
        self.y = targets

        # Fail early instead of raising an IndexError (or silently pairing the
        # wrong rows) somewhere in the middle of training.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.X[index], self.y[index]

# Wrap the train and test arrays in datasets.
train_dataset = CustomDataset(X=X_train, y=y_train)
test_dataset = CustomDataset(X=X_test, y=y_test)

# Batch both sets in groups of 32; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)
