# Datasets_and_benchmarks Group
### Collect existing datasets and benchmarks:
- Image Classification: MNIST, CIFAR, ImageNet
- Image Segmentation: VOC2012-2014
- Video data: Kinetics 400, UCF101
- Neural data: NLB, Sensorium, MTNeuro, TUH, HCP

### Steps to pre-process data:
- Data cleaning: Clean and remove any inconsistent or missing data
- Data normalization: Normalize the data so that all features have the same scale and units.
- Feature extraction: Extract meaningful features from the raw data
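- Data augmentation: Apply label-preserving transformations (e.g., flips, crops, rotations) to increase the diversity of the training data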
- Data splitting: Split the data into training, validation, and test sets

### Publish new datasets and benchmarks: (NeurIPS guidelines)
- Formatting: data format and read/write tools
- Documentation: instructions to use the data and associated tools
- Maintenance: how the datasets are archived and updated over time
- Ethics: biases in the data, personally identifiable information within the data
- Licensing: how the data can be distributed for research/commercial purposes


In [1]:
## import all packages here
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# IterativeImputer is still experimental in scikit-learn: this enabling import
# must run before IterativeImputer can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, KNNImputer
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torchvision.utils import make_grid

In [15]:
class load_data:
    """Fetch a torchvision benchmark dataset as a ``(train, test)`` pair.

    Parameters
    ----------
    key : str
        Name of the dataset to load. ``'MNIST'`` and ``'cifar'`` (CIFAR-10)
        are supported; matching is case-insensitive and ``'cifar10'`` is
        accepted as an alias.
    categray : optional
        Stored on the instance as given; ``collect_data`` does not read it.
    """

    def __init__(self, key, categray=None):
        self.key = key
        self.categray = categray

    def collect_data(self):
        """Download (if needed) and return the selected dataset.

        Returns
        -------
        tuple
            ``(train_dataset, test_dataset)`` torchvision datasets, each with
            ``ToTensor`` followed by ``Normalize`` applied.

        Raises
        ------
        ValueError
            If ``self.key`` does not name a supported dataset.
        """
        name = str(self.key).lower()

        if name == 'mnist':
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,)),
            ])
            # train=True selects the training split; the previous code passed
            # train=False here and so returned the test split twice.
            train_dataset = datasets.MNIST(root='../data/MNIST', train=True,
                                           download=True, transform=transform)
            test_dataset = datasets.MNIST(root='../data/MNIST', train=False,
                                          download=True, transform=transform)
            return train_dataset, test_dataset

        if name in ('cifar', 'cifar10'):
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
            ])
            train_dataset = datasets.CIFAR10(root='./data', train=True,
                                             download=True, transform=transform)
            test_dataset = datasets.CIFAR10(root='./data', train=False,
                                            download=True, transform=transform)
            return train_dataset, test_dataset

        # Fail loudly instead of hitting an unbound local at the return.
        raise ValueError(
            f"Unsupported dataset key {self.key!r}; expected 'MNIST' or 'cifar'."
        )

In [16]:
class pre_process:
    """Batching, visualisation and missing-value helpers for a dataset pair.

    Parameters
    ----------
    train_dataset, test_dataset
        Datasets to be wrapped in data loaders by ``pre_process_function``.
    pre_process_list
        Stored on the instance as given; none of the methods below read it.
    """

    def __init__(self, train_dataset, test_dataset, pre_process_list):
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.pre_process_list = pre_process_list

    def pre_process_function(self, batch_size=None):
        """Wrap the stored datasets in shuffling ``DataLoader`` objects.

        Parameters
        ----------
        batch_size : int, optional
            Samples per batch. When omitted, a module-level ``bs`` is used if
            one exists (the name this method used to read implicitly),
            otherwise 64.

        Returns
        -------
        tuple
            ``(trainloader, testloader)``.
        """
        if batch_size is None:
            # Backward compatibility: earlier code relied on a global ``bs``.
            batch_size = globals().get('bs', 64)
        # Use the datasets held by this instance rather than same-named globals.
        trainloader = torch.utils.data.DataLoader(
            self.train_dataset, batch_size=batch_size, shuffle=True)
        testloader = torch.utils.data.DataLoader(
            self.test_dataset, batch_size=batch_size, shuffle=True)
        return trainloader, testloader

    def show_single_class(self, dl, label):
        """Display every image with the given label from the first batch.

        Only the first 100 samples of the batch are inspected; shorter batches
        are handled without error.
        """
        imgs, lbls = next(iter(dl))  # iterator objects no longer expose .next()
        for img, lbl in zip(imgs[:100], lbls[:100]):
            if lbl == label:
                self.imshow(make_grid(img))

    def imshow(self, img, mean=0.5, std=0.5):
        """Un-normalise a ``(C, H, W)`` image tensor and display it.

        Parameters
        ----------
        img : torch.Tensor
            Normalised image in channel-first layout.
        mean, std : float
            Scalar statistics that were used to normalise ``img``. The
            defaults reproduce the previous ``img / 2 + 0.5`` behaviour.
        """
        img = img * std + mean   # undo Normalize(mean, std)
        npimg = img.detach().cpu().numpy()   # convert from tensor
        # matplotlib expects channel-last (H, W, C).
        plt.imshow(np.transpose(npimg, (1, 2, 0)))
        plt.show()

    def show_batch(self, dl):
        """Display the first batch of a data loader as a 16-column image grid.

        Does nothing when the loader yields no batches.
        """
        batch = next(iter(dl), None)
        if batch is None:
            return
        images, _ = batch
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.set_xticks([])
        ax.set_yticks([])
        ax.imshow(make_grid(images, nrow=16).permute(1, 2, 0))
        plt.show()

    def missing_data_proc(self, df, type='mean', del_cols=None):
        """Process missing data and return the resulting DataFrame.

        The input DataFrame is never modified in place.

        Parameters
        ----------
        df : pandas.DataFrame
            Input dataset.
        type : str
            Imputation strategy: ``'mean'``, ``'median'``, ``'mode'``,
            ``'Delete Columns'``, ``'MICE'`` or ``'KNN'``. Any other value
            prints a message and returns ``df`` unchanged.
        del_cols : list, optional
            Columns to drop when ``type == 'Delete Columns'``.

        Returns
        -------
        pandas.DataFrame
        """
        if del_cols is None:
            del_cols = []

        if type == 'mean':
            # numeric_only avoids a TypeError on non-numeric columns.
            df = df.fillna(df.mean(numeric_only=True))
        elif type == 'median':
            df = df.fillna(df.median(numeric_only=True))
        elif type == 'mode':
            # df.mode() returns a DataFrame (one row per tied mode); passing it
            # to fillna aligns on the index and fills only matching rows. Use
            # the first mode of each column as a per-column fill value instead.
            modes = df.mode()
            if not modes.empty:
                df = df.fillna(modes.iloc[0])
        elif type == 'Delete Columns':
            # drop() returns a new frame; the result must be kept.
            df = df.drop(columns=del_cols)
        elif type == 'MICE':
            mice_imputer = IterativeImputer()
            imputed_data = mice_imputer.fit_transform(df)
            df = pd.DataFrame(imputed_data, columns=df.columns, index=df.index)
        elif type == 'KNN':
            knn_imputer = KNNImputer(n_neighbors=5)
            imputed_data = knn_imputer.fit_transform(df)
            df = pd.DataFrame(imputed_data, columns=df.columns, index=df.index)
        else:
            print("Please enter a valid type of Missing data processing.")

        return df
        


        
   
    