In [1]:
import scipy.io as scio
import torch as t
def reader_breast():
    """Load the breast dataset stored in ``datasets/breast.mat``.

    The ``'breast'`` entry of the MAT file is densified with ``.toarray()``;
    its last column is used as the label and every other column as a feature.

    Returns:
        tuple: ``(features, targets)`` as ``torch.Tensor`` objects. ``targets``
        keeps the value 1 where the stored label equals 1 and is 0 everywhere
        else.
    """
    table = scio.loadmat('datasets/breast.mat')['breast'].toarray()

    features = t.Tensor(table[:, :-1])
    labels = t.Tensor(table[:, -1])

    # Keep label 1 as the positive class; every other label collapses to 0.
    targets = t.where(labels == 1, labels, t.zeros_like(labels))
    return features, targets

In [3]:
def reader_australian():
    """Load the Australian dataset stored in ``datasets/Australian.mat``.

    The MAT file is read for two entries: ``'fea'`` (densified with
    ``.toarray()``) holds the features and ``'gnd'`` holds the labels.

    Returns:
        tuple: ``(features, targets)`` as ``torch.Tensor`` objects. ``targets``
        is squeezed, keeps the value 1 where the stored label equals 1 and is
        0 everywhere else.
    """
    mat = scio.loadmat('datasets/Australian.mat')

    features = t.Tensor(mat['fea'].toarray())
    labels = t.Tensor(mat['gnd']).squeeze()

    # Keep label 1 as the positive class; every other label collapses to 0.
    targets = t.where(labels == 1, labels, t.zeros_like(labels))
    return features, targets

In [125]:
import os
import ssl
import torch
import pandas as pd
import numpy as np
import urllib.request
from pathlib import Path
from joblib import dump,load
from ucimlrepo import fetch_ucirepo
from sklearn.datasets import fetch_20newsgroups,fetch_openml
from sklearn.feature_extraction.text import CountVectorizer

# Point both proxy environment variables at a local proxy on port 1066 so that
# libraries honouring them send their downloads through it.
# NOTE(review): this address is machine-specific; confirm a proxy is listening
# there before running the download helpers on another machine.
os.environ.update({
    'http_proxy': '127.0.0.1:1066',
    'https_proxy': '127.0.0.1:1066',
})

# NOTE(review): replaces the default HTTPS context factory with the unverified
# one for the whole process, i.e. certificate checks are skipped wherever the
# default context is used. Consider limiting this to trusted networks.
ssl._create_default_https_context = ssl._create_unverified_context

def _preprocess_data(uci_id):
    """Fetch a UCI dataset and return its complete rows as tensors.

    Args:
        uci_id: Dataset id, forwarded unchanged to ``fetch_ucirepo``.

    Returns:
        tuple: ``(features, targets)``. ``features`` is built from every
        column but the last of the joined feature/target table, ``targets``
        from the last column, flattened to one dimension.
    """
    payload = fetch_ucirepo(id=uci_id).data

    # Join features and targets side by side so that a missing value on either
    # side removes the whole row.
    complete_rows = pd.concat([payload.features, payload.targets], axis=1).dropna()

    # The last column of the joined table is taken as the target.
    feature_frame = complete_rows.iloc[:, :-1]
    target_column = complete_rows.iloc[:, -1]

    features = torch.Tensor(feature_frame.to_numpy())
    targets = torch.Tensor(target_column.to_numpy()).flatten()
    return features, targets

def get_uci_dataset_by_id(uci_id):

    match int(uci_id):
        case 15:
            # Breast
            features, targets = _preprocess_data(uci_id)
            targets = torch.where(targets != 4, torch.tensor(0.), torch.tensor(1.)) # 4为恶性 2为良性 改为1,0
            print(f'{features.shape}, p:{torch.sum(targets).item()}, n:{torch.sum(targets != 1).item()}')
            return features, targets
        case 143:
            # Australian
            features, targets = _preprocess_data(uci_id)
            targets = torch.where(targets != 1, torch.tensor(0.), torch.tensor(1.)) # 1 为正类 2为负 改为1,0
            print(f'{features.shape}, p:{torch.sum(targets).item()}, n:{torch.sum(targets != 1).item()}')
            return features, targets
        case 267:
            # banknote
            features, targets = _preprocess_data(uci_id)
            targets = torch.where(targets != 1, torch.tensor(0.), torch.tensor(1.)) # 1 为正类 2为负 改为1,0
            print(f'{features.shape}, p:{torch.sum(targets).item()}, n:{torch.sum(targets != 1).item()}')
            return features, targets
        case 171:
            # Madelon
            pass
        case 327:
            # Phishing
            features, targets = _preprocess_data(uci_id)
            targets = torch.where(targets != 1, torch.tensor(0.), torch.tensor(1.)) # 1 为正类 2为负 改为1,0
            print(f'{features.shape}, p:{torch.sum(targets).item()}, n:{torch.sum(targets != 1).item()}')
            return features, targets
        case 105:
            # Vote
            features, targets = _preprocess_data(uci_id)
            targets = torch.where(targets != 1, torch.tensor(0.), torch.tensor(1.)) # 1 为正类 2为负 改为1,0
            print(f'{features.shape}, p:{torch.sum(targets).item()}, n:{torch.sum(targets != 1).item()}')
            return features, targets
        # case 73:
        #     # fetch dataset
        #     dataset = fetch_ucirepo(id=uci_id)
            
        #     # data (as pandas dataframes)
        #     X = dataset.data.features
        #     y = dataset.data.targets
        #     # 检测并删除缺失值
        #     combined = pd.concat([X, y], axis=1)
        #     combined_cleaned = combined.dropna()
    
        #     # 分离 features 和 targets
        #     X_cleaned = combined_cleaned.iloc[:, :-1]
        #     y_cleaned = combined_cleaned.iloc[:, -1]

        #     # print(X_cleaned)
        #     from sklearn.preprocessing import LabelEncoder
        #     label_encoder_X = LabelEncoder()
        #     X_cleaned = X_cleaned.apply(lambda col: label_encoder_X.fit_transform(col))
            
        #     label_encoder_y = LabelEncoder()
        #     y_cleaned = label_encoder_y.fit_transform(y_cleaned)
    
        #     # 转换为 PyTorch 张量
        #     features = torch.Tensor(X_cleaned.to_numpy())
        #     targets = torch.Tensor(y_cleaned).flatten()
        #     return features, targets

        case 73:
            # fetch dataset
            dataset = fetch_ucirepo(id=uci_id)
            
            # data (as pandas dataframes)
            X = dataset.data.features
            y = dataset.data.targets
            # 检测并删除缺失值
            combined = pd.concat([X, y], axis=1)
            combined_cleaned = combined.dropna()
    
            # 分离 features 和 targets
            X_cleaned = combined_cleaned.iloc[:, :-1]
            y_cleaned = combined_cleaned.iloc[:, -1]

            X_vectorized = pd.get_dummies(X_cleaned)
            
            # 如果 y 是分类变量，也需要转换
            # 比如如果 y 也是 'p' 和 'e' 两种类别
            y_vectorized = pd.get_dummies(y_cleaned)
    
            # 转换为 PyTorch 张量
            features = torch.Tensor(X_vectorized.to_numpy())
            targets = torch.Tensor(y_vectorized.to_numpy())[:,1]
            return features, targets
        
    
        case _:
            # 所有不需要特殊处理的都可以直接转换
            features, targets = _preprocess_data(uci_id)
            targets = torch.where(targets != 1, torch.tensor(0.), torch.tensor(1.)) # 改为1,0 
            print(f'{features.shape}, p:{torch.sum(targets).item()}, n:{torch.sum(targets != 1).item()}')
            return features, targets

def get_news_dataset_by_categories():
    """Build a binary "comp vs. rec" dataset from 20 Newsgroups.

    Documents from the five ``comp.*`` and four ``rec.*`` newsgroups are turned
    into binary bag-of-words vectors over the 200 most frequent words, and each
    feature column is min-max scaled to the range [-1, 1].

    Returns:
        tuple: ``(features, targets)`` as ``torch.Tensor`` objects. ``targets``
        is 1 for ``comp.*`` documents and 0 for ``rec.*`` documents.
    """
    pos = "comp"
    neg = "rec"
    categories = [
        'comp.graphics',
        'comp.os.ms-windows.misc',
        'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware',
        'comp.windows.x',
        'rec.autos',
        'rec.motorcycles',
        'rec.sport.baseball',
        'rec.sport.hockey',
    ]

    # fetch_20newsgroups keeps its own cache below data_home and only downloads
    # when that cache is missing, so one call covers both situations. It has no
    # `load_archive` parameter; passing one raises TypeError.
    original_data = fetch_20newsgroups(
        data_home=str(Path('./')),
        subset='all',
        categories=categories,
        remove=('headers', 'footers', 'quotes'),
        download_if_missing=True,
    )

    # Binary bag-of-words over the most frequent words (presence/absence, not tf-idf).
    n_words = 200
    vectorizer = CountVectorizer(
        max_features=n_words,
        binary=True,
        analyzer="word",
        stop_words="english",
        strip_accents="ascii",
        # Ignores tokens containing digits and requires at least two letters.
        token_pattern=r'(?u)\b[A-Za-z][A-Za-z]+\b',
    )
    instances = vectorizer.fit_transform(original_data.data).toarray().astype(float)

    # Label from the newsgroup name rather than the file path: a "comp" or
    # "rec" substring anywhere in the data directory would otherwise decide
    # the label of every document.
    group_names = [original_data.target_names[index] for index in original_data.target]
    classes = np.asarray(
        [1.0 if pos in name else 0.0 if neg in name else np.nan for name in group_names]
    )

    # Discard documents that belong to neither group.
    labelled = ~np.isnan(classes)
    instances, classes = instances[labelled], classes[labelled]

    # Min-max scale every feature column to [-1, 1]. Features and labels are
    # kept in separate arrays so a vocabulary word "class" cannot clash with
    # the label. A constant column yields NaN, as 0/0 does.
    lowest = instances.min(axis=0)
    spread = instances.max(axis=0) - lowest
    with np.errstate(divide='ignore', invalid='ignore'):
        scaled = (instances - lowest) / spread * 2 - 1

    features = torch.Tensor(scaled)
    targets = torch.Tensor(classes).flatten()
    print(f'{features.shape}, p:{int(torch.sum(targets).item())}, n:{torch.sum(targets != 1).item()}')

    return features, targets

def get_minst_dataset():
    """Return MNIST as NumPy arrays with binary labels.

    When ``mnist_784.pkz`` exists in the working directory its contents are
    loaded and returned as-is. Otherwise ``mnist_784`` (version 1) is fetched
    from OpenML, converted, written to that file and returned.

    Returns:
        tuple: ``(features, targets)``. ``features`` has shape
        ``(n_samples, 1, 28, 28)`` and is the raw pixel data divided by 255.
        ``targets`` is 0 for the digits '0', '2', '4', '6', '8' and 1 otherwise.
    """
    cache_name = 'mnist_784.pkz'
    if (Path('./') / cache_name).exists():
        # NOTE(review): joblib's load unpickles the file; only use cache files
        # produced by this function.
        return load(cache_name)

    openml_options = dict(version=1, return_X_y=True, parser='auto', as_frame=False)
    pixels, digits = fetch_openml("mnist_784", **openml_options)

    # One channel of 28x28 pixels per sample, scaled by 1/255.
    n_samples = pixels.shape[0]
    features = np.reshape(pixels, (n_samples, 1, 28, 28)) / 255.

    # Binary label: listed (even) digits -> 0, all remaining digits -> 1.
    even_digits = ['0', '2', '4', '6', '8']
    targets = np.where(np.isin(digits, even_digits), 0, 1)

    dump((features, targets), cache_name)
    return features, targets

def get_cifar_10():
    """Return CIFAR-10 as NumPy arrays with binary labels.

    The positive set is formed by 'airplane', 'automobile', 'ship' and 'truck';
    the negative set by 'bird', 'cat', 'deer', 'dog', 'frog' and 'horse'. In
    the code this is expressed as class ids '2'..'7' -> 0, everything else -> 1.

    When ``CIFAR_10.pkz`` exists in the working directory its contents are
    loaded and returned as-is. Otherwise ``CIFAR_10`` (version 1) is fetched
    from OpenML, converted, written to that file and returned.

    Returns:
        tuple: ``(features, targets)``. ``features`` is float32 with shape
        ``(n_samples, 3, 32, 32)``; ``targets`` holds 0/1 labels.
    """
    cache_name = 'CIFAR_10.pkz'
    if (Path('./') / cache_name).exists():
        # NOTE(review): joblib's load unpickles the file; only use cache files
        # produced by this function.
        return load(cache_name)

    openml_options = dict(version=1, return_X_y=True, parser='auto', as_frame=False)
    images, class_ids = fetch_openml("CIFAR_10", **openml_options)

    # Three channels of 32x32 pixels per sample; pixel values are not rescaled.
    n_samples = np.shape(images)[0]
    features = np.reshape(images, (n_samples, 3, 32, 32)).astype(np.float32)

    negative_ids = ['2', '3', '4', '5', '6', '7']
    targets = np.where(np.isin(class_ids, negative_ids), 0, 1)

    dump((features, targets), cache_name)
    return features, targets

    



In [127]:
# Alternative loaders defined in this notebook; uncomment one to use a
# different dataset instead of MNIST.
# get_cifar_10()
# get_news_dataset_by_categories()
# get_uci_dataset_by_id(uci_id)  # needs a UCI dataset id, e.g. 15
# Load MNIST: read from mnist_784.pkz when that file exists, else fetch it.
features,targets = get_minst_dataset()

In [128]:
# Echo both arrays as the cell result.
# NOTE(review): the recorded output shows three channel planes per sample with
# values well above 1, which does not look like the (N, 1, 28, 28) / 255 arrays
# that get_minst_dataset builds — possibly stale output or a stale cache file.
# Confirm by re-running on a fresh kernel; `features.shape` is a lighter check.
features,targets

(array([[[[ 26.,  17.,  13., ...,  15.,  24.,  22.],
          [ 20.,  13.,  13., ...,  19.,  21.,  29.],
          [ 14.,  13.,  13., ...,  17.,  25.,  31.],
          ...,
          [ 90.,  34.,  28., ...,  23.,  16.,   9.],
          [ 79.,  58.,  32., ...,  14.,  16.,  10.],
          [128.,  58.,  25., ...,  13.,  12.,  13.]],
 
         [[ 23.,  14.,   9., ...,  14.,  24.,  21.],
          [ 17.,  10.,   9., ...,  17.,  20.,  29.],
          [ 11.,  10.,   9., ...,  16.,  24.,  31.],
          ...,
          [109.,  64.,  54., ...,  20.,  13.,   6.],
          [105.,  96.,  68., ...,  11.,  13.,   7.],
          [157.,  93.,  60., ...,  10.,   9.,  10.]],
 
         [[ 32.,  25.,  24., ...,  28.,  37.,  34.],
          [ 26.,  22.,  24., ...,  35.,  35.,  39.],
          [ 20.,  21.,  23., ...,  32.,  38.,  42.],
          ...,
          [137.,  95.,  90., ...,  37.,  30.,  23.],
          [141., 139., 110., ...,  28.,  30.,  24.],
          [196., 149., 106., ...,  27.,  26.,  2