In [24]:
pip install ucimlrepo



In [25]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# data (as pandas dataframes)
X = breast_cancer_wisconsin_diagnostic.data.features
y = breast_cancer_wisconsin_diagnostic.data.targets

# metadata
print(breast_cancer_wisconsin_diagnostic.metadata)

# variable information
print(breast_cancer_wisconsin_diagnostic.variables)

{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'published_in': 'Electronic imaging', 'year': 1993, 'url': 'https://www.semanticscholar.org/paper/53

In [26]:
breast_cancer_wisconsin_diagnostic.data

{'ids':            ID
 0      842302
 1      842517
 2    84300903
 3    84348301
 4    84358402
 ..        ...
 564    926424
 565    926682
 566    926954
 567    927241
 568     92751
 
 [569 rows x 1 columns],
 'features':      radius1  texture1  perimeter1   area1  smoothness1  compactness1  \
 0      17.99     10.38      122.80  1001.0      0.11840       0.27760   
 1      20.57     17.77      132.90  1326.0      0.08474       0.07864   
 2      19.69     21.25      130.00  1203.0      0.10960       0.15990   
 3      11.42     20.38       77.58   386.1      0.14250       0.28390   
 4      20.29     14.34      135.10  1297.0      0.10030       0.13280   
 ..       ...       ...         ...     ...          ...           ...   
 564    21.56     22.39      142.00  1479.0      0.11100       0.11590   
 565    20.13     28.25      131.20  1261.0      0.09780       0.10340   
 566    16.60     28.08      108.30   858.1      0.08455       0.10230   
 567    20.60     29.33      140.1

In [27]:
breast_cancer_wisconsin_diagnostic.data.targets.value_counts()

Diagnosis
B            357
M            212
dtype: int64

In [19]:
import pandas as pd
import numpy as np

# Read the dataset into a new list and shuffle it with random
np.random.seed(17)

data = breast_cancer_wisconsin_diagnostic.data.original

data_shuffled = data.sample(frac=1, random_state=17).reset_index(drop=True)

# 将数据集分成5个部分进行交叉折叠验证
# k = 5
# fold_size = len(data_shuffled) // k
# folds = [data_shuffled.iloc[i * fold_size:(i + 1) * fold_size] for i in range(k)]


# data_shuffled.head(), [fold.shape for fold in folds]

subsets = np.array_split(data_shuffled, 5)

subset_shapes = [subset.shape for subset in subsets]

subsets



[          ID  radius1  texture1  perimeter1   area1  smoothness1  \
 0    9010258   12.560     19.07       81.92   485.8      0.08760   
 1     905520   11.040     16.83       70.92   373.2      0.10770   
 2     874662   11.810     17.39       75.27   428.9      0.10070   
 3     901028   13.870     16.21       88.52   593.7      0.08743   
 4     871641   11.080     14.71       70.21   372.7      0.10060   
 ..       ...      ...       ...         ...     ...          ...   
 109   897630   18.770     21.43      122.90  1092.0      0.09116   
 110   911654   14.200     20.53       92.41   618.4      0.08931   
 111   924934   10.290     27.61       65.67   321.4      0.09030   
 112   862261    9.787     19.94       62.11   294.5      0.10240   
 113   906539   11.570     19.04       74.20   409.7      0.08546   
 
      compactness1  concavity1  concave_points1  symmetry1  ...  texture3  \
 0         0.10380    0.103000         0.043910     0.1533  ...     22.43   
 1         0.078

In [20]:
data_shuffled

Unnamed: 0,ID,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Diagnosis
0,9010258,12.56,19.07,81.92,485.8,0.08760,0.10380,0.10300,0.043910,0.1533,...,22.43,89.02,547.4,0.1096,0.20020,0.23880,0.09265,0.2121,0.07188,B
1,905520,11.04,16.83,70.92,373.2,0.10770,0.07804,0.03046,0.024800,0.1714,...,26.44,79.93,471.4,0.1369,0.14820,0.10670,0.07431,0.2998,0.07881,B
2,874662,11.81,17.39,75.27,428.9,0.10070,0.05562,0.02353,0.015530,0.1718,...,26.48,79.57,489.5,0.1356,0.10000,0.08803,0.04306,0.3200,0.06576,B
3,901028,13.87,16.21,88.52,593.7,0.08743,0.05492,0.01502,0.020880,0.1424,...,25.58,96.74,694.4,0.1153,0.10080,0.05285,0.05556,0.2362,0.07113,B
4,871641,11.08,14.71,70.21,372.7,0.10060,0.05743,0.02363,0.025830,0.1566,...,16.82,72.01,396.5,0.1216,0.08240,0.03938,0.04306,0.1902,0.07313,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,893988,11.54,10.72,73.73,409.1,0.08597,0.05969,0.01367,0.008907,0.1833,...,12.87,81.23,467.8,0.1092,0.16260,0.08324,0.04715,0.3390,0.07434,B
565,905189,16.14,14.86,104.30,800.0,0.09495,0.08501,0.05500,0.045280,0.1735,...,19.58,115.90,947.9,0.1206,0.17220,0.23100,0.11290,0.2778,0.07012,B
566,90317302,10.26,12.22,65.75,321.6,0.09996,0.07542,0.01923,0.019680,0.1800,...,15.65,73.23,394.5,0.1343,0.16500,0.08615,0.06696,0.2937,0.07722,B
567,869224,12.90,15.92,83.74,512.2,0.08677,0.09509,0.04894,0.030880,0.1778,...,21.82,97.17,643.8,0.1312,0.25480,0.20900,0.10120,0.3549,0.08118,B


In [28]:
import pandas as pd

df = pd.DataFrame(data_shuffled)
df['ID'] = data_shuffled['ID']

# Export the DataFrame to a CSV file
csv_filename = "data.csv"
df.to_csv(csv_filename, index=False)  # Set index=False to not write row indices

csv_filename

'original_data.csv'

In [11]:
# Assuming the last column of data_shuffled represents the class labels
# and that there are 10 unique classes (0-9).

# Step 1: Augment input vectors with a bias term
# Adding 1 at the end of each feature vector
data_with_bias = [row + [1] for row in data_shuffled.values]

# Step 2: Scale attribute values linearly to the range [-1, 1]
# Ignoring the last column which is assumed to be the label
min_values = [min(column) for column in zip(*data_with_bias[:-1])]
max_values = [max(column) for column in zip(*data_with_bias[:-1])]
range_values = [max_val - min_val for max_val, min_val in zip(max_values, min_values)]

# Adding a small constant to the range to avoid division by zero
epsilon = 1e-6
range_values = [x if x > epsilon else epsilon for x in range_values]

# Scaling the attributes excluding the last column which is the label
scaled_data = []
for row in data_with_bias:
    scaled_row = [
        2 * (x - min_val) / (range_val + epsilon) - 1
        if i < len(row) - 1 else x
        for i, (x, min_val, range_val) in enumerate(zip(row, min_values, range_values))
    ]
    scaled_data.append(scaled_row)

# Step 3: Convert labels into one-hot encoded vectors
# Assuming that the last element of each row is the class label
num_classes = 10  # Number of unique classes
one_hot_encoded_data = []
for row in scaled_data:
    # Extract the label (assumed to be the last element)
    label = int(row[-1])
    # One-hot encode the label
    one_hot_label = [1 if i == label else 0 for i in range(num_classes)]
    # Replace the label with its one-hot encoded vector
    new_row = row[:-1] + one_hot_label
    one_hot_encoded_data.append(new_row)

# Verify the transformation by checking the length and some rows
len_one_hot_encoded_data = len(one_hot_encoded_data)
sample_row = one_hot_encoded_data[0]

len_one_hot_encoded_data, sample_row[:5], sample_row[-10:]  # Show first 5 and last 10 elements of the sample row


TypeError: ignored