In [13]:
import numpy as np
from pandas import read_csv as read

WINE_DATA_PATH = "./resources/winequality-red.csv"
CANCER_DATA_PATH = "./resources/breast-cancer-wisconsin.data"
# Column of the wine data that holds the quality rating.
QUALITY_INDEX = 11

# Load both files into NumPy arrays. The wine file is ';'-separated and read
# with pandas' default header handling; the cancer file is ','-separated and
# read with header=None, so its first line is kept as a data row.
wine_dataset = np.array(read(WINE_DATA_PATH, delimiter=";"))
cancer_dataset = np.array(read(CANCER_DATA_PATH, delimiter=",", header=None))

# Drop every wine row that contains at least one NaN.
#   np.isnan      - True where a cell is NaN, False elsewhere;
#   .any(axis=1)  - collapses each row to a single flag (logical OR);
#   ~             - inverts the flags, so only complete rows are kept.
# No separate "are there any NaNs?" check is needed: when nothing is missing
# the mask keeps every row, and the array is scanned only once.
wine_dataset = wine_dataset[~np.isnan(wine_dataset).any(axis=1)]

# Convert the quality rating to a binary label: 1 for ratings >= 6, 0 otherwise.
# A single comparison classifies every remaining value in one pass, so no
# rating can be left unconverted, and the result is written straight into the
# column slice instead of through a chained index.
wine_dataset[:, QUALITY_INDEX] = wine_dataset[:, QUALITY_INDEX] >= 6

# Drop every cancer row that contains the '?' placeholder (missing/malformed
# value). The comparison is evaluated once and reused for both the check and
# the row filter; the filter is skipped when no placeholder was found.
malformed_cells = cancer_dataset == '?'
if np.any(malformed_cells):
    cancer_dataset = cancer_dataset[~malformed_cells.any(axis=1)]
# The remaining cells may still be strings, so convert everything to int.
cancer_dataset = cancer_dataset.astype(int)

print(wine_dataset)
print(cancer_dataset)

[[ 7.4    0.7    0.    ...  0.56   9.4    0.   ]
 [ 7.8    0.88   0.    ...  0.68   9.8    0.   ]
 [ 7.8    0.76   0.04  ...  0.65   9.8    0.   ]
 ...
 [ 6.3    0.51   0.13  ...  0.75  11.     1.   ]
 [ 5.9    0.645  0.12  ...  0.71  10.2    0.   ]
 [ 6.     0.31   0.47  ...  0.66  11.     1.   ]]
[[1000025       5       1 ...       1       1       2]
 [1002945       5       4 ...       2       1       2]
 [1015425       3       1 ...       1       1       2]
 ...
 [ 888820       5      10 ...      10       2       4]
 [ 897471       4       8 ...       6       1       4]
 [ 897471       4       8 ...       4       1       4]]
