In [1]:
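# Uncomment the lines below to install the dependencies.
# The scikit-learn distribution is published on PyPI as "scikit-learn"; "sklearn" is only its import name.
# !pip install pandas
# !pip install scikit-learn
# !pip install xlrd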

In [2]:
import pandas as pd
from scipy.io import arff

In [None]:
def get_splitted_and_normalized_dataset(dataset_name):
    """Load a normalized dataset and split it into features and target.

    The frame comes from ``get_normalized_dataset(dataset_name)``. Every
    column except the last is returned as the feature frame, and the last
    column is returned on its own as the target.

    Returns
    -------
    tuple
        ``(features, target)`` in that order.
    """
    normalized = get_normalized_dataset(dataset_name)
    # Positional slicing: the final column is treated as the target.
    features, target = normalized.iloc[:, :-1], normalized.iloc[:, -1]
    return features, target

In [3]:
def get_normalized_dataset(dataset_name, random_state=None):
    """Load a dataset from ``datasets/``, min-max scale it and shuffle the rows.

    Parameters
    ----------
    dataset_name : str
        File name inside the ``datasets/`` directory. Names containing
        ``.csv`` are read with ``pd.read_csv``; names containing ``.arff``
        are read with ``scipy.io.arff.loadarff``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the shuffle can be made
        reproducible. The default (``None``) keeps the shuffle random.

    Returns
    -------
    pandas.DataFrame
        All rows in shuffled order (original index labels are kept).
        Columns that can be converted to float are rescaled to ``[0, 1]``
        with ``(x - min) / (max - min)``; a constant column becomes 0.0.
        Text columns are left untouched.

    Raises
    ------
    ValueError
        If ``dataset_name`` is neither a ``.csv`` nor an ``.arff`` name.
    """
    dataset_path = 'datasets/' + dataset_name
    if '.csv' in dataset_name:
        dataset = pd.read_csv(dataset_path)
    elif '.arff' in dataset_name:
        records, _meta = arff.loadarff(dataset_path)
        dataset = pd.DataFrame(records)
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            "Unsupported dataset format (expected .csv or .arff): " + repr(dataset_name)
        )

    for column in dataset.columns:
        values = dataset[column]
        if values.empty:
            # Nothing to inspect or scale in a dataset without rows.
            continue

        # The first value decides whether the column is a scaling candidate:
        # text without any digit is treated as a categorical label.
        first_val = values.iloc[0]
        if isinstance(first_val, str) and not any(char.isdigit() for char in first_val):
            continue

        try:
            numeric = values.astype(float)
        except (TypeError, ValueError):
            # The column looked numeric but is not (e.g. labels like "c1").
            # Keep it as text; bytes values (ARFF nominal attributes) are
            # decoded so labels compare equal to ordinary strings.
            if isinstance(first_val, bytes):
                dataset[column] = values.str.decode('utf-8')
            continue

        low = numeric.min()
        span = numeric.max() - low
        if span == 0:
            # Constant column: avoid 0/0 (which would turn every value into NaN).
            dataset[column] = numeric - low
        else:
            dataset[column] = (numeric - low) / span

    return dataset.sample(frac=1, random_state=random_state)

In [4]:
def get_breast_tissue_dataset_translated_to_each_class():
    """Build one-vs-rest versions of the breast tissue dataset, one per class.

    The normalized ``breast_tissue_data.csv`` is loaded once through
    ``get_normalized_dataset``. For every distinct value in its last column
    a separately shuffled copy is produced in which that column holds 1 for
    rows of the given class and 0 for all other rows.

    Returns
    -------
    dict
        Maps each distinct class label to its binary-labelled DataFrame.
    """
    base = get_normalized_dataset('breast_tissue_data.csv')
    class_col = base.columns[-1]

    datasets = {}
    for tissue_class in base[class_col].unique():
        # sample(frac=1) returns a new, reshuffled frame, so each entry has
        # its own row order and the shared base frame is never modified.
        binary = base.sample(frac=1)
        # One vectorized comparison replaces the two label-overwriting passes
        # and yields an integer 0/1 column.
        binary[class_col] = (binary[class_col] == tissue_class).astype(int)
        datasets[tissue_class] = binary

    return datasets

In [5]:
# Preview ten rows of the shuffled, min-max-normalized caesarian dataset.
get_normalized_dataset('caesarian.arff').head(10)

Unnamed: 0,Age,Delivery number,Delivery time,Blood of Pressure,Heart Problem,Caesarian
3,0.478261,0.0,0.0,1.0,0.0,0.0
17,0.130435,0.0,1.0,1.0,0.0,1.0
12,0.26087,0.0,0.5,0.5,0.0,0.0
40,0.434783,0.0,0.0,1.0,1.0,1.0
50,0.695652,0.666667,1.0,0.5,1.0,0.0
73,0.652174,0.666667,0.0,0.5,1.0,0.0
33,0.434783,0.333333,0.0,0.5,1.0,1.0
74,0.913043,0.666667,1.0,1.0,1.0,1.0
79,0.304348,0.333333,1.0,0.5,0.0,0.0
21,0.695652,0.333333,0.0,0.0,1.0,1.0


In [6]:
# Preview ten rows of the shuffled, min-max-normalized breast tissue dataset.
get_normalized_dataset('breast_tissue_data.csv').head(10)

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,Class
34,0.019281,0.463673,0.346405,0.018483,0.001976,0.05579,0.041734,0.038905,0.02159,fad
11,0.147201,0.62109,0.219608,0.191271,0.055897,0.265772,0.160931,0.219418,0.172301,car
97,1.0,0.204339,0.469281,0.539965,0.179567,0.321403,0.678798,0.517117,1.0,adi
56,0.018168,0.443491,0.550327,0.01396,0.000137,0.007149,0.054447,0.023444,0.020072,gla
14,0.141887,0.629667,0.375163,0.224418,0.046245,0.187407,0.132872,0.258133,0.150233,car
76,0.202818,0.082745,0.396078,0.188891,0.002048,0.002315,0.060253,0.22641,0.145663,con
22,0.0348,0.0222,0.294118,0.00857,6.7e-05,0.007842,0.0,0.037413,0.027338,fad
75,0.247312,0.084517,0.128741,0.148853,0.001581,0.002349,0.0403,0.184889,0.19116,con
16,0.061734,0.564077,0.196078,0.058214,0.010855,0.140475,0.085888,0.077113,0.073644,car
43,0.055247,0.270938,0.183007,0.018104,0.002427,0.069027,0.041044,0.038634,0.05617,mas


In [7]:
# Build the per-class binary datasets: a dict keyed by each class label.
breast_tissue_datasets = get_breast_tissue_dataset_translated_to_each_class()

In [8]:
# Show the binary dataset for 'car': Class is 1 for 'car' rows and 0 otherwise.
breast_tissue_datasets['car']

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,Class
63,0.030404,0.397074,0.283660,0.019394,0.001670,0.045992,0.044127,0.039268,0.030740,0
29,0.093437,0.149344,0.282353,0.066978,0.005524,0.061229,0.045766,0.097115,0.089140,0
3,0.102707,0.660444,0.660131,0.113042,0.030570,0.231744,0.188703,0.115986,0.133036,1
22,0.034800,0.022200,0.294118,0.008570,0.000067,0.007842,0.000000,0.037413,0.027338,0
52,0.066119,0.635721,0.943791,0.132437,0.030014,0.196987,0.089729,0.162316,0.098830,0
...,...,...,...,...,...,...,...,...,...,...
58,0.054505,0.215943,0.298039,0.009571,0.000633,0.027717,0.042448,0.023531,0.055976,0
95,0.647757,0.176085,0.546405,0.292724,0.049163,0.153799,0.468948,0.262068,0.643381,0
51,0.063772,0.389506,0.381699,0.044846,0.006576,0.102925,0.076801,0.062503,0.073091,0
76,0.202818,0.082745,0.396078,0.188891,0.002048,0.002315,0.060253,0.226410,0.145663,0


In [9]:
# Show the binary dataset for 'fad': Class is 1 for 'fad' rows and 0 otherwise.
breast_tissue_datasets['fad']

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,Class
98,0.825673,0.155399,0.785621,0.342602,0.145052,0.404066,0.766369,0.183059,0.924179,0
105,0.925844,0.166498,0.215686,0.695374,0.228057,0.319152,0.341377,0.748499,0.873300,0
28,0.035966,0.073158,0.343791,0.021718,0.000862,0.022295,0.006323,0.050877,0.033575,1
79,0.516773,0.264379,0.278431,0.594589,0.063079,0.096610,0.234317,0.648869,0.380031,0
24,0.092568,0.316852,0.294118,0.046829,0.005709,0.085935,0.083440,0.062880,0.092998,1
...,...,...,...,...,...,...,...,...,...,...
85,0.629218,0.063068,0.203922,0.269606,0.024859,0.080256,0.139342,0.306670,0.583560,0
62,0.027809,0.396569,0.309804,0.019777,0.002316,0.062648,0.041931,0.040639,0.030380,0
101,0.703374,0.273461,0.321569,0.479573,0.229445,0.464459,0.458088,0.494295,0.708496,0
47,0.099146,0.266902,0.124183,0.092236,0.007096,0.059631,0.054652,0.123603,0.086953,0


In [10]:
# Show the binary dataset for 'mas': Class is 1 for 'mas' rows and 0 otherwise.
breast_tissue_datasets['mas']

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,Class
12,0.124501,0.578204,0.237908,0.097006,0.027573,0.238580,0.170095,0.100527,0.144480,0
85,0.629218,0.063068,0.203922,0.269606,0.024859,0.080256,0.139342,0.306670,0.583560,0
90,0.647757,0.192982,0.254260,0.224157,0.074782,0.308403,0.355257,0.208741,0.646559,0
88,0.592139,0.091322,0.321024,0.096768,0.070298,0.619214,0.262370,0.000000,0.753066,0
22,0.034800,0.022200,0.294118,0.008570,0.000067,0.007842,0.000000,0.037413,0.027338,0
...,...,...,...,...,...,...,...,...,...,...
28,0.035966,0.073158,0.343791,0.021718,0.000862,0.022295,0.006323,0.050877,0.033575,0
73,0.202584,0.274975,0.159477,0.179599,0.018772,0.089565,0.099453,0.212915,0.180015,0
29,0.093437,0.149344,0.282353,0.066978,0.005524,0.061229,0.045766,0.097115,0.089140,0
0,0.156394,0.506054,0.184314,0.200377,0.038835,0.174272,0.122010,0.233069,0.155812,0


In [11]:
# Show the binary dataset for 'gla': Class is 1 for 'gla' rows and 0 otherwise.
breast_tissue_datasets['gla']

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,Class
86,0.740452,0.315843,0.831373,0.412825,0.204123,0.477471,1.000000,0.124093,0.843004,0
92,0.629218,0.230071,0.508497,0.328816,0.085724,0.244969,0.490189,0.303464,0.638145,0
90,0.647757,0.192982,0.254260,0.224157,0.074782,0.308403,0.355257,0.208741,0.646559,0
24,0.092568,0.316852,0.294118,0.046829,0.005709,0.085935,0.083440,0.062880,0.092998,0
37,0.034112,0.366297,0.508497,0.017067,0.001479,0.044130,0.063193,0.022854,0.038826,0
...,...,...,...,...,...,...,...,...,...,...
23,0.052651,0.510595,0.277124,0.041440,0.006683,0.111114,0.079843,0.056696,0.060397,0
35,0.015202,0.312815,0.210458,0.000000,0.000000,0.012240,0.023736,0.017052,0.012771,0
56,0.018168,0.443491,0.550327,0.013960,0.000137,0.007149,0.054447,0.023444,0.020072,1
63,0.030404,0.397074,0.283660,0.019394,0.001670,0.045992,0.044127,0.039268,0.030740,1


In [12]:
# Show the binary dataset for 'con': Class is 1 for 'con' rows and 0 otherwise.
breast_tissue_datasets['con']

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,Class
35,0.015202,0.312815,0.210458,0.000000,0.000000,0.012240,0.023736,0.017052,0.012771,0
54,0.136269,0.332492,0.248366,0.125098,0.014836,0.099075,0.092476,0.153783,0.132232,0
92,0.629218,0.230071,0.508497,0.328816,0.085724,0.244969,0.490189,0.303464,0.638145,0
13,0.097865,0.773966,0.596078,0.146674,0.040103,0.241891,0.157979,0.166780,0.125057,0
47,0.099146,0.266902,0.124183,0.092236,0.007096,0.059631,0.054652,0.123603,0.086953,0
...,...,...,...,...,...,...,...,...,...,...
66,0.027067,0.224016,0.267974,0.000901,0.000053,0.014006,0.023959,0.019085,0.024179,0
70,0.601072,0.116549,0.084967,0.368347,0.017106,0.036690,0.148223,0.413911,0.492281,1
52,0.066119,0.635721,0.943791,0.132437,0.030014,0.196987,0.089729,0.162316,0.098830,0
88,0.592139,0.091322,0.321024,0.096768,0.070298,0.619214,0.262370,0.000000,0.753066,0


In [13]:
# Show the binary dataset for 'adi': Class is 1 for 'adi' rows and 0 otherwise.
breast_tissue_datasets['adi']

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,Class
21,0.040044,0.120081,0.300654,0.010640,0.000468,0.020596,0.014714,0.036989,0.033249,0
47,0.099146,0.266902,0.124183,0.092236,0.007096,0.059631,0.054652,0.123603,0.086953,0
17,0.073044,0.513623,0.436601,0.074211,0.017024,0.182828,0.101336,0.092901,0.094566,0
96,0.573600,0.101917,0.205229,0.244089,0.032994,0.120818,0.171141,0.275011,0.533298,1
65,0.147942,0.152876,0.176471,0.032182,0.004380,0.086625,0.059240,0.051451,0.151198,0
...,...,...,...,...,...,...,...,...,...,...
80,0.374050,0.249748,0.257516,0.351927,0.043514,0.112002,0.182319,0.391729,0.312455,0
32,0.021254,0.475782,0.430065,0.016832,0.001556,0.046717,0.053536,0.030419,0.022582,0
1,0.084168,0.620081,0.620915,0.097248,0.017733,0.150875,0.144228,0.109791,0.099310,0
18,0.082314,0.613522,0.661438,0.200775,0.032308,0.143375,0.064546,0.239684,0.121852,0
