In [1]:
# !pip install pandas
# !pip install scikit-learn
# !pip install xlrd

In [2]:
import math
import pandas as pd
from scipy.io import arff

In [3]:
# File names of the datasets used in this notebook. get_normalized_dataset()
# prefixes them with 'datasets/' and picks the reader from the extension.
BREAST_TISSUE = 'breast_tissue_data.csv'
CAESARIAN = 'caesarian.arff'

In [4]:
def fill_database_in_order_to_have_pow_2_columns(database):
    """Pad a DataFrame with constant columns until its width is a power of two.

    Columns named ``cte0``, ``cte1``, ... (each filled with the integer 1) are
    appended until the number of columns reaches the next power of two. A frame
    whose width already is a power of two gets no extra columns.

    Parameters
    ----------
    database : pandas.DataFrame
        Frame to pad. It is not modified; the padding is applied to a copy, so
        passing a slice of another frame is safe.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the padding
        columns. A frame without any columns is returned as an unpadded copy.
    """
    padded = database.copy()
    num_of_attr_columns = len(padded.columns)
    if num_of_attr_columns == 0:
        # Nothing to pad (and log2(0) is undefined).
        return padded

    # (n - 1).bit_length() equals ceil(log2(n)) for n >= 1 and is computed in
    # exact integer arithmetic, unlike math.log(n, 2), which can land slightly
    # above an integer and make ceil() overshoot by one bit.
    num_of_bits_needed_for_columns = (num_of_attr_columns - 1).bit_length()
    num_of_constants_columns = (1 << num_of_bits_needed_for_columns) - num_of_attr_columns

    for i in range(num_of_constants_columns):
        padded["cte" + str(i)] = 1
    return padded

In [5]:
def get_normalized_dataset(dataset_name, rename_breast_tissue = True):
    """Load a dataset from ``datasets/`` and min-max scale its numeric columns to [0, 1].

    A column is treated as numeric when its first value is not a ``str``, or is
    a ``str`` that contains at least one digit. Such columns are cast to float
    and rescaled with ``(x - min) / (max - min)``. A column whose values are all
    equal has no range to scale by and is mapped to 0.0 instead of dividing by
    zero. All other columns are left untouched.

    For the breast tissue dataset the labels in the last column can additionally
    be replaced by integer codes:
    'car' = 0, 'fad' = 1, 'mas' = 2, 'gla' = 3, 'con' = 4, 'adi' = 5.
    Labels outside this table are kept as they are.

    Parameters
    ----------
    dataset_name : str
        File name inside the ``datasets/`` folder. Names containing '.csv' are
        read with ``pandas.read_csv``; names containing '.arff' are read with
        ``scipy.io.arff.loadarff``.
    rename_breast_tissue : bool, default True
        When True and ``dataset_name`` contains 'breast_tissue', replace the
        class labels in the last column by the integer codes listed above.

    Returns
    -------
    pandas.DataFrame
        The loaded and normalized dataset.

    Raises
    ------
    ValueError
        If ``dataset_name`` contains neither '.csv' nor '.arff'.
    """
    dataset_path = 'datasets/' + dataset_name

    if '.csv' in dataset_name:
        dataset = pd.read_csv(dataset_path)
    elif '.arff' in dataset_name:
        data = arff.loadarff(dataset_path)
        dataset = pd.DataFrame(data[0])
    else:
        raise ValueError(
            "Unsupported dataset format for '" + str(dataset_name) + "': expected a .csv or .arff file"
        )

    # The numeric check below inspects the first row, so there is nothing to do
    # (and nothing to inspect) when the dataset has no rows.
    if len(dataset) > 0:
        for column_name in dataset.columns:
            first_val = dataset[column_name].iloc[0]
            looks_numeric = (
                not isinstance(first_val, str)
                or any(char.isdigit() for char in first_val)
            )
            if not looks_numeric:
                continue

            values = dataset[column_name].astype(float)
            lowest = values.min()
            value_range = values.max() - lowest
            if value_range == 0:
                # Constant column: 0/0 would turn every entry into NaN, so pin
                # the values to 0.0 (missing entries stay missing).
                dataset[column_name] = values - lowest
            else:
                dataset[column_name] = (values - lowest) / value_range

    if 'breast_tissue' in dataset_name and rename_breast_tissue:
        # Read and write the same (last) column instead of assuming that it is
        # reachable as the attribute `dataset.Class`.
        class_col = dataset.columns[-1]
        class_codes = {'car': 0, 'fad': 1, 'mas': 2, 'gla': 3, 'con': 4, 'adi': 5}
        dataset[class_col] = dataset[class_col].map(
            lambda label: class_codes.get(label, label)
        )

    return dataset

In [6]:
def get_splitted_and_normalized_dataset(dataset_name, rename_breast_tissue = True):
    """Load and normalize a dataset, then split it into padded features and labels.

    Parameters
    ----------
    dataset_name : str
        File name forwarded to ``get_normalized_dataset``.
    rename_breast_tissue : bool, default True
        Forwarded to ``get_normalized_dataset``.

    Returns
    -------
    tuple
        ``(dataset_x, dataset_y)``: every column but the last one, padded by
        ``fill_database_in_order_to_have_pow_2_columns``, and the last column
        as a Series.
    """
    dataset = get_normalized_dataset(dataset_name, rename_breast_tissue)
    # Take an explicit copy of the feature slice so the padding columns are
    # never written into a slice of `dataset` (avoids SettingWithCopyWarning).
    dataset_x = dataset.iloc[:, :-1].copy()
    dataset_x = fill_database_in_order_to_have_pow_2_columns(dataset_x)
    dataset_y = dataset.iloc[:, -1]
    return dataset_x, dataset_y

In [7]:
def get_breast_tissue_dataset_translated_to_each_class():
    """Build one binary ("belongs to class X" vs. "does not") copy of the breast tissue dataset per class.

    The dataset is loaded once with its original string labels, its feature
    columns are padded by ``fill_database_in_order_to_have_pow_2_columns``, and
    for every distinct label a copy is created whose last column is 1 for rows
    of that label and 0 for all other rows. This turns the multi-class problem
    into several smaller binary classification problems.

    Returns
    -------
    tuple
        ``(datasets, original_dataset)`` where ``datasets`` maps each class
        label to its binary frame and ``original_dataset`` is the padded frame
        that still carries the original labels.
    """
    # One load is enough: the labels and the features come from the same file.
    normalized = get_normalized_dataset(BREAST_TISSUE, False)
    class_col = normalized.columns[-1]
    diff_classes = normalized[class_col].unique()

    # Copy the feature slice so padding never writes into a slice of `normalized`.
    dataset_x = normalized.iloc[:, :-1].copy()
    dataset_x = fill_database_in_order_to_have_pow_2_columns(dataset_x)
    dataset_y = normalized.iloc[:, -1]
    original_dataset = pd.concat([dataset_x, dataset_y], axis=1)

    datasets = {}
    for tissue_class in diff_classes:
        dataset = original_dataset.copy()
        # A single vectorized comparison yields the 0/1 labels as an integer column.
        dataset[class_col] = (original_dataset[class_col] == tissue_class).astype(int)
        datasets[tissue_class] = dataset

    return datasets, original_dataset

In [8]:
# Load the caesarian dataset as padded features `x` and labels `y`, then show
# the first 10 rows of both side by side.
x, y = get_splitted_and_normalized_dataset(CAESARIAN)
pd.concat([x, y], axis=1).head(10)

Unnamed: 0,Age,Delivery number,Delivery time,Blood of Pressure,Heart Problem,cte0,cte1,cte2,Caesarian
0,0.217391,0.0,0.0,1.0,0.0,1,1,1,0.0
1,0.391304,0.333333,0.0,0.5,0.0,1,1,1,1.0
2,0.391304,0.333333,0.5,0.5,0.0,1,1,1,0.0
3,0.478261,0.0,0.0,1.0,0.0,1,1,1,0.0
4,0.217391,0.333333,0.0,0.5,0.0,1,1,1,1.0
5,0.391304,0.0,0.5,0.0,0.0,1,1,1,0.0
6,0.434783,0.333333,0.0,0.5,0.0,1,1,1,0.0
7,0.652174,0.666667,0.0,0.5,0.0,1,1,1,1.0
8,0.478261,0.333333,0.0,0.5,0.0,1,1,1,0.0
9,0.434783,0.0,0.5,0.5,0.0,1,1,1,1.0


In [9]:
# Same preview for the breast tissue dataset; rename_breast_tissue keeps its
# default (True), so the class labels are shown as integer codes.
x, y = get_splitted_and_normalized_dataset(BREAST_TISSUE)
pd.concat([x, y], axis=1).head(10)

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,cte0,cte1,cte2,cte3,cte4,cte5,cte6,Class
0,0.156394,0.506054,0.184314,0.200377,0.038835,0.174272,0.12201,0.233069,0.155812,1,1,1,1,1,1,1,0
1,0.084168,0.620081,0.620915,0.097248,0.017733,0.150875,0.144228,0.109791,0.09931,1,1,1,1,1,1,1,0
2,0.166437,0.636226,0.243137,0.234871,0.06776,0.266496,0.163092,0.266559,0.191871,1,1,1,1,1,1,1,0
3,0.102707,0.660444,0.660131,0.113042,0.03057,0.231744,0.188703,0.115986,0.133036,1,1,1,1,1,1,1,0
4,0.096341,0.5444,0.581699,0.100848,0.018462,0.152308,0.143462,0.114636,0.108175,1,1,1,1,1,1,1,0
5,0.106367,0.398083,0.30719,0.094825,0.01379,0.11862,0.097607,0.118507,0.109831,1,1,1,1,1,1,1,0
6,0.069505,0.380928,0.223529,0.05268,0.006417,0.088274,0.064781,0.075799,0.074069,1,1,1,1,1,1,1,0
7,0.064026,0.409183,0.475817,0.068864,0.009666,0.108276,0.073194,0.093145,0.074545,1,1,1,1,1,1,1,0
8,0.136077,0.580222,0.546405,0.158022,0.046528,0.263102,0.178716,0.175698,0.172585,1,1,1,1,1,1,1,0
9,0.11865,0.59889,0.614379,0.146316,0.034618,0.208277,0.166042,0.164602,0.156334,1,1,1,1,1,1,1,0


In [10]:
# Build one binary frame per tissue class, plus the padded frame that keeps the
# original string labels.
breast_tissue_datasets, original_dataset = get_breast_tissue_dataset_translated_to_each_class()

In [11]:
# First 10 rows of the padded frame that still carries the original class labels.
original_dataset.head(10)

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,cte0,cte1,cte2,cte3,cte4,cte5,cte6,Class
0,0.156394,0.506054,0.184314,0.200377,0.038835,0.174272,0.12201,0.233069,0.155812,1,1,1,1,1,1,1,car
1,0.084168,0.620081,0.620915,0.097248,0.017733,0.150875,0.144228,0.109791,0.09931,1,1,1,1,1,1,1,car
2,0.166437,0.636226,0.243137,0.234871,0.06776,0.266496,0.163092,0.266559,0.191871,1,1,1,1,1,1,1,car
3,0.102707,0.660444,0.660131,0.113042,0.03057,0.231744,0.188703,0.115986,0.133036,1,1,1,1,1,1,1,car
4,0.096341,0.5444,0.581699,0.100848,0.018462,0.152308,0.143462,0.114636,0.108175,1,1,1,1,1,1,1,car
5,0.106367,0.398083,0.30719,0.094825,0.01379,0.11862,0.097607,0.118507,0.109831,1,1,1,1,1,1,1,car
6,0.069505,0.380928,0.223529,0.05268,0.006417,0.088274,0.064781,0.075799,0.074069,1,1,1,1,1,1,1,car
7,0.064026,0.409183,0.475817,0.068864,0.009666,0.108276,0.073194,0.093145,0.074545,1,1,1,1,1,1,1,car
8,0.136077,0.580222,0.546405,0.158022,0.046528,0.263102,0.178716,0.175698,0.172585,1,1,1,1,1,1,1,car
9,0.11865,0.59889,0.614379,0.146316,0.034618,0.208277,0.166042,0.164602,0.156334,1,1,1,1,1,1,1,car


In [12]:
# Binary frame for 'car': Class is 1 for 'car' rows and 0 for every other row.
breast_tissue_datasets['car']

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,cte0,cte1,cte2,cte3,cte4,cte5,cte6,Class
0,0.156394,0.506054,0.184314,0.200377,0.038835,0.174272,0.122010,0.233069,0.155812,1,1,1,1,1,1,1,1
1,0.084168,0.620081,0.620915,0.097248,0.017733,0.150875,0.144228,0.109791,0.099310,1,1,1,1,1,1,1,1
2,0.166437,0.636226,0.243137,0.234871,0.067760,0.266496,0.163092,0.266559,0.191871,1,1,1,1,1,1,1,1
3,0.102707,0.660444,0.660131,0.113042,0.030570,0.231744,0.188703,0.115986,0.133036,1,1,1,1,1,1,1,1
4,0.096341,0.544400,0.581699,0.100848,0.018462,0.152308,0.143462,0.114636,0.108175,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,0.703374,0.273461,0.321569,0.479573,0.229445,0.464459,0.458088,0.494295,0.708496,1,1,1,1,1,1,1,0
102,0.925844,0.543895,0.513725,1.000000,1.000000,1.000000,0.959329,1.000000,0.916294,1,1,1,1,1,1,1,0
103,0.555061,0.172048,0.000000,0.399788,0.072157,0.168441,0.223679,0.447287,0.487224,1,1,1,1,1,1,1,0
104,0.814609,0.094349,0.380392,0.158842,0.028759,0.158987,0.398763,0.059638,0.849910,1,1,1,1,1,1,1,0


In [13]:
# Binary frame for 'fad': Class is 1 for 'fad' rows and 0 for every other row.
breast_tissue_datasets['fad']

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,cte0,cte1,cte2,cte3,cte4,cte5,cte6,Class
0,0.156394,0.506054,0.184314,0.200377,0.038835,0.174272,0.122010,0.233069,0.155812,1,1,1,1,1,1,1,0
1,0.084168,0.620081,0.620915,0.097248,0.017733,0.150875,0.144228,0.109791,0.099310,1,1,1,1,1,1,1,0
2,0.166437,0.636226,0.243137,0.234871,0.067760,0.266496,0.163092,0.266559,0.191871,1,1,1,1,1,1,1,0
3,0.102707,0.660444,0.660131,0.113042,0.030570,0.231744,0.188703,0.115986,0.133036,1,1,1,1,1,1,1,0
4,0.096341,0.544400,0.581699,0.100848,0.018462,0.152308,0.143462,0.114636,0.108175,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,0.703374,0.273461,0.321569,0.479573,0.229445,0.464459,0.458088,0.494295,0.708496,1,1,1,1,1,1,1,0
102,0.925844,0.543895,0.513725,1.000000,1.000000,1.000000,0.959329,1.000000,0.916294,1,1,1,1,1,1,1,0
103,0.555061,0.172048,0.000000,0.399788,0.072157,0.168441,0.223679,0.447287,0.487224,1,1,1,1,1,1,1,0
104,0.814609,0.094349,0.380392,0.158842,0.028759,0.158987,0.398763,0.059638,0.849910,1,1,1,1,1,1,1,0


In [14]:
# Binary frame for 'mas': Class is 1 for 'mas' rows and 0 for every other row.
breast_tissue_datasets['mas']

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,cte0,cte1,cte2,cte3,cte4,cte5,cte6,Class
0,0.156394,0.506054,0.184314,0.200377,0.038835,0.174272,0.122010,0.233069,0.155812,1,1,1,1,1,1,1,0
1,0.084168,0.620081,0.620915,0.097248,0.017733,0.150875,0.144228,0.109791,0.099310,1,1,1,1,1,1,1,0
2,0.166437,0.636226,0.243137,0.234871,0.067760,0.266496,0.163092,0.266559,0.191871,1,1,1,1,1,1,1,0
3,0.102707,0.660444,0.660131,0.113042,0.030570,0.231744,0.188703,0.115986,0.133036,1,1,1,1,1,1,1,0
4,0.096341,0.544400,0.581699,0.100848,0.018462,0.152308,0.143462,0.114636,0.108175,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,0.703374,0.273461,0.321569,0.479573,0.229445,0.464459,0.458088,0.494295,0.708496,1,1,1,1,1,1,1,0
102,0.925844,0.543895,0.513725,1.000000,1.000000,1.000000,0.959329,1.000000,0.916294,1,1,1,1,1,1,1,0
103,0.555061,0.172048,0.000000,0.399788,0.072157,0.168441,0.223679,0.447287,0.487224,1,1,1,1,1,1,1,0
104,0.814609,0.094349,0.380392,0.158842,0.028759,0.158987,0.398763,0.059638,0.849910,1,1,1,1,1,1,1,0


In [15]:
# Binary frame for 'gla': Class is 1 for 'gla' rows and 0 for every other row.
breast_tissue_datasets['gla']

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,cte0,cte1,cte2,cte3,cte4,cte5,cte6,Class
0,0.156394,0.506054,0.184314,0.200377,0.038835,0.174272,0.122010,0.233069,0.155812,1,1,1,1,1,1,1,0
1,0.084168,0.620081,0.620915,0.097248,0.017733,0.150875,0.144228,0.109791,0.099310,1,1,1,1,1,1,1,0
2,0.166437,0.636226,0.243137,0.234871,0.067760,0.266496,0.163092,0.266559,0.191871,1,1,1,1,1,1,1,0
3,0.102707,0.660444,0.660131,0.113042,0.030570,0.231744,0.188703,0.115986,0.133036,1,1,1,1,1,1,1,0
4,0.096341,0.544400,0.581699,0.100848,0.018462,0.152308,0.143462,0.114636,0.108175,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,0.703374,0.273461,0.321569,0.479573,0.229445,0.464459,0.458088,0.494295,0.708496,1,1,1,1,1,1,1,0
102,0.925844,0.543895,0.513725,1.000000,1.000000,1.000000,0.959329,1.000000,0.916294,1,1,1,1,1,1,1,0
103,0.555061,0.172048,0.000000,0.399788,0.072157,0.168441,0.223679,0.447287,0.487224,1,1,1,1,1,1,1,0
104,0.814609,0.094349,0.380392,0.158842,0.028759,0.158987,0.398763,0.059638,0.849910,1,1,1,1,1,1,1,0


In [16]:
# Binary frame for 'con': Class is 1 for 'con' rows and 0 for every other row.
breast_tissue_datasets['con']

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,cte0,cte1,cte2,cte3,cte4,cte5,cte6,Class
0,0.156394,0.506054,0.184314,0.200377,0.038835,0.174272,0.122010,0.233069,0.155812,1,1,1,1,1,1,1,0
1,0.084168,0.620081,0.620915,0.097248,0.017733,0.150875,0.144228,0.109791,0.099310,1,1,1,1,1,1,1,0
2,0.166437,0.636226,0.243137,0.234871,0.067760,0.266496,0.163092,0.266559,0.191871,1,1,1,1,1,1,1,0
3,0.102707,0.660444,0.660131,0.113042,0.030570,0.231744,0.188703,0.115986,0.133036,1,1,1,1,1,1,1,0
4,0.096341,0.544400,0.581699,0.100848,0.018462,0.152308,0.143462,0.114636,0.108175,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,0.703374,0.273461,0.321569,0.479573,0.229445,0.464459,0.458088,0.494295,0.708496,1,1,1,1,1,1,1,0
102,0.925844,0.543895,0.513725,1.000000,1.000000,1.000000,0.959329,1.000000,0.916294,1,1,1,1,1,1,1,0
103,0.555061,0.172048,0.000000,0.399788,0.072157,0.168441,0.223679,0.447287,0.487224,1,1,1,1,1,1,1,0
104,0.814609,0.094349,0.380392,0.158842,0.028759,0.158987,0.398763,0.059638,0.849910,1,1,1,1,1,1,1,0


In [17]:
# Binary frame for 'adi': Class is 1 for 'adi' rows and 0 for every other row.
breast_tissue_datasets['adi']

Unnamed: 0,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P,cte0,cte1,cte2,cte3,cte4,cte5,cte6,Class
0,0.156394,0.506054,0.184314,0.200377,0.038835,0.174272,0.122010,0.233069,0.155812,1,1,1,1,1,1,1,0
1,0.084168,0.620081,0.620915,0.097248,0.017733,0.150875,0.144228,0.109791,0.099310,1,1,1,1,1,1,1,0
2,0.166437,0.636226,0.243137,0.234871,0.067760,0.266496,0.163092,0.266559,0.191871,1,1,1,1,1,1,1,0
3,0.102707,0.660444,0.660131,0.113042,0.030570,0.231744,0.188703,0.115986,0.133036,1,1,1,1,1,1,1,0
4,0.096341,0.544400,0.581699,0.100848,0.018462,0.152308,0.143462,0.114636,0.108175,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,0.703374,0.273461,0.321569,0.479573,0.229445,0.464459,0.458088,0.494295,0.708496,1,1,1,1,1,1,1,1
102,0.925844,0.543895,0.513725,1.000000,1.000000,1.000000,0.959329,1.000000,0.916294,1,1,1,1,1,1,1,1
103,0.555061,0.172048,0.000000,0.399788,0.072157,0.168441,0.223679,0.447287,0.487224,1,1,1,1,1,1,1,1
104,0.814609,0.094349,0.380392,0.158842,0.028759,0.158987,0.398763,0.059638,0.849910,1,1,1,1,1,1,1,1
