### Import Packages

In [16]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from ucimlrepo import fetch_ucirepo

### Fetch Datasets from UCI repository

In [17]:
# Download the four datasets from the UCI repository in one pass; the ids are
# listed in the same order as the variable names on the left-hand side.
wisconsin, cmc, dermatology, nursery = (
    fetch_ucirepo(id=uci_id) for uci_id in (17, 30, 33, 76)
)

### Transform to DataFrames

In [18]:
def ucirepo_to_df(ucirepo_data):
    """Combine the features and targets of a fetched UCI dataset into one DataFrame.

    Args:
        ucirepo_data: Object returned by ``fetch_ucirepo``. Only
            ``data.features``, ``data.feature_names`` and ``data.targets``
            are read.

    Returns:
        A new ``pandas.DataFrame`` holding the feature columns plus a
        ``'class'`` column filled from ``data.targets``.
    """
    payload = ucirepo_data.data
    # copy=True: building a DataFrame from another DataFrame may share its
    # underlying data, in which case adding 'class' below could also alter the
    # fetched object (and therefore every later call made on it).
    df = pd.DataFrame(payload.features, columns=payload.feature_names, copy=True)
    # NOTE(review): assumes the dataset has a single target column -- confirm
    # before using this helper on multi-target datasets.
    df['class'] = payload.targets
    return df

In [22]:
# Spot-check the helper on one dataset and display the resulting frame.
wisconsin_df = ucirepo_to_df(wisconsin)
wisconsin_df

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,class
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,M
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,M
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,M
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,M
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,M
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,M
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,M
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,M


In [19]:
# Build one DataFrame per fetched dataset, keyed by a short dataset name.
datasets = {
    "wisconsin": ucirepo_to_df(wisconsin),
    "cmc": ucirepo_to_df(cmc),
    "dermatology": ucirepo_to_df(dermatology),
    "nursery": ucirepo_to_df(nursery),
}

# Expose each frame under its own variable name as well.
wisconsin_df = datasets["wisconsin"]
cmc_df = datasets["cmc"]
dermatology_df = datasets["dermatology"]
nursery_df = datasets["nursery"]


### Dataset Information

In [20]:
def get_datasets_info():
    """Summarise the size and feature types of every frame in ``datasets``.

    Reads the module-level ``datasets`` mapping (name -> DataFrame). Each
    frame must contain a ``'class'`` column, which is treated as the target;
    all other columns are treated as features.

    Returns:
        A DataFrame with one row per dataset and the columns ``dataset``,
        ``#instances``, ``#features``, ``#conti_features`` (feature columns
        with a numeric dtype), ``#categ_features`` (all other feature columns)
        and ``#classes`` (distinct values in ``'class'``).
    """
    def _describe(name, frame):
        # One summary record for a single dataset.
        features = frame.drop(columns=['class'])
        target = frame['class']
        numeric_part = features.select_dtypes(include=[np.number])
        other_part = features.select_dtypes(exclude=[np.number])
        return {
            'dataset': name,
            '#instances': len(frame),
            '#features': features.shape[1],
            '#conti_features': numeric_part.shape[1],
            '#categ_features': other_part.shape[1],
            '#classes': target.nunique(),
        }

    records = [_describe(name, frame) for name, frame in datasets.items()]
    return pd.DataFrame(records)

In [21]:
# Display the per-dataset size / feature-type summary as the cell output.
get_datasets_info()

Unnamed: 0,dataset,#instances,#features,#conti_features,#categ_features,#classes
0,wisconsin,569,30,30,0,2
1,cmc,1473,9,9,0,3
2,dermatology,366,34,34,0,6
3,nursery,12960,8,0,8,5


### Missing values, duplicates

In [9]:
def get_datasets_info2():
    """Count duplicate rows and missing cells for every frame in ``datasets``.

    Reads the module-level ``datasets`` mapping (name -> DataFrame).

    Returns:
        A DataFrame with one row per dataset and the columns ``dataset``,
        ``#duplicates`` (rows that repeat an earlier row) and ``#missing``
        (total number of null cells across all columns).
    """
    records = [
        {
            'dataset': name,
            '#duplicates': frame.duplicated().sum(),
            '#missing': frame.isnull().sum().sum(),
        }
        for name, frame in datasets.items()
    ]
    return pd.DataFrame(records)

In [10]:
# Display duplicate / missing-value counts before any cleaning is applied.
get_datasets_info2()

Unnamed: 0,dataset,#duplicates,#missing
0,balance_scale,0,0


### Remove duplicates

In [121]:
# Drop exact duplicate rows from every dataset and renumber the remaining
# rows from 0.
for name in list(datasets):
    unique_rows = datasets[name].drop_duplicates()
    datasets[name] = unique_rows.reset_index(drop=True)

### Handle missing values

In [122]:
# Replace missing values with the literal 'unknown' in the datasets that are
# expected to contain them. Names that are not currently loaded in `datasets`
# are skipped, so the cell does not raise KeyError when only a subset of the
# datasets has been fetched.
# NOTE(review): filling with a string presumably turns any numeric column that
# contains NaN into a non-numeric column -- confirm this is intended for
# 'dermatology'.
for name in ('ljubljana', 'mushroom', 'dermatology'):
    if name in datasets:
        datasets[name] = datasets[name].fillna('unknown')

In [123]:
# Re-run the duplicate / missing-value report after cleaning; both counts are
# expected to be 0 for the cleaned datasets.
get_datasets_info2()

Unnamed: 0,dataset,#duplicates,#missing
0,ljubljana,0,0
1,wisconsin,0,0
2,car,0,0
3,connect4,0,0
4,cmc,0,0
5,dermatology,0,0
6,mushroom,0,0
7,nursery,0,0
8,tictactoe,0,0


In [124]:
# Directory that receives one CSV per dataset. It defaults to the original
# location and can be redirected by setting the ANT_MINER_DATASETS_DIR
# environment variable, so the notebook also runs on other machines.
OUTPUT_DIR = Path(
    os.environ.get(
        'ANT_MINER_DATASETS_DIR',
        '/home/adel/Documents/Code/Ant-Miner/datasets/new',
    )
)
# Create the directory up front so to_csv does not fail on a missing folder.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Write each cleaned dataset as <name>.csv without the row index.
for name, df in datasets.items():
    df.to_csv(OUTPUT_DIR / f'{name}.csv', index=False)