### Import Packages

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from ucimlrepo import fetch_ucirepo

### Fetch Datasets from UCI repository

In [None]:
# Download each dataset from the UCI repository by its numeric id.
# Ids are grouped three per line, in the same order as the names they are bound to.
ljubljana, wisconsin, car = (fetch_ucirepo(id=uci_id) for uci_id in (14, 17, 19))
connect4, cmc, dermatology = (fetch_ucirepo(id=uci_id) for uci_id in (26, 30, 33))
mushroom, nursery, tictactoe = (fetch_ucirepo(id=uci_id) for uci_id in (73, 76, 101))

### Transform to DataFrames

In [3]:
def ucirepo_to_df(ucirepo_data):
    """Combine a fetched UCI dataset into one DataFrame with a 'class' column.

    Parameters
    ----------
    ucirepo_data : object
        Result of ``fetch_ucirepo``. Only ``data.features``, ``data.targets`` and
        (optionally) ``data.feature_names`` are read.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the feature columns, plus the target stored in a
        column named 'class'.

    Raises
    ------
    ValueError
        If the targets are a DataFrame that does not have exactly one column,
        because a single 'class' column cannot represent them.
    """
    data = ucirepo_data.data
    # `feature_names` may be absent or None; in that case every feature column is kept.
    feature_names = getattr(data, 'feature_names', None)
    # Explicit copy: adding 'class' below must never write through to the fetched features.
    df = pd.DataFrame(data.features, columns=feature_names).copy()

    targets = data.targets
    if isinstance(targets, pd.DataFrame):
        if targets.shape[1] != 1:
            raise ValueError(
                f"expected exactly one target column, got {targets.shape[1]}"
            )
        # Reduce the one-column frame to a Series so it maps onto a single column.
        targets = targets.iloc[:, 0]

    df['class'] = targets
    return df

In [116]:
# Convert every fetched dataset exactly once, keyed by its short name.
datasets = {
    label: ucirepo_to_df(fetched)
    for label, fetched in (
        ('ljubljana', ljubljana),
        ('wisconsin', wisconsin),
        ('car', car),
        ('connect4', connect4),
        ('cmc', cmc),
        ('dermatology', dermatology),
        ('mushroom', mushroom),
        ('nursery', nursery),
        ('tictactoe', tictactoe),
    )
}

# Per-dataset aliases, unpacked in insertion order; each one is the very same
# object that is stored under the matching key of `datasets`.
(
    ljubljana_df,
    wisconsin_df,
    car_df,
    connect4_df,
    cmc_df,
    dermatology_df,
    mushroom_df,
    nursery_df,
    tictactoe_df,
) = datasets.values()

### Dataset Summary

In [117]:
def get_datasets_info(frames=None):
    """Summarise the size and feature types of each dataset.

    Parameters
    ----------
    frames : dict[str, pandas.DataFrame], optional
        Mapping of dataset name to DataFrame. Every frame must contain a
        'class' column holding the target. Defaults to the notebook-level
        ``datasets`` dict.

    Returns
    -------
    pandas.DataFrame
        One row per dataset with the columns 'dataset', '#instances',
        '#features', '#conti_features', '#categ_features' and '#classes'.
        The columns are present even when ``frames`` is empty.

    Notes
    -----
    "Continuous" is decided purely by dtype: every numeric column counts as
    continuous, so integer-coded categorical features are counted there too.
    """
    if frames is None:
        frames = datasets

    columns = [
        'dataset',
        '#instances',
        '#features',
        '#conti_features',
        '#categ_features',
        '#classes',
    ]

    rows = []
    for name, df in frames.items():
        # Split predictors from the target; a missing 'class' column raises KeyError.
        X, y = df.drop(columns=['class']), df['class']
        rows.append({
            'dataset': name,
            '#instances': len(df),
            '#features': X.shape[1],
            '#conti_features': X.select_dtypes(include=[np.number]).shape[1],
            '#categ_features': X.select_dtypes(exclude=[np.number]).shape[1],
            '#classes': y.nunique(),
        })

    # Passing `columns` keeps the schema stable when there are no rows.
    return pd.DataFrame(rows, columns=columns)

In [118]:
# Summarise size, feature dtypes and class count for every entry in `datasets`.
get_datasets_info()

Unnamed: 0,dataset,#instances,#features,#conti_features,#categ_features,#classes
0,ljubljana,286,9,1,8,2
1,wisconsin,569,30,30,0,2
2,car,1728,6,0,6,4
3,connect4,67557,42,0,42,3
4,cmc,1473,9,9,0,3
5,dermatology,366,34,34,0,6
6,mushroom,8124,22,0,22,2
7,nursery,12960,8,0,8,5
8,tictactoe,958,9,0,9,2


### Missing values, duplicates

In [119]:
def get_datasets_info2(frames=None):
    """Count duplicated rows and missing cells in each dataset.

    Parameters
    ----------
    frames : dict[str, pandas.DataFrame], optional
        Mapping of dataset name to DataFrame. Defaults to the notebook-level
        ``datasets`` dict.

    Returns
    -------
    pandas.DataFrame
        One row per dataset with the columns 'dataset', '#duplicates' (rows
        that repeat an earlier row) and '#missing' (null cells across all
        columns). The columns are present even when ``frames`` is empty.
    """
    if frames is None:
        frames = datasets

    rows = [
        {
            'dataset': name,
            '#duplicates': df.duplicated().sum(),
            # First sum is per column, second collapses to a single total.
            '#missing': df.isnull().sum().sum(),
        }
        for name, df in frames.items()
    ]

    # Passing `columns` keeps the schema stable when there are no rows.
    return pd.DataFrame(rows, columns=['dataset', '#duplicates', '#missing'])

In [120]:
# Count duplicated rows and missing cells for every entry in `datasets`.
get_datasets_info2()

Unnamed: 0,dataset,#duplicates,#missing
0,ljubljana,14,9
1,wisconsin,0,0
2,car,0,0
3,connect4,0,0
4,cmc,48,0
5,dermatology,0,8
6,mushroom,0,2480
7,nursery,0,0
8,tictactoe,0,0


### Remove duplicates

In [121]:
# Replace each stored frame with a de-duplicated version whose rows are renumbered from 0.
# Iterating over a snapshot of the keys keeps the reassignment independent of the dict view.
for name in list(datasets):
    df = datasets[name]
    datasets[name] = df.drop_duplicates(ignore_index=True)

### Handle missing values

In [122]:
# Replace every missing cell in these datasets with the literal category 'unknown'.
# NOTE(review): fillna applies to all columns, so a numeric column with gaps would end up
# mixing numbers with the string 'unknown' — confirm that is intended for each dataset.
datasets.update({
    key: datasets[key].fillna('unknown')
    for key in ('ljubljana', 'mushroom', 'dermatology')
})

In [123]:
# Re-run the duplicate / missing-value check now that `datasets` has been cleaned.
get_datasets_info2()

Unnamed: 0,dataset,#duplicates,#missing
0,ljubljana,0,0
1,wisconsin,0,0
2,car,0,0
3,connect4,0,0
4,cmc,0,0
5,dermatology,0,0
6,mushroom,0,0
7,nursery,0,0
8,tictactoe,0,0


In [124]:
# Destination folder for the cleaned CSV files.
# NOTE(review): this is a machine-specific absolute path — point it at your own
# checkout before running the notebook elsewhere.
OUTPUT_DIR = Path('/home/adel/Documents/Code/Ant-Miner/datasets/new')

# to_csv does not create missing folders, so make sure the target exists first.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Write one '<name>.csv' per dataset; index=False keeps the row index out of the file.
for name, df in datasets.items():
    df.to_csv(OUTPUT_DIR / f'{name}.csv', index=False)