In [306]:
import os

import pandas as pd

In [307]:
# Root directory of the project; datasets are read from
# <PROJECT_PATH>/lab1/datasets/<name>/<name>.data.
# The default is the path this notebook was written with. Set the
# ML_LABS_PATH environment variable to run the notebook from another
# location without editing this cell.
PROJECT_PATH = os.environ.get('ML_LABS_PATH', '/home/adam/Desktop/ml_labs')

In [308]:
# Per-dataset configuration, keyed by dataset name.
#   'columns'       - column names assigned to the header-less data file,
#                     in file order
#   'y_column'      - name of the column holding the class label
#   'class_mapping' - integer class index -> human-readable class name
datasets_data = {
    'iris': {
        'columns': [
            'sepal_length',
            'sepal_width',
            'petal_length',
            'petal_width',
            'class'
        ],
        'y_column': 'class',
        'class_mapping': {
            0: 'Iris-setosa',
            1: 'Iris-versicolor',
            2: 'Iris-virginica'
        }
    },
    'glass': {
        'columns': [
            'Id',
            'RI',
            'Na',
            'Mg',
            'Al',
            'Si',
            'K',
            'Ca',
            'Ba',
            'Fe',
            'type_of_glass'
        ],
        'y_column': 'type_of_glass',
        # A leftover duplicate `'class_mapping': None` entry used to precede
        # this key; it was silently overridden by the mapping below and has
        # been removed.
        'class_mapping': {
            1: 'building_windows_float_processed',
            2: 'building_windows_non_float_processed',
            3: 'vehicle_windows_float_processed',
            4: 'vehicle_windows_non_float_processed (none in this database)',
            5: 'containers',
            6: 'tableware',
            7: 'headlamps',
        }
    },
    'wine': {
        'columns': [
            'class',
            'alcohol',
            'malic_acid',
            'ash',
            'alcalinity_of_ash',
            'magnesium',
            'total_phenols',
            'flavanoids',
            'nonflavanoid_phenols',
            'proanthocyanins',
            'color_intensity',
            'hue',
            'OD280/OD315_of_diluted_wines',
            'proline'
        ],
        'y_column': 'class',
        'class_mapping': {
            1: 'class_1',
            2: 'class_2',
            3: 'class_3',
        }
    },
}

In [309]:
def perform_analysis(dataset_name):
    """Print a class-distribution and attribute summary for one dataset.

    Reads ``<PROJECT_PATH>/lab1/datasets/<dataset_name>/<dataset_name>.data``
    as a header-less CSV, names its columns from ``datasets_data`` and
    prints, in order: the per-class counts and percentages, the list of
    attribute columns, and ``DataFrame.describe()`` of those attributes.

    Parameters
    ----------
    dataset_name : str
        Key of the dataset in ``datasets_data``.

    Returns
    -------
    None
        The report is written to standard output.

    Raises
    ------
    AssertionError
        If ``dataset_name`` is not a key of ``datasets_data``.
    KeyError
        If a class label found in the data is missing from the dataset's
        ``class_mapping``.
    """
    # Validate against the configuration itself so that a dataset added to
    # `datasets_data` is accepted without touching this function.
    assert dataset_name in datasets_data, \
        'unknown dataset: {!r}'.format(dataset_name)

    config = datasets_data[dataset_name]
    y_col = config['y_column']
    class_mapping = config['class_mapping']

    df = pd.read_csv(
        '{}/lab1/datasets/{}/{}.data'.format(
            PROJECT_PATH,
            dataset_name,
            dataset_name
        ),
        header=None
    )
    df.columns = config['columns']

    # Attributes are all columns except the label and the 'Id' column.
    x_cols = [col for col in config['columns'] if col not in (y_col, 'Id')]

    # When the label column holds class names rather than numeric indices
    # (the iris file does), translate names to indices using the inverse of
    # `class_mapping`. Unknown names raise KeyError instead of being
    # silently dropped.
    if not pd.api.types.is_numeric_dtype(df[y_col]):
        name_to_idx = {name: idx for idx, name in class_mapping.items()}
        df[y_col] = df[y_col].apply(lambda name: name_to_idx[name])

    # Keep attributes followed by the label, exposed under a uniform name.
    df = df[x_cols + [y_col]].rename(columns={y_col: 'class_idx'})

    # Explicit column assignment keeps this independent of how the installed
    # pandas version names the columns produced by value_counts().
    class_counts = df['class_idx'].value_counts().to_frame().reset_index()
    class_counts.columns = ['class_idx', 'class_count']
    class_counts = class_counts.assign(
        class_name=class_counts['class_idx'].apply(
            lambda idx: class_mapping[idx]
        ),
        class_perc=(
            class_counts['class_count'] / class_counts['class_count'].sum() * 100
        ),
    )[['class_name', 'class_idx', 'class_count', 'class_perc']]

    print('#'*30)
    print('DATASET NAME: {}'.format(dataset_name))
    print()

    print('CLASS DISTRIBUTION ANALYSIS:')
    print(class_counts)
    print()

    print('DATASET ATRIBUTES:')
    print(x_cols)
    print()

    print('DATASET ANALYSIS:')
    print(df[x_cols].describe())
    print()

In [310]:
perform_analysis(dataset_name='iris')

##############################
DATASET NAME: iris

CLASS DISTRIBUTION ANALYSIS:
        class_name  class_idx  class_count  class_perc
0   Iris-virginica          2           50   33.333333
1  Iris-versicolor          1           50   33.333333
2      Iris-setosa          0           50   33.333333

DATASET ATRIBUTES:
['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

DATASET ANALYSIS:
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000



In [311]:
perform_analysis(dataset_name='glass')

##############################
DATASET NAME: glass

CLASS DISTRIBUTION ANALYSIS:
                             class_name  class_idx  class_count  class_perc
0  building_windows_non_float_processed          2           76   35.514019
1      building_windows_float_processed          1           70   32.710280
2                             headlamps          7           29   13.551402
3       vehicle_windows_float_processed          3           17    7.943925
4                            containers          5           13    6.074766
5                             tableware          6            9    4.205607

DATASET ATRIBUTES:
['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']

DATASET ANALYSIS:
               RI          Na          Mg          Al          Si           K  \
count  214.000000  214.000000  214.000000  214.000000  214.000000  214.000000   
mean     1.518365   13.407850    2.684533    1.444907   72.650935    0.497056   
std      0.003037    0.816604    1.442408    0.4992

In [312]:
perform_analysis(dataset_name='wine')

##############################
DATASET NAME: wine

CLASS DISTRIBUTION ANALYSIS:
  class_name  class_idx  class_count  class_perc
0    class_2          2           71   39.887640
1    class_1          1           59   33.146067
2    class_3          3           48   26.966292

DATASET ATRIBUTES:
['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'OD280/OD315_of_diluted_wines', 'proline']

DATASET ANALYSIS:
          alcohol  malic_acid         ash  alcalinity_of_ash   magnesium  \
count  178.000000  178.000000  178.000000         178.000000  178.000000   
mean    13.000618    2.336348    2.366517          19.494944   99.741573   
std      0.811827    1.117146    0.274344           3.339564   14.282484   
min     11.030000    0.740000    1.360000          10.600000   70.000000   
25%     12.362500    1.602500    2.210000          17.200000   88.000000   
50%     13.050000  