In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Root directory of the lab materials. Datasets are read from
# <PROJECT_PATH>/lab1/datasets/<name>/<name>.data.
# The default is the original author's local checkout; set the
# ML_LABS_PROJECT_PATH environment variable before starting the kernel to
# point the notebook at a different location without editing this cell.
PROJECT_PATH = os.environ.get('ML_LABS_PROJECT_PATH', '/home/adam/Desktop/ml_labs')

In [None]:
# Per-dataset configuration consumed by perform_analysis().
#   columns       - names assigned, in order, to the columns of the raw
#                   header-less .data file
#   y_column      - name of the class/label column
#   class_mapping - class index -> human-readable class name
#   plot_0_cols   - the two feature columns drawn on the first scatter plot
datasets_data = {
    'iris': {
        'columns': [
            'sepal_length',
            'sepal_width',
            'petal_length',
            'petal_width',
            'class'
        ],
        'y_column': 'class',
        # perform_analysis() inverts this mapping to turn the class names
        # found in the iris label column into integer indices.
        'class_mapping': {
            0: 'Iris-setosa',
            1: 'Iris-versicolor',
            2: 'Iris-virginica'
        },
        'plot_0_cols': ['sepal_width', 'sepal_length']
    },
    'glass': {
        'columns': [
            'Id',  # dropped from the feature list by perform_analysis()
            'RI',
            'Na',
            'Mg',
            'Al',
            'Si',
            'K',
            'Ca',
            'Ba',
            'Fe',
            'type_of_glass'
        ],
        'y_column': 'type_of_glass',
        # A duplicate `'class_mapping': None` entry used to precede this one;
        # it was silently overridden by this dict and has been removed.
        'class_mapping': {
            1: 'building_windows_float_processed',
            2: 'building_windows_non_float_processed',
            3: 'vehicle_windows_float_processed',
            4: 'vehicle_windows_non_float_processed (none in this database)',
            5: 'containers',
            6: 'tableware',
            7: 'headlamps',
        },
        'plot_0_cols': ['RI', 'Si']
    },
    'wine': {
        'columns': [
            'class',
            'alcohol',
            'malic_acid',
            'ash',
            'alcalinity_of_ash',
            'magnesium',
            'total_phenols',
            'flavanoids',
            'nonflavanoid_phenols',
            'proanthocyanins',
            'color_intensity',
            'hue',
            'OD280/OD315_of_diluted_wines',
            'proline'
        ],
        'y_column': 'class',
        'class_mapping': {
            1: 'class_1',
            2: 'class_2',
            3: 'class_3',
        },
        'plot_0_cols': ['alcohol', 'malic_acid']
    },
}

In [None]:
def _scatter_by_class(df, x_col, y_col, class_mapping, title):
    """Open a new 9x9 figure and scatter `y_col` against `x_col`, one series per class.

    `df` must contain a 'class_idx' column; `class_mapping` maps each class
    index to the legend label used for that series.
    """
    plt.figure(figsize=(9, 9))

    for class_idx, class_name in class_mapping.items():
        df_class = df.loc[df['class_idx'] == class_idx]
        plt.scatter(df_class[x_col], df_class[y_col], label=class_name)

    plt.legend(loc='lower right')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(title)


def perform_analysis(dataset_name):
    """Print summary statistics and draw exploratory plots for one dataset.

    Reads ``<PROJECT_PATH>/lab1/datasets/<dataset_name>/<dataset_name>.data``
    (no header row), names its columns from ``datasets_data`` and then:

    * prints the per-class sample counts and percentages,
    * prints the feature names and ``describe()`` statistics,
    * draws a scatter plot of the two configured ``plot_0_cols``,
    * draws a seaborn PairGrid of all features coloured by class,
    * draws a scatter plot of the first two principal components of the
      standardized features.

    Parameters
    ----------
    dataset_name : str
        A key of ``datasets_data``.

    Raises
    ------
    AssertionError
        If ``dataset_name`` is not a configured dataset.
    KeyError
        If the data contains a class label that is missing from the
        dataset's ``class_mapping``.
    """
    # Validate against the configuration itself so that adding a dataset to
    # `datasets_data` does not also require editing a hard-coded list here.
    assert dataset_name in datasets_data, (
        'unknown dataset {!r}; expected one of {}'.format(
            dataset_name, sorted(datasets_data)
        )
    )
    dataset_cfg = datasets_data[dataset_name]
    class_mapping = dataset_cfg['class_mapping']
    y_col = dataset_cfg['y_column']
    # Features are every column except the label and the 'Id' row identifier.
    x_cols = [col for col in dataset_cfg['columns'] if col not in (y_col, 'Id')]

    df_raw = pd.read_csv(
        '{}/lab1/datasets/{}/{}.data'.format(
            PROJECT_PATH,
            dataset_name,
            dataset_name
        ),
        header=None
    )
    df_raw.columns = dataset_cfg['columns']

    # Datasets whose label column holds class names (iris) are converted to
    # class indices by inverting the configured mapping; numeric label
    # columns (glass, wine) are already indices and are left untouched.
    if not pd.api.types.is_numeric_dtype(df_raw[y_col]):
        name_to_idx = {name: idx for idx, name in class_mapping.items()}
        df_raw[y_col] = df_raw[y_col].map(lambda name: name_to_idx[name])

    # Features first, label last, with the label column renamed to 'class_idx'.
    df = df_raw[x_cols + [y_col]].rename(columns={y_col: 'class_idx'})

    # rename_axis + reset_index(name=...) yields the same two column names
    # regardless of how the installed pandas version names value_counts() output.
    class_counts = (
        df['class_idx']
        .value_counts()
        .rename_axis('class_idx')
        .reset_index(name='class_count')
    )
    class_counts = class_counts.assign(
        class_name=class_counts['class_idx'].map(lambda idx: class_mapping[idx]),
        class_perc=class_counts['class_count'] / class_counts['class_count'].sum() * 100,
    )[['class_name', 'class_idx', 'class_count', 'class_perc']]

    print('#'*30)
    print('DATASET NAME: {}'.format(dataset_name))

    print()
    print('CLASS DISTRIBUTION ANALYSIS:')

    print(class_counts)

    print()
    print('DATASET ATRIBUTES:')

    print(x_cols)

    print()
    print('DATASET ANALYSIS:')

    print(df[x_cols].describe())

    print()
    print('SCATTER PLOT')

    col_0, col_1 = dataset_cfg['plot_0_cols']
    _scatter_by_class(
        df, col_0, col_1, class_mapping,
        title='{}: {} vs {}'.format(dataset_name, col_1, col_0)
    )

    print()
    print('PAIRGRID PLOT')

    # Build the labelled frame separately instead of adding a column to `df`.
    df_named = df[x_cols].assign(
        class_name=df['class_idx'].map(lambda idx: class_mapping[idx])
    )
    g = sns.PairGrid(df_named, hue="class_name")
    g.map_diag(plt.hist)
    g.map_offdiag(plt.scatter)
    g.add_legend()

    print()
    print('PCA SCATTER PLOT')

    # Standardize the features before projecting them onto two components.
    x_scaled = StandardScaler().fit_transform(df[x_cols].values)
    pca_components = PCA(n_components=2).fit_transform(x_scaled)

    df_pca = pd.DataFrame(
        data=pca_components,
        columns=['comp_0', 'comp_1']
    )
    # Attach labels positionally; rows of `pca_components` are in the same
    # order as rows of `df`, so no index alignment is needed.
    df_pca['class_idx'] = df['class_idx'].to_numpy()

    _scatter_by_class(
        df_pca, 'comp_0', 'comp_1', class_mapping,
        title='{}: first two principal components'.format(dataset_name)
    )
    

In [None]:
# Exploratory report for the iris dataset.
perform_analysis('iris')

In [None]:
# Exploratory report for the glass dataset.
perform_analysis('glass')

In [None]:
# Exploratory report for the wine dataset.
perform_analysis('wine')