# class_probabilities

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import bookcave

In [None]:
# Load the labels and metadata from bookcave. Nine values are returned; only
# the four used later in this notebook are kept:
#   Y             - label array, indexed below as Y[:, category_index]
#   categories    - category names, indexed by category index
#   levels        - one sequence of level names per category
#   categories_df - DataFrame read below via its 'category' and 'rating' columns
# NOTE(review): `only_categories` presumably selects categories by index -
# confirm against bookcave.get_data.
(
    _, Y, categories, levels,
    _, _, _, _, categories_df,
) = bookcave.get_data(
    {'text'},
    text_input='filename',
    only_categories={1, 3, 5, 6},
    return_meta=True,
)

In [None]:
# Inspect the loaded category names (a bare expression is echoed as the cell output).
categories

In [None]:
# Inspect the level names of each loaded category (echoed as the cell output).
levels

For each level of each category, compute the conditional probabilities of the levels of every other category.

In [None]:
# Rating names, listed from 'All Ages' up to 'Adult+'.
ratings = [
    'All Ages',
    'Mild',
    'Mild+',
    'Moderate',
    'Moderate+',
    'Adult',
    'Adult+',
]

# Position of every rating name within `ratings`.
rating_to_index = dict(zip(ratings, range(len(ratings))))

# One matplotlib color name per rating, in the same order as `ratings`
# (looked up as rating_colors[rating_to_index[rating]]).
rating_colors = [
    'royalblue',
    'mediumseagreen',
    'darkseagreen',
    'yellowgreen',
    'yellow',
    'orange',
    'orangered',
]

def _level_colors(other_category):
    """Return the bar color(s) for the levels of `other_category`.

    Selects the rows of the module-level `categories_df` whose 'category'
    value starts with `other_category`. If any selected row has a null
    'rating', the single fallback color 'r' is returned; otherwise one
    `rating_colors` entry per selected row, chosen through `rating_to_index`.
    """
    # Literal prefix comparison: a category name is plain text and must not be
    # interpreted as a regular expression (which `str.match` would do).
    is_category = categories_df['category'].str.startswith(other_category)
    category_ratings = categories_df[is_category]['rating']
    if category_ratings.isnull().any():
        return 'r'
    return [rating_colors[rating_to_index[rating]] for rating in category_ratings]


def _plot_level_counts(category, level, other_category, other_category_levels, counts):
    """Show a bar chart of `counts`, one bar per level of `other_category`."""
    _, ax = plt.subplots()
    positions = np.arange(len(other_category_levels))
    ax.bar(positions, counts, color=_level_colors(other_category))
    ax.set_title('P(C in "{}" | "{}"="{}")'.format(other_category, category, level))
    ax.set_ylabel('# of books')
    # Fix the tick positions before labelling them, so that every label is
    # attached to its own bar regardless of the axis' default tick locator.
    ax.set_xticks(positions)
    ax.set_xticklabels(other_category_levels, rotation=45, ha='right')
    plt.show()


def cond_proba(do_plot=False, sep_len=72):
    """Print conditional level probabilities between every pair of categories.

    Reads the module-level `Y`, `categories` and `levels`. For each level of
    each category, the books labelled with that level are selected and, for
    every *other* category, the fraction of the selected books at each of its
    levels is printed as `P("other level" | "level")`. The share of all books
    that have the conditioning level is printed next to the level name.

    A level that no book has yields `nan` probabilities.

    Args:
        do_plot: When true, also show one bar chart of book counts per
            (level, other category) pair; this additionally reads the
            module-level `categories_df`, `rating_colors` and
            `rating_to_index`.
        sep_len: Width of the separator rules printed between categories
            ('=') and between levels ('-'). Rules are omitted when this is
            not greater than 1.

    Returns:
        None. All results are printed (and optionally plotted).
    """
    for category_index, category_levels in enumerate(levels):
        category = categories[category_index]
        if category_index != 0:
            if sep_len > 1:
                print()
                print('=' * sep_len)
            print()
        print(category)

        y = Y[:, category_index]
        n_books = len(y)

        for level_index, level in enumerate(category_levels):
            # Vectorized selection of the books labelled with this level.
            level_match_indices = np.flatnonzero(y == level_index)
            n_matches = len(level_match_indices)
            p_category = n_matches / n_books if n_books else float('nan')

            if level_index != 0:
                if sep_len > 1:
                    print()
                    print('  ' + '-' * (sep_len - 2))
            print()
            print('  {} ({:.2%} of category)'.format(level, p_category))

            for other_category_index, other_category_levels in enumerate(levels):
                if category_index == other_category_index:
                    continue

                other_category = categories[other_category_index]
                print()
                print('    {}'.format(other_category))

                y_other = Y[:, other_category_index]
                bincount = np.bincount(y_other[level_match_indices], minlength=len(other_category_levels))
                if n_matches:
                    ps = bincount / n_matches
                else:
                    # Nothing to condition on: the probabilities are undefined.
                    # Report nan explicitly instead of dividing by zero.
                    ps = np.full(len(bincount), np.nan)

                if do_plot:
                    _plot_level_counts(category, level, other_category, other_category_levels, bincount)

                for other_level_index, other_level in enumerate(other_category_levels):
                    p = ps[other_level_index]
                    print('      {:.4f} = P("{}" | "{}")'.format(p, other_level, level))

In [None]:
# Print all conditional-probability tables and show a bar chart of book counts
# for each (level, other category) pair.
cond_proba(do_plot=True)