# class_probabilities

In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

import bookcave

In [None]:
# Load the BookCave data and keep only what this notebook uses: the label
# array `Y` (indexed below as Y[:, category_index]), the `categories` and
# per-category `levels` sequences, and `categories_df` (presumably a
# DataFrame of category metadata -- confirm against bookcave.get_data).
# All other returned values are discarded.
(_, Y, categories, levels,
 _, _, _, _, categories_df) = bookcave.get_data(
    {'text'},
    text_input='filename',
    only_categories={0, 1, 2, 3, 4, 5, 6},
    return_meta=True,
)

In [None]:
# Display the `categories` value returned by bookcave.get_data.
categories

In [None]:
# Display the `levels` value returned by bookcave.get_data; it is iterated
# below as one collection of levels per category.
levels

In [None]:
def get_flat_levels():
    """Yield every level, category by category, as one flat sequence.

    Reads the module-level `levels` (one iterable of levels per category)
    and yields its items in their original order.
    """
    for levels_in_category in levels:
        yield from levels_in_category

# Materialize the flattened levels and print each one next to its position
# in the flat list.
flat_levels = [*get_flat_levels()]
row_template = '{:2d}: {}'
for i, flat_level in enumerate(flat_levels):
    print(row_template.format(i, flat_level))

Encode the labels as one-hot vectors, then estimate the pairwise conditional probabilities between all levels.

In [None]:
# One-hot encode the label matrix: one column per (category, level) pair.
# The dense array is obtained by calling `.toarray()` on the encoder's
# default sparse output rather than passing `sparse=False`, because that
# keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; this form works on both old and new releases.
enc = OneHotEncoder(categories='auto')
Y_hot = enc.fit_transform(Y).toarray()
n, m = Y_hot.shape

# co_counts[i, j] is the number of samples labeled with both level i and
# level j; the diagonal therefore holds the sample count of each level.
co_counts = Y_hot.T.dot(Y_hot)
level_counts = np.diag(co_counts)

# corr[i, j] = P(level j | level i) = co_counts[i, j] / level_counts[i].
# Conditional probabilities are not symmetric, so every row is normalized by
# its own level count instead of mirroring the upper triangle into the lower
# one. The diagonal is 1 by construction. Each one-hot column produced with
# categories='auto' contains at least one 1, so no denominator is zero.
corr = co_counts / level_counts[:, np.newaxis]

In [None]:
# Draw the `corr` matrix as a heatmap, save the figure under ../figures,
# then show it inline.
corr_df = pd.DataFrame(data=corr)
figure_path = os.path.join('..', 'figures', 'class_conditional_probabilities.png')
sns.heatmap(data=corr_df)
plt.title('Class Conditional Probabilities')
plt.savefig(figure_path, bbox_inches='tight')
plt.show()

For each level, compute the conditional probability of every level in the other categories given that level.

In [None]:
# Rating labels in their fixed order.
ratings = [
    'All Ages',
    'Mild',
    'Mild+',
    'Moderate',
    'Moderate+',
    'Adult',
    'Adult+',
]
# Position of each rating label within `ratings`.
rating_to_index = dict(zip(ratings, range(len(ratings))))
# Color names, one per entry of `ratings` (presumably used when plotting
# ratings -- not referenced in the code shown here).
rating_colors = [
    'royalblue',
    'mediumseagreen',
    'darkseagreen',
    'yellowgreen',
    'yellow',
    'orange',
    'orangered',
]

def cond_proba(sep_len=72):
    """Print P(other level | level) for every level of every category.

    For each category and each of its levels, prints the share of samples
    carrying that level, then, for every *other* category, the conditional
    probability of each of its levels given the current level. Reads the
    module-level `Y`, `categories`, and `levels`.

    Parameters
    ----------
    sep_len : int
        Width of the separator rules printed between categories ('=') and
        between levels ('-'). The rules are omitted when `sep_len` is 1 or
        less; the blank lines are printed regardless.

    Notes
    -----
    A level with no samples has undefined conditional probabilities; they
    are printed as ``nan``.
    """
    for category_index, category_levels in enumerate(levels):
        category = categories[category_index]
        if category_index != 0:
            if sep_len > 1:
                print()
                print('=' * sep_len)
            print()
        print('{}'.format(category))

        y = Y[:, category_index]

        for level_index, level in enumerate(category_levels):
            # Boolean mask of the samples labeled with this level.
            level_mask = y == level_index
            level_count = int(np.count_nonzero(level_mask))
            p_category = level_count / len(y)

            if level_index != 0 and sep_len > 1:
                print()
                print('  ' + '-' * (sep_len - 2))
            print()
            print('  {} ({:.2%} of category)'.format(level, p_category))

            for other_category_index, other_category_levels in enumerate(levels):
                if category_index == other_category_index:
                    continue

                print()
                print('    {}'.format(categories[other_category_index]))

                y_other = Y[:, other_category_index]
                bincount = np.bincount(y_other[level_mask], minlength=len(other_category_levels))
                if level_count:
                    ps = bincount / level_count
                else:
                    # No sample has this level: report nan explicitly
                    # instead of dividing zero by zero.
                    ps = np.full(len(bincount), np.nan)

                # `ps` has at least one entry per level (minlength above),
                # so zip stops after the last level of the other category.
                for other_level, p in zip(other_category_levels, ps):
                    print('      {:.4f} = P("{}" | "{}")'.format(p, other_level, level))

In [None]:
cond_proba()