# correlated_words

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

import bookcave
from classification import ordinal

In [None]:
# Load the data through the project-local `bookcave` module, requesting only
# the 'text' medium. With text_input='filename' the entries of `texts` are
# presumably file paths rather than raw text (this would match the
# vectorizer's input='filename' below) -- TODO confirm against
# bookcave.get_data.
# `Y` is indexed per category (Y[category_index]) and `levels` holds the
# level names per category (levels[category_index]) by the functions below.
inputs, Y, categories, levels = bookcave.get_data(
    media={'text'},
    text_source='book',
    text_input='filename',
    categories_mode='soft',
    combine_ratings='max')
texts = inputs['text']
# Last expression of the notebook cell: shows how many text entries were loaded.
len(texts)

In [None]:
# TF-IDF featurizer configuration (scikit-learn parameter semantics):
#   input='filename'      -- items passed to fit/transform are file names to read
#   stop_words='english'  -- drop scikit-learn's built-in English stop words
#   ngram_range=(1, 2)    -- use unigrams and bigrams as terms
#   min_df=2              -- ignore terms that occur in fewer than 2 documents
#   max_features=8192     -- cap the vocabulary at the 8192 most frequent terms
#   norm='l2'             -- scale each document vector to unit L2 norm
#   sublinear_tf=True     -- replace tf with 1 + log(tf)
vectorizer = TfidfVectorizer(
    input='filename',
    encoding='utf-8',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_features=8192,
    norm='l2',
    sublinear_tf=True)

In [None]:
# Fit the vocabulary/idf weights on `texts` and build the document-term
# matrix in one pass; column order matches the vectorizer's feature names.
X = vectorizer.fit_transform(texts)

The chi-squared term-correlation approach used below follows the article [Multi-Class Text Classification with Scikit-Learn](https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f).

In [None]:
def print_sensitive_terms(X, Y, topn=10, use_ordinal=True):
    """Print the terms most strongly associated with each category's labels.

    Iterates over the module-level `categories`; for each one, runs a
    chi-squared test between the columns of `X` and that category's labels
    and prints the `topn` highest-scoring terms.

    Args:
        X: Document-term feature matrix. Its columns must line up with the
            vocabulary of the module-level `vectorizer`.
        Y: Per-category labels, indexed as `Y[category_index]`.
        topn: Number of terms to print for each test.
        use_ordinal: If true, run one test per level threshold, using the
            target returned by `ordinal.to_simple_ordinal(y, level_index)`
            (presumably a binary "at least this level" target, going by the
            printed `>=` header -- confirm against `ordinal`). If false, run
            a single test against the category's labels as given.
    """
    for category_index, category in enumerate(categories):
        print('{}'.format(category))
        y = Y[category_index]
        if use_ordinal:
            category_levels = levels[category_index]
            for level_index in range(len(category_levels) - 1):
                y_hat = ordinal.to_simple_ordinal(y, level_index)
                # The header is built here because the category and level
                # indices only exist in this scope, not in the helper.
                header = '>= {}'.format(category_levels[level_index + 1])
                print_sensitive_terms_(y_hat, topn, features=X, header=header)
        else:
            # No level threshold applies, so there is no header to print.
            print_sensitive_terms_(y, topn, features=X)


def print_sensitive_terms_(y_hat, topn=10, features=None, header=None):
    """Print the `topn` terms with the highest chi-squared score for `y_hat`.

    Args:
        y_hat: Target labels, one per row of the feature matrix.
        topn: Number of terms to print; 0 prints no terms.
        features: Feature matrix to test. Defaults to the module-level `X`
            so existing two-argument calls keep working.
        header: Optional text printed (indented) before the term list.
    """
    # Imported here so the submodule is guaranteed to be loaded; a bare
    # `import sklearn` does not promise `sklearn.feature_selection` exists
    # as an attribute.
    from sklearn.feature_selection import chi2 as chi2_test

    if features is None:
        features = X
    scores, pvals = chi2_test(features, y_hat)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    sensitive_terms = np.asarray(get_names())

    if header is not None:
        print('  {}'.format(header))

    # Descending by score. Slicing from the front (instead of [-topn:])
    # keeps topn=0 from selecting every term.
    top_indices = np.argsort(scores)[::-1][:topn]
    for rank, i in enumerate(top_indices, start=1):
        print('    {:3d}: {:24s}    chi2: {:5.2f}    p: {:.8f}'.format(rank, sensitive_terms[i], scores[i], pvals[i]))

In [None]:
# Show the 4 top-scoring terms per category, testing against each category's
# labels as given (use_ordinal=False skips the per-level threshold tests).
print_sensitive_terms(X, Y, topn=4, use_ordinal=False)