# correlated_words

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

from sites.bookcave import bookcave
from classification import ordinal

In [None]:
# Options for the BookCave loader, gathered in one place so they are easy to tune.
# NOTE(review): `text_input='filename'` presumably makes `inputs['text']` a list of
# file paths rather than raw text -- confirm against `bookcave.get_data`.
bookcave_options = dict(
    media={'text'},
    text_source='book',
    text_input='filename',
    categories_mode='soft',
    combine_ratings='max',
)
inputs, Y, categories, levels = bookcave.get_data(**bookcave_options)

# Keep only the text input and show how many entries were loaded.
texts = inputs['text']
len(texts)

In [None]:
# TF-IDF settings: read documents from file paths, use unigrams and bigrams,
# drop English stop words and terms seen in fewer than 2 documents, and cap
# the vocabulary at 8192 features.
tfidf_options = dict(
    input='filename',
    encoding='utf-8',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_features=8192,
    norm='l2',
    sublinear_tf=True,
)
vectorizer = TfidfVectorizer(**tfidf_options)

In [None]:
# Fit the TF-IDF vectorizer on `texts` and transform them into the feature matrix X.
X = vectorizer.fit_transform(texts)

The chi-squared term ranking below follows the approach described in the [Multi Class Text Classification article](https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f).

In [73]:
def _print_top_terms(scores, pvals, terms, topn, indent):
    """Print the `topn` highest-scoring terms, best first, one per line."""
    # Reverse the ascending argsort and slice from the front so that topn=0
    # yields no rows (a `[-topn:]` slice would select every term instead).
    ranked = np.argsort(scores)[::-1][:topn]
    for rank, i in enumerate(ranked, start=1):
        print('{}{:3d}: {:24s}    chi2: {:5.2f}    p: {:.8f}'.format(
            indent, rank, terms[i], scores[i], pvals[i]))


def print_sensitive_terms(X, Y, topn=10, use_ordinal=True):
    """Print, per category, the terms with the highest chi-squared scores.

    Reads the notebook-level names `categories`, `levels`, `vectorizer` and
    the `ordinal` module in addition to its arguments.

    Args:
        X: Feature matrix passed to `sklearn.feature_selection.chi2`; its
            columns must correspond to the fitted `vectorizer`'s features.
        Y: Indexable by category index; `Y[i]` holds the labels for
            `categories[i]`.
        topn: Maximum number of terms printed per ranking; 0 prints none.
        use_ordinal: When True, one ranking is printed per level threshold,
            using the labels returned by
            `ordinal.to_simple_ordinal(y, level_index)` and headed by
            `>= <next level>`. When False, a single ranking is computed
            directly against `Y[i]`.

    Returns:
        None. Results are written to standard output.
    """
    # Imported here because a bare `import sklearn` does not guarantee that the
    # `feature_selection` submodule is loaded as an attribute.
    from sklearn.feature_selection import chi2 as chi2_test

    # The vocabulary does not change between categories or levels, so fetch it
    # once. `get_feature_names` was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = np.asarray(vectorizer.get_feature_names_out())
    else:
        terms = np.asarray(vectorizer.get_feature_names())

    for category_index, category in enumerate(categories):
        print('{}'.format(category))
        y = Y[category_index]
        if use_ordinal:
            category_levels = levels[category_index]
            for level_index in range(len(category_levels) - 1):
                y_hat = ordinal.to_simple_ordinal(y, level_index)
                scores, pvals = chi2_test(X, y_hat)
                print('  >= {}'.format(category_levels[level_index + 1]))
                _print_top_terms(scores, pvals, terms, topn, indent='    ')
        else:
            scores, pvals = chi2_test(X, y)
            _print_top_terms(scores, pvals, terms, topn, indent='  ')

In [75]:
# Rank the top 20 terms per category against the original labels (no per-level ordinal split).
print_sensitive_terms(X, Y, topn=20, use_ordinal=False)

crude_humor_language
    1: fuck                        chi2: 50.98    p: 0.00000000
    2: fucking                     chi2: 48.06    p: 0.00000000
    3: shit                        chi2: 43.69    p: 0.00000000
    4: pussy                       chi2: 38.74    p: 0.00000002
    5: fucked                      chi2: 32.96    p: 0.00000033
    6: ass                         chi2: 31.77    p: 0.00000059
    7: cock                        chi2: 28.56    p: 0.00000278
    8: bitch                       chi2: 27.45    p: 0.00000474
    9: damn                        chi2: 26.40    p: 0.00000786
   10: asshole                     chi2: 25.13    p: 0.00001450
   11: bullshit                    chi2: 22.29    p: 0.00005672
   12: bastard                     chi2: 21.09    p: 0.00010092
   13: pissed                      chi2: 20.81    p: 0.00011539
   14: dick                        chi2: 20.49    p: 0.00013408
   15: hell                        chi2: 20.48    p: 0.00013519
   16: breasts     

    1: fuck                        chi2:  7.92    p: 0.01907481
    2: fucking                     chi2:  7.65    p: 0.02181982
    3: shit                        chi2:  6.76    p: 0.03409678
    4: fucked                      chi2:  6.43    p: 0.04005732
    5: bitch                       chi2:  4.88    p: 0.08721588
    6: ass                         chi2:  4.81    p: 0.09030296
    7: sex                         chi2:  4.17    p: 0.12429727
    8: pussy                       chi2:  4.05    p: 0.13206630
    9: auntie lil                  chi2:  3.94    p: 0.13919446
   10: sexual                      chi2:  3.85    p: 0.14596887
   11: asshole                     chi2:  3.72    p: 0.15557359
   12: rhys                        chi2:  3.72    p: 0.15573275
   13: damn                        chi2:  3.38    p: 0.18445572
   14: bullshit                    chi2:  3.36    p: 0.18642067
   15: pissed                      chi2:  3.25    p: 0.19718014
   16: bastard                     chi2: