# correlated_words

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

import bookcave
from classification import ordinal

In [None]:
# Load the dataset through the project's `bookcave` helper. The call returns
# four values: `inputs` (indexed below by the key 'text'), the label array `Y`
# (one column per category, see the `Y[:, category_index]` lookup in the last
# cell), and `categories` / `levels`, which the last cell indexes per category.
# NOTE(review): the keyword arguments are interpreted by bookcave.get_data,
# which is not visible here. Presumably text_input='filename' makes the text
# inputs file paths rather than file contents (the vectorizer below is created
# with input='filename') -- confirm against the bookcave module.
inputs, Y, categories, levels = bookcave.get_data(
    media={'text'},
    text_source='book',
    text_input='filename',
    categories_mode='soft',
    combine_ratings='max')
# Only the text inputs are used in this notebook.
texts = inputs['text']

In [None]:
# TF-IDF vectorizer used to turn each book into a weighted n-gram vector.
vectorizer = TfidfVectorizer(
    # Items passed to fit/transform are file names to be opened and read.
    input='filename',
    encoding='utf-8',
    # Drop scikit-learn's built-in English stop-word list.
    stop_words='english',
    # Extract both unigrams and bigrams.
    ngram_range=(1, 2),
    # Ignore terms that occur in fewer than 2 documents.
    min_df=2,
    # Keep only the 8192 terms with the highest corpus-wide term frequency.
    max_features=8192,
    # Scale every document vector to unit Euclidean length.
    norm='l2',
    # Use 1 + log(tf) instead of the raw term frequency.
    sublinear_tf=True)

In [None]:
# Learn the vocabulary and IDF weights from `texts` and build the
# document-term matrix in a single pass (one row per text, one column per
# retained n-gram).
X = vectorizer.fit_transform(texts)

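The chi-squared ranking of correlated unigrams and bigrams in the next cell follows the approach described in this [Towards Data Science article on multi-class text classification with scikit-learn](https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f).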

In [None]:
# For every category and every ordinal threshold within it, rank the TF-IDF
# features by their chi-squared statistic against the binarised label and
# print the `n` highest-scoring unigrams and bigrams.
n = 20  # how many top-ranked unigrams / bigrams to print per threshold

# The vocabulary does not depend on the category or level, so fetch it once
# instead of rebuilding the array on every inner iteration.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    feature_names = np.array(vectorizer.get_feature_names())

for category_index, category in enumerate(categories):
    print('{}'.format(category))
    y = Y[:, category_index]
    # One binary problem per threshold between consecutive levels.
    for level_index in range(len(levels[category_index]) - 1):
        y_ordinal = ordinal.to_simple_ordinal(y, level_index)
        # The `np.bool` alias was removed in NumPy 1.24; the builtin `bool`
        # yields the same dtype on every NumPy version.
        # TODO: the original line carried an unexplained "Fix!!" marker --
        # confirm that this binarisation is what is intended.
        chi2_scores, _ = chi2(X, y_ordinal.astype(bool))
        # Ascending order: the most correlated features end up last.
        indices = np.argsort(chi2_scores)
        sensitive_words = feature_names[indices]
        unigrams = [v for v in sensitive_words if len(v.split(' ')) == 1]
        bigrams = [v for v in sensitive_words if len(v.split(' ')) == 2]
        print('  Level >= {}'.format(levels[category_index][level_index + 1]))
        print('    Most correlated unigrams:')
        for unigram in unigrams[-n:]:
            print('      {}'.format(unigram))
        print('    Most correlated bigrams:')
        for bigram in bigrams[-n:]:
            print('      {}'.format(bigram))