In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
import math

In [2]:
import nltk
# Download the two NLTK resources named below into the local nltk_data directory.
# NOTE(review): presumably these back the TextBlob part-of-speech tagging used
# later in this notebook -- confirm against the TextBlob setup instructions.
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tktra\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\tktra\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
# Load the test document-term matrix.
# A forward slash is used as the separator: the previous 'Datasets\R...' literal
# depended on the invalid escape sequence '\R' being left untouched and only
# resolved on Windows; '/' works on Windows and POSIX alike.
test_data = pd.read_csv('Datasets/Religious_text_test.csv')
# The first word column is exported with a leading '# '; normalise its name.
test_data = test_data.rename(columns={'# foolishness': 'foolishness'})
test_data.head(5)

Unnamed: 0,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Load the training document-term matrix.
# A forward slash is used as the separator: the previous 'Datasets\R...' literal
# depended on the invalid escape sequence '\R' being left untouched and only
# resolved on Windows; '/' works on Windows and POSIX alike.
train_data = pd.read_csv('Datasets/Religious_text_train.csv')
# The unnamed first column holds the chapter labels (e.g. 'Buddhism_Ch1').
train_data = train_data.rename(columns={'Unnamed: 0': 'Chapters'})
train_data.head(5)

Unnamed: 0,Chapters,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
0,Buddhism_Ch1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Buddhism_Ch2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Buddhism_Ch3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Buddhism_Ch4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Buddhism_Ch5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Training Data Split By Book (with Columns Maintained)

In [5]:
# One frame per book: keep the rows whose chapter label contains the book's name.
# Every frame keeps the full set of word columns.
buddhism = train_data.loc[train_data['Chapters'].str.contains('Buddhism')]
tao_te_ching = train_data.loc[train_data['Chapters'].str.contains('TaoTeChing')]
upanishad = train_data.loc[train_data['Chapters'].str.contains('Upanishad')]
yoga_sutra = train_data.loc[train_data['Chapters'].str.contains('YogaSutra')]
book_proverb = train_data.loc[train_data['Chapters'].str.contains('Proverb')]
book_of_ecclesiastes = train_data.loc[train_data['Chapters'].str.contains('Ecclesiastes')]
book_of_eccleasiasticus = train_data.loc[train_data['Chapters'].str.contains('Eccleasiasticus')]
book_of_wisdom = train_data.loc[train_data['Chapters'].str.contains('Wisdom')]


In [6]:
def find_top_words(df:pd.DataFrame, count: int = 20) -> pd.DataFrame:
    """Return the ``count`` words (columns) with the largest total counts.

    Parameters
    ----------
    df : pd.DataFrame
        Document-term matrix with one column per word. A 'Chapters' label
        column is ignored when present and is not required.
    count : int, default 20
        Number of top words to return.

    Returns
    -------
    pd.DataFrame
        Indexed by word, with a single 'count' column sorted in descending order.
    """
    # Remove the label column *before* summing so the text labels are never
    # aggregated; errors='ignore' lets frames without the column (e.g. already
    # aggregated ones) pass through instead of raising KeyError.
    word_totals = df.drop(columns='Chapters', errors='ignore').sum(axis=0)
    # No squeeze(): on a one-word frame it would collapse the Series to a
    # scalar and make the DataFrame construction below fail.
    ranked = pd.DataFrame({'count': word_totals}).sort_values(by='count', ascending=False)
    return ranked.head(count)

In [7]:
def unique_words_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the word columns that have at least one non-zero entry.

    Columns that are zero in every row (words absent from the book) are
    removed, and so is the 'Chapters' label column when it is present.
    """
    has_nonzero = df.ne(0).any(axis=0)
    kept = df.loc[:, has_nonzero]
    if 'Chapters' not in kept.columns:
        return kept
    return kept.drop(columns='Chapters')

#### Training Data Split By Book Aggregated

In [8]:
# Book identifiers, in the same order as the per-book frames below.
book_names = ['Buddhism', 'TaoTeChing', 'Upanishad', 'YogaSutra', 'Proverb', 'Ecclesiastes', 'Eccleasiasticus', 'Wisdom']
# Per-book frames reduced to the words that actually occur in that book.
book_dfs = [
    unique_words_frame(frame)
    for frame in (
        buddhism,
        tao_te_ching,
        upanishad,
        yoga_sutra,
        book_proverb,
        book_of_ecclesiastes,
        book_of_eccleasiasticus,
        book_of_wisdom,
    )
]
# Lookup table: book name -> reduced frame.
dataframes = {name: frame for name, frame in zip(book_names, book_dfs)}

In [9]:
# Regex alternation of all book names; the capture group extracts whichever
# name appears in each chapter label.
pat = '|'.join(book_names)
by_book = train_data['Chapters'].str.extract('(' + pat + ')', expand = False)
# Drop the text label column explicitly before aggregating. Relying on the
# group-by sum to discard it silently does not hold on every pandas version:
# when the column is kept, its strings are concatenated and the later index
# reset collides with the 'Chapters' group key. rename_axis then labels the
# group index directly.
books = train_data.drop(columns='Chapters').groupby(by_book).sum().rename_axis('Books')
books.head(5)

Unnamed: 0_level_0,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
Books,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Buddhism,0,0,0,0,19,0,0,0,0,0,...,0,0,0,0,0,0,9,0,0,0
Eccleasiasticus,0,189,3,1,0,14,0,0,3,0,...,0,0,0,1,1,0,7,0,2,0
Ecclesiastes,0,46,0,0,0,5,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
Proverb,2,65,0,0,0,11,0,0,0,0,...,0,0,0,0,0,0,8,0,1,0
TaoTeChing,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
def get_text_blob(df:pd.DataFrame) -> TextBlob:
    """Build a TextBlob from the names of the columns that are not all zero.

    The column names are joined with single spaces, in column order.
    """
    nonzero_mask = df.ne(0).any(axis=0)
    present_words = df.columns[nonzero_mask.to_numpy()]
    return TextBlob(" ".join(present_words))

In [11]:
def get_word_pos_pairs(df:pd.DataFrame) -> list:
    """Return the (word, part-of-speech tag) pairs for the words (columns) of a dataframe.

    The pairs are the ``tags`` of the blob produced by ``get_text_blob``.
    """
    blob = get_text_blob(df)
    return blob.tags

In [12]:
def get_pos_tag_list(df:pd.DataFrame) -> list:
    """List each distinct part-of-speech tag found among a dataframe's words (columns).

    Every tag appears once; the order of the list is not defined.
    """
    distinct_tags = {pair[1] for pair in get_word_pos_pairs(df)}
    return list(distinct_tags)

In [13]:
def get_data_for_pos(df:pd.DataFrame, pos:str) -> pd.DataFrame:
    """Subset a dataframe to the columns whose word is tagged with the given part of speech.

    ``pos`` is compared for equality against the tag of each (word, tag) pair
    returned by ``get_word_pos_pairs``.
    """
    matching_words = [word for word, tag in get_word_pos_pairs(df) if tag == pos]
    column_mask = df.columns.isin(matching_words)
    return df.loc[:, column_mask]

In [14]:
get_data_for_pos(books, 'VB') # Example run retrieving all basic verbs

Unnamed: 0_level_0,build,coward,handle,think,keep,inculcate,disclose,feed,balance,rehearse,...,influence,gaineth,tone,tank,dathan,impose,realms,try,godlike,roar
Books,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Buddhism,0,0,0,6,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Eccleasiasticus,3,1,0,7,27,0,5,2,3,1,...,0,0,0,0,1,0,0,4,0,1
Ecclesiastes,1,0,0,1,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Proverb,1,0,0,5,24,0,0,0,2,0,...,0,1,0,0,0,0,0,0,0,0
TaoTeChing,0,0,0,4,16,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
Upanishad,0,0,1,11,1,0,0,0,0,0,...,1,0,0,0,0,0,5,2,1,0
Wisdom,1,0,1,4,5,0,0,1,1,0,...,0,0,0,0,0,0,0,2,0,0
YogaSutra,5,0,2,7,4,0,0,0,1,0,...,2,0,1,1,0,1,4,4,0,0


### Crude TF-IDF Analysis

In [15]:
# One text blob per book, in the iteration order of `dataframes`.
book_blobs = list(map(get_text_blob, dataframes.values()))

In [41]:
def tf(word:str, df: pd.DataFrame) -> float:
    """Crude term frequency of a word within a dataframe.

    The word's column is summed and divided by the number of columns in
    ``df`` (not by the total number of word occurrences).

    Parameters
    ----------
    word : str
        Column name to look up.
    df : pd.DataFrame
        Document-term matrix with one column per word.

    Returns
    -------
    float
        The column sum divided by ``df.shape[1]``; ``0.0`` when ``word`` is
        not a column of ``df``.
    """
    occurrences = df.get(word)
    # DataFrame.get returns None for an unknown column; a word that never
    # occurs has frequency zero rather than being an error.
    if occurrences is None:
        return 0.0
    return occurrences.sum() / df.shape[1]

In [42]:
def n_containing(word: str, blobs: list) -> int:
    """Return one plus the number of documents that contain ``word`` as a whole word.

    Parameters
    ----------
    word : str
        Word to look for.
    blobs : list
        Documents to search. For objects exposing a ``words`` attribute
        (such as TextBlob) membership is tested against that word list;
        any other container is tested with ``in`` directly.

    Returns
    -------
    int
        The number of matching documents plus 1 (the offset is kept from the
        original implementation, so the result is never zero).
    """
    # `word in blob` on a TextBlob is a substring test on the raw text, so
    # 'hat' would match a document containing only 'hath'. Checking the
    # tokenised word list counts whole words only.
    matches = sum(1 for blob in blobs if word in getattr(blob, 'words', blob))
    return 1 + matches

In [43]:
def idf(word: str, blobs: list):
    """Inverse document frequency of a word across a list of documents.

    Computed as ``1 + log(len(blobs) / (n_containing(word, blobs) + 1))``
    using the natural logarithm.
    """
    smoothed_doc_count = n_containing(word, blobs) + 1
    ratio = len(blobs) / smoothed_doc_count
    return 1 + math.log(ratio)

In [44]:
def tf_idf(word: str, df: pd.DataFrame, blobs: list = book_blobs) -> float:
    """TF-IDF score of a word: its term frequency in ``df`` times its inverse
    document frequency across ``blobs`` (the per-book blobs by default)."""
    term_frequency = tf(word, df)
    inverse_doc_frequency = idf(word, blobs)
    return term_frequency * inverse_doc_frequency

In [45]:
def find_top_tf_idf(df: pd.DataFrame, count: int = 20) -> pd.DataFrame:
    """Return the ``count`` words (columns) of a dataframe with the highest tf-idf.

    The frame is first reduced with ``unique_words_frame``; the result is
    indexed by word and has a single 'tf_idf' column in descending order.
    """
    words_only = unique_words_frame(df)
    scores = {}
    for word in words_only.columns:
        scores[word] = tf_idf(word, words_only)
    # Highest score first; ties keep their original column order.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    score_series = pd.Series(dict(ranked), name='tf_idf')
    return score_series.to_frame().head(count)

In [46]:
def in_both(df: pd.DataFrame, amount: int = 20):
    """Words that rank in the top ``amount`` by tf-idf and also in the top
    ``amount`` by raw count.

    Returns the matching rows of the tf-idf ranking.
    """
    top_by_tf_idf = find_top_tf_idf(df, amount)
    top_by_count = find_top_words(df, amount)
    shared = top_by_tf_idf.index.isin(top_by_count.index)
    return top_by_tf_idf[shared]


In [22]:
def only_in_count(df:pd.DataFrame, amount: int = 20) -> pd.DataFrame:
    """Words in the top ``amount`` by raw count that are not in the top
    ``amount`` by tf-idf.

    Parameters
    ----------
    df : pd.DataFrame
        Document-term matrix for one book.
    amount : int, default 20
        Size of both rankings being compared.

    Returns
    -------
    pd.DataFrame
        The rows of the count ranking whose word is absent from the tf-idf ranking.
    """
    # Pass `amount` through to both rankings; it was previously ignored, so
    # every call compared the default top 20 regardless of the argument.
    top_by_tf_idf = find_top_tf_idf(df, amount)
    top_by_count = find_top_words(df, amount)
    return top_by_count[~top_by_count.index.isin(top_by_tf_idf.index)]


In [23]:
def only_in_tf_idf(df:pd.DataFrame, amount: int = 20) -> pd.DataFrame:
    """Words in the top ``amount`` by tf-idf that are not in the top
    ``amount`` by raw count.

    Returns the rows of the tf-idf ranking whose word is absent from the
    count ranking.
    """
    top_by_tf_idf = find_top_tf_idf(df, amount)
    top_by_count = find_top_words(df, amount)
    also_top_count = top_by_tf_idf.index.isin(top_by_count.index[:amount])
    return top_by_tf_idf[~also_top_count]

In [24]:
def tf_idf_comparison(df_1: pd.DataFrame, df_2: pd.DataFrame, amount: int = 20) -> tuple:
    """Compare the top ``amount`` words by tf-idf of two dataframes.

    Parameters
    ----------
    df_1, df_2 : pd.DataFrame
        Document-term matrices to compare.
    amount : int, default 20
        Size of each ranking.

    Returns
    -------
    tuple
        Three dataframes:
            - first  = rows of the first ranking whose word is in both rankings
            - second = rows only in the first ranking
            - third  = rows only in the second ranking
    """
    # Both rankings are cut to `amount`. Previously they were always the
    # default top 20 and only the opposite side was sliced, so any amount
    # other than 20 compared rankings of different sizes.
    tf_1 = find_top_tf_idf(df_1, amount)
    tf_2 = find_top_tf_idf(df_2, amount)
    first_in_second = tf_1.index.isin(tf_2.index)
    second_in_first = tf_2.index.isin(tf_1.index)
    return (tf_1[first_in_second],
            tf_1[~first_in_second],
            tf_2[~second_in_first]
           )

In [25]:
def top_words_comparison(df_1: pd.DataFrame, df_2: pd.DataFrame, amount: int = 20) -> tuple:
    """Compare the top ``amount`` words by raw count of two dataframes.

    Parameters
    ----------
    df_1, df_2 : pd.DataFrame
        Document-term matrices to compare.
    amount : int, default 20
        Size of each ranking.

    Returns
    -------
    tuple
        Three dataframes:
            - first  = rows of the first ranking whose word is in both rankings
            - second = rows only in the first ranking
            - third  = rows only in the second ranking
    """
    # Both rankings are cut to `amount`. Previously they were always the
    # default top 20 and only the opposite side was sliced, so any amount
    # other than 20 compared rankings of different sizes.
    words_1 = find_top_words(df_1, amount)
    words_2 = find_top_words(df_2, amount)
    first_in_second = words_1.index.isin(words_2.index)
    second_in_first = words_2.index.isin(words_1.index)
    return (words_1[first_in_second],
            words_1[~first_in_second],
            words_2[~second_in_first]
           )