In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sweetviz as sv
import seaborn as sns
from textblob import TextBlob
import math
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\book2\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\book2\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [27]:
test_data = pd.read_csv('Datasets\Religious_text_test.csv')
test_data = test_data.rename(columns={'# foolishness': 'foolishness'})
test_data.head(5)

Unnamed: 0,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
train_data = pd.read_csv('Datasets\Religious_text_train.csv')
train_data = train_data.rename(columns={'Unnamed: 0': 'Chapters'})
train_data.head(5)

Unnamed: 0,Chapters,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
0,Buddhism_Ch1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Buddhism_Ch2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Buddhism_Ch3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Buddhism_Ch4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Buddhism_Ch5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Training Data Split By Book (with Columns Maintained)

In [29]:
buddhism = train_data[train_data['Chapters'].str.contains('Buddhism')]
tao_te_ching = train_data[train_data['Chapters'].str.contains('TaoTeChing')]
upanishad = train_data[train_data['Chapters'].str.contains('Upanishad')]
yoga_sutra = train_data[train_data['Chapters'].str.contains('YogaSutra')]
book_proverb = train_data[train_data['Chapters'].str.contains('Proverb')]
book_of_ecclesiastes = train_data[train_data['Chapters'].str.contains('Ecclesiastes')]
book_of_eccleasiasticus = train_data[train_data['Chapters'].str.contains('Eccleasiasticus')]
book_of_wisdom = train_data[train_data['Chapters'].str.contains('Wisdom')]


In [30]:
def find_top_words(df:pd.DataFrame, count: int) -> pd.DataFrame:
    """ Given data frame find the {count} most common words (column values)"""
    df = df.sum(axis=0).drop('Chapters').squeeze()
    df = pd.DataFrame({'count': df})
    return df.sort_values(by='count', ascending=False).head(count)

#### Training Data Split By Book Aggregated

In [31]:
book_names = ['Buddhism', 'TaoTeChing', 'Upanishad', 'YogaSutra', 'Proverb', 'Ecclesiastes', 'Eccleasiasticus', 'Wisdom']
book_dfs = [buddhism,tao_te_ching, upanishad, yoga_sutra, book_proverb, book_of_ecclesiastes, book_of_eccleasiasticus, book_of_wisdom]
dataframes = dict(zip(book_names, book_dfs))

In [32]:
pat = '|'.join(book_names)
by_book = train_data['Chapters'].str.extract('(' + pat + ')', expand = False)
books = train_data.groupby(by_book).sum().reset_index().rename(columns={'Chapters':'Books'}).set_index('Books')
books.head(5)

Unnamed: 0_level_0,foolishness,hath,wholesome,takest,feelings,anger,vaivaswata,matrix,kindled,convict,...,erred,thinkest,modern,reigned,sparingly,visual,thoughts,illumines,attire,explains
Books,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Buddhism,0,0,0,0,19,0,0,0,0,0,...,0,0,0,0,0,0,9,0,0,0
Eccleasiasticus,0,189,3,1,0,14,0,0,3,0,...,0,0,0,1,1,0,7,0,2,0
Ecclesiastes,0,46,0,0,0,5,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
Proverb,2,65,0,0,0,11,0,0,0,0,...,0,0,0,0,0,0,8,0,1,0
TaoTeChing,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
def get_text_blob(df:pd.DataFrame) -> TextBlob:
    """ Given a dataframe, creates a text blob of its columns"""
    df = df.loc[:, (df != 0).any(axis=0)]
    return TextBlob(" ".join(df.columns))

In [34]:
def get_word_pos_pairs(df:pd.DataFrame) -> list:
    """ Given dataframe, returns all words (columns) and associated part of speech"""
    #all_words = TextBlob(" ".join(df.columns))
    return get_text_blob(df).tags
   # return all_words.tags

In [35]:
def get_pos_tag_list(df:pd.DataFrame) -> list:
    """ Returns list of all unique parts of speech found in all words (columns) of a dataframe"""
    word_tags = get_word_pos_pairs(df)
    return list(set([tag[1] for tag in word_tags]))

In [36]:
def get_data_for_pos(df:pd.DataFrame, pos:str) -> pd.DataFrame:
    """Given a dataframe and part of speech, subsets dataframe to only include columns matching the pos"""
    word_tags = get_word_pos_pairs(df)
    chosen_pos = [tag[0] for tag in word_tags if tag[1] == pos]
    return df.loc[:,df.columns.isin(chosen_pos)]

In [37]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\book2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [38]:
get_data_for_pos(books, 'VB') # Example run retreiving all basic verbs

Unnamed: 0_level_0,build,coward,handle,think,keep,inculcate,disclose,feed,balance,rehearse,...,influence,gaineth,tone,tank,dathan,impose,realms,try,godlike,roar
Books,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Buddhism,0,0,0,6,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Eccleasiasticus,3,1,0,7,27,0,5,2,3,1,...,0,0,0,0,1,0,0,4,0,1
Ecclesiastes,1,0,0,1,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Proverb,1,0,0,5,24,0,0,0,2,0,...,0,1,0,0,0,0,0,0,0,0
TaoTeChing,0,0,0,4,16,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
Upanishad,0,0,1,11,1,0,0,0,0,0,...,1,0,0,0,0,0,5,2,1,0
Wisdom,1,0,1,4,5,0,0,1,1,0,...,0,0,0,0,0,0,0,2,0,0
YogaSutra,5,0,2,7,4,0,0,0,1,0,...,2,0,1,1,0,1,4,4,0,0


### Crude TF-IDF Analysis

In [39]:
# Generate text blobs for each book (text blob being a string of all present words in book)
book_blobs = [get_text_blob(book) for book in dataframes.values()]

In [40]:
def tf(word:str, df: pd.DataFrame) -> float:
    """ Generate term refrequency of a word in a dataframe's columns"""
    blob = get_text_blob(df)
    return df.get('spiritual').sum() /len(blob.words)

In [41]:
def n_containing(word: str, blobs: list) -> int:
    """Return number of books in list containing specific word"""
    return sum(1 for blob in blobs if word in blob)

In [42]:
def idf(word: str, blobs: list):
    """ Calculate inverse doc frequency of word in list of documents"""
    x = n_containing(word, blobs)
    return math.log(len(blobs) / (x if x else 1))

In [43]:
def tf_idf(word: str, df: pd.DataFrame, blobs = book_blobs) -> float:
    return tf(word, df) * idf(word, blobs)

In [44]:
tf_idf('shall', yoga_sutra)

0.012202076694290822

 Possible follow up: Naive Baise or Random Forest 
 Source: https://medium.com/@tenzin_ngodup/simple-text-classification-using-random-forest-fe230be1e857 

In [103]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets 


In [129]:
def get_string_lst(df):
    return df.multiply(df.columns+' ', axis=1).sum(axis=1)

In [None]:
def rando_forest(df): 
    corpus = df.groupby(by_book).sum().reset_index().rename(columns={'Chapters':'Books'}).set_index('Books')
    Y = df.columns 

In [138]:
corpus = get_string_lst(books)
Y = books.T.columns
corpus

Books
Buddhism           feelings feelings feelings feelings feelings f...
Eccleasiasticus    hath hath hath hath hath hath hath hath hath h...
Ecclesiastes       hath hath hath hath hath hath hath hath hath h...
Proverb            foolishness foolishness hath hath hath hath ha...
TaoTeChing         anger open open rage nursingmother tell neithe...
Upanishad          anger anger anger vaivaswata kindled rushed ru...
Wisdom             hath hath hath hath hath hath hath hath hath h...
YogaSutra          hath hath wholesome matrix revelation revelati...
dtype: object

In [143]:
Y = np.arange(len(Y))

In [144]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
X

array([[0, 2, 5, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 1, 2],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [145]:
clf = RandomForestClassifier()
clf.fit(X, Y)

RandomForestClassifier()

In [146]:
test = get_string_lst(test_data)
test


0      open mentally mentally i affliction affliction...
1      staying origination origination disenchantment...
2      tell thus thus play play play play play short ...
3      illumination origination origination originati...
4      stressfulness stressfulness stressfulness stre...
                             ...                        
585    hath hath neither neither soft handle short be...
586    rage neither neither land land remedy came cam...
587    neither neither great great overtaken yielding...
588    diadem neither land guest great came fears ask...
589    lambs neither land land land great great came ...
Length: 590, dtype: object

In [147]:
clf.predict(vectorizer.transform(test.iloc[:10]).toarray())

array([0, 0, 0, 2, 2, 0, 2, 2, 0, 2])