In [28]:
import pandas as pd
import numpy as np
from tokenizer_xm import *
from nltk import pos_tag
import re
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# Module-level NLTK helpers: an English Snowball stemmer and a WordNet lemmatizer.
# NOTE(review): neither name is referenced by the cells visible in this notebook —
# confirm whether they are still needed before removing them.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [2]:
# Load the review data set; the file is decoded as ISO-8859-1 instead of the default UTF-8.
# NOTE(review): the file name is spelled "unqiue" — presumably it matches the file on disk; verify.
bb = pd.read_csv("data/unqiue_bb_data.csv",encoding = "ISO-8859-1")

In [22]:
class get_noun_adj:
    """
    Get the nouns and adjectives used in a collection of text.

    ---Parameter
    text: collection of str, the raw documents to analyse
    stopwords: optional collection of str, tokens dropped after POS tagging
               (defaults to no stopwords)

    ---return
    Call ``get_Nouns_Adjs`` to obtain an object that contains:
    1. all_noun: a pandas dataframe with the most frequent nouns
       (columns ``Terms`` and ``Document_Frequency``)
    2. all_adj: a pandas dataframe with the most frequent adjectives
       (columns ``Terms`` and ``Document_Frequency``)
    3. tagged_terms: a pandas series holding, per document, the (token, tag)
       tuples that remain after stopword removal
    """

    # POS tags (as returned by nltk.pos_tag) that are treated as nouns / adjectives.
    _NOUN_TAGS = ('NN', 'NNS', 'NNP', 'NNPS')
    _ADJ_TAGS = ('JJ', 'JJS', 'JJR')

    def __init__(self, text, stopwords=None):
        self.text = text
        # Use a fresh list per instance instead of one shared mutable default object.
        self.stopwords = [] if stopwords is None else stopwords

    def flatten(self, listOfLists):
        "Flatten one level of nesting"
        from itertools import chain
        return list(chain.from_iterable(listOfLists))

    def _document_frequency_table(self, tagged_docs, tags, show_top_n):
        """
        Build the Terms / Document_Frequency table for one group of POS tags.

        ---Parameter
        tagged_docs: iterable of documents, each a list of (token, tag) tuples
        tags: collection of str, the POS tags that belong to the group
        show_top_n: int, number of most frequent terms to keep

        ---return
        A pandas dataframe with the columns ``Terms`` and ``Document_Frequency``,
        sorted by descending frequency and truncated to ``show_top_n`` rows.
        """
        # One entry per (document, term). Using a set per document means a term is
        # counted once even when it carries two tags of the same group in that
        # document (e.g. NN and NNP), which used to inflate the document frequency.
        terms_per_doc = [{term for term, tag in doc if tag in tags} for doc in tagged_docs]
        raw_terms = pd.Series(self.flatten(terms_per_doc))

        # Lemmatize to merge terms like rashes, babies to rash, baby.
        # NOTE: two different surface forms from the same document that only become
        # equal in this second pass are still counted twice.
        lemmatized_terms = self.flatten(list(
            text_tokenizer_xm(text=raw_terms, stem_flag=False,
                              lemma_flag=True, stopwords=[]).txt_pre_pros_all()))

        counts = pd.DataFrame({'Terms': lemmatized_terms}).groupby('Terms').size()
        table = pd.DataFrame(counts.sort_values(ascending=False)).reset_index().head(show_top_n)
        table.columns = ['Terms', 'Document_Frequency']
        return table

    def get_Nouns_Adjs(self, show_top_n = 15):
        """
        Tokenize and POS-tag every document, then count in how many documents each
        noun and each adjective appears.

        ---Parameter
        show_top_n: int, Show top n popular adjs and nouns

        ---return
        A class named ``output`` with the attributes ``all_adj``, ``all_noun``
        (pandas dataframes) and ``tagged_terms`` (pandas series of tagged documents).
        """
        # simple_process the documents
        simple_processed_doc = text_tokenizer_xm(text=self.text, lemma_flag=True, stem_flag=False,
                                                 stopwords=[]).txt_pre_pros_all()

        # Tag each word within a list of tokenized documents
        tagged_doc = pd.Series(simple_processed_doc).apply(pos_tag)

        # Remove the stopwords; a set makes each membership test O(1)
        stopword_set = set(self.stopwords)
        tagged_doc_cleaned = tagged_doc.apply(
            lambda doc: [tup for tup in doc if tup[0] not in stopword_set])

        # Construct the Noun Table and the Adj Table
        all_noun_agg = self._document_frequency_table(tagged_doc_cleaned, self._NOUN_TAGS, show_top_n)
        all_adj_agg = self._document_frequency_table(tagged_doc_cleaned, self._ADJ_TAGS, show_top_n)

        # The result is exposed through class attributes so callers can keep using
        # res.all_adj / res.all_noun / res.tagged_terms.
        class output:
            all_adj  = all_adj_agg
            all_noun = all_noun_agg
            tagged_terms = tagged_doc_cleaned

        return output

## testing

Interesting background reading on the "method takes 1 positional argument but 2 were given" TypeError: https://stackoverflow.com/questions/23944657/typeerror-method-takes-1-positional-argument-but-2-were-given

In [23]:
# Cast the review_text column to str and keep each distinct review once.
text = list(bb['review_text'].astype(str).unique())

In [24]:
# Build the extractor; no stopwords are passed, so no tagged tokens get filtered out.
na = get_noun_adj(text = text)

In [25]:
# Run the extraction, keeping the 15 most frequent nouns and adjectives.
# The tables are available as res.all_noun / res.all_adj, the tagged documents as res.tagged_terms.
res = na.get_Nouns_Adjs(show_top_n=15)

How about simply collecting the terms and the processed documents, and then using keras to get the TF-IDF, TF and DF scores?

This doesn't work. The process is way too tedious just for a tfidf calculation.

Also, since keras' TF-IDF vectorizer is implemented separately, the two programs sometimes "disagree". For example, we found the top 20 adjectives with the original program, but some of those 20 adjectives are not found by the keras TF-IDF vectorizer, which could cause a lot of confusion.

Useful link to help recall the definition of TF-IDF:
http://www.tfidf.com/

An interesting note: term frequency is not the same as the term count. Term frequency, in the TF-IDF setting, is the number of times a term appears **in each document**, and it is often normalized by the total number of terms in that document. Term count, which is merely the total number of occurrences of a term across all documents, can be understood as the sum of the (unnormalized) TF values over the documents.

Below is a trial I did to add an accurate, find_match-based calculation of IDF, TF and DF. However, it significantly slows down the computation, and the find_match function seems to be very sensitive to the quality of the text. I therefore removed the chunk and saved it here in case we need it in the future.

In [None]:
# ARCHIVED FRAGMENT — not runnable as a standalone cell.
# It reads `self.text`, `all_adj_agg` and `all_noun_agg` and writes `self.adj_df_idx` /
# `self.n_df_idx`, so it only makes sense inside get_noun_adj.get_Nouns_Adjs, after the
# two aggregated tables have been built. `find_match` is not defined in this notebook;
# presumably it comes from the `tokenizer_xm` star import — verify.

# Per-term accumulators for the adjectives (adj_*) and the nouns (n_*):
#   *_df          number of documents in which the term has a non-zero count
#   *_df_idx      per-document boolean flags (True where the count is non-zero)
#   *_term_count  sum of the per-document counts
adj_df = []
adj_df_idx = []
adj_term_count = []
n_df = []
n_df_idx = []
n_term_count = []

# Loop through the adjs
for term in all_adj_agg['Terms']:
    # One find_match result per document.
    # NOTE(review): assumes unigram_return_count=True makes find_match return a numeric count — confirm.
    tf = np.array([find_match(term,document,lemma = True, stem = False,unigram_return_count = True)\
                   for document in self.text])
    # Calculate the document frequency: the number of documents with a non-zero count
    df = sum(tf != 0)
    adj_df.append(df)
    adj_term_count.append(sum(tf))
    adj_df_idx.append(list(tf != 0))

# Loop through the nouns
for term in all_noun_agg['Terms']:
    # Same per-document matching as for the adjectives above.
    tf = np.array([find_match(term,document,lemma = True, stem = False,unigram_return_count = True)\
                   for document in self.text])
    # Calculate the document frequency: the number of documents with a non-zero count
    df = sum(tf != 0)
    n_df.append(df)
    n_term_count.append(sum(tf))
    n_df_idx.append(list(tf != 0))

# Create columns: overwrite Document_Frequency with the recomputed values and add the totals
all_noun_agg['Document_Frequency'] = n_df
all_noun_agg['Total_Occurance'] = n_term_count

all_adj_agg['Document_Frequency'] = adj_df
all_adj_agg['Total_Occurance'] = adj_term_count

# Keep the per-term document flags on the instance
self.adj_df_idx = adj_df_idx
self.n_df_idx = n_df_idx
