In [1]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
from multiprocessing import Pool

In [2]:
# Code as 10 broad categories.
# Keys are broad-category labels; values list the one-letter NTEE codes that belong to each.
broad_cat_dict={'I': ['A'],
                  'II': ['B'],
                  'III': ['C', 'D'],
                  'IV': ['E', 'F', 'G', 'H'],
                  'V': ['I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'],
                  'VI': ['Q'],
                  'VII': ['R', 'S', 'T', 'U', 'V', 'W'],
                  'VIII': ['X'],
                  'IX': ['Y'],
                  'X': ['Z'],
                 }
def ntee2cat(string):
    """Return the broad-category label whose code list contains ``string``.

    Parameters
    ----------
    string : str
        One-letter NTEE code, e.g. ``'A'``. Matching is exact list membership
        against the values of ``broad_cat_dict`` (no case folding, no prefixes).

    Returns
    -------
    str
        The first key of ``broad_cat_dict`` (in dict order) that lists ``string``.

    Raises
    ------
    IndexError
        If ``string`` is not listed under any broad category. The exception type
        is kept from the original implementation; the message now names the
        offending value instead of "list index out of range".
    """
    # Reading a module-level name needs no ``global`` declaration.
    for broad_cat, ntee_codes in broad_cat_dict.items():
        if string in ntee_codes:
            return broad_cat
    raise IndexError(
        'NTEE code {!r} does not belong to any broad category in broad_cat_dict'.format(string)
    )

In [3]:
# Directory holding the gzip-compressed pickle shards of the training set.
train_file_path='../../dataset/df_ntee_universal/train/'
# os.listdir returns entries in arbitrary order; sort so the row order of
# df_train is the same on every run.
file_list=sorted(os.listdir(train_file_path))
# NOTE(security): unpickling can execute arbitrary code -- only load shards from a trusted source.
# Read every shard first and concatenate once; concatenating inside the loop
# re-copies all previously loaded rows on each iteration.
train_shards=[pd.read_pickle(os.path.join(train_file_path, file_name), compression='gzip')
              for file_name in file_list]
# pd.concat raises on an empty list, so fall back to an empty frame when the directory has no files.
df_train=pd.concat(train_shards) if train_shards else pd.DataFrame()
len(df_train)

154424

In [4]:
# Derive the modelling columns:
#   - mission_prgrm:          mission text + program description, joined by a space.
#   - mission_prgrm_spellchk: same, built from the spell-checked columns.
#   - broad_cat:              broad category looked up from the NTEE1 letter.
df_train=df_train.assign(
    mission_prgrm=df_train['mission']+' '+df_train['prgrm_dsc'],
    mission_prgrm_spellchk=df_train['mission_spellchk']+' '+df_train['prgrm_dsc_spellchk'],
    broad_cat=df_train['NTEE1'].apply(ntee2cat),
)
# Row count, number of distinct NTEE1 codes, number of distinct broad categories.
(
    len(df_train['mission_prgrm_spellchk']),
    df_train['NTEE1'].nunique(dropna=False),
    df_train['broad_cat'].nunique(dropna=False),
)

(154424, 25, 9)

In [5]:
# Check if the sampling criteria can be satisfied.
MIN_TRAIN_RECORDS_PER_CAT=500 # Make sure each category in training dataset has at least 500 records.
MAX_SPLIT_ATTEMPTS=100 # Give up instead of looping forever when the criterion cannot be met.
SPLIT_SEED=42 # Fixed seed so the same split is produced after a kernel restart.

small_num=0
for split_attempt in range(MAX_SPLIT_ATTEMPTS):
    # A different but deterministic seed per attempt: retries draw new splits, reruns repeat them.
    trainDF, valDF = model_selection.train_test_split(df_train, test_size=.2,
                                                      random_state=SPLIT_SEED+split_attempt)
    # Size of the smallest category in the training split (non-null EIN count per category).
    small_num=trainDF.groupby('broad_cat')['EIN'].count().min()
    if small_num>=MIN_TRAIN_RECORDS_PER_CAT:
        break
else:
    raise RuntimeError(
        'No train/validation split with at least {} training records per broad category '
        'was found in {} attempts (smallest category in last attempt: {}).'.format(
            MIN_TRAIN_RECORDS_PER_CAT, MAX_SPLIT_ATTEMPTS, small_num)
    )

# See the composition by broad category.
print(trainDF.groupby('broad_cat')['EIN'].count(), '\n'*2, valDF.groupby('broad_cat')['EIN'].count())

broad_cat
I       13600
II      20743
III      6113
IV      13486
IX       5272
V       37344
VI       1590
VII     21768
VIII     3623
Name: EIN, dtype: int64 

 broad_cat
I       3410
II      5084
III     1449
IV      3350
IX      1368
V       9390
VI       397
VII     5494
VIII     943
Name: EIN, dtype: int64


### Parallel computing.

In [6]:
def func_naive_bayes(param_list):
    """Fit one text-classification configuration and score it on the validation split.

    Despite the name, the classifier is supplied by the caller, so any estimator
    exposing ``fit`` and ``predict`` can be evaluated with this function.
    Training and validation data are read from the module-level ``trainDF`` and
    ``valDF`` frames; the target column is ``'broad_cat'``.

    Parameters
    ----------
    param_list : sequence of 5 items
        ``[input_text, classifier, tokenizer, vect_type, average_mtd]``:

        - ``input_text``: name of the text column to vectorize.
        - ``classifier``: estimator instance; it is fitted in place.
        - ``tokenizer``: ``'lemma'`` (POS-aware WordNet lemmatizer), ``'porter'``
          (Porter stemmer), or any non-string value, which is handed to the
          vectorizer unchanged.
        - ``vect_type``: ``'count'`` (token counts) or ``'tfidf'``.
        - ``average_mtd``: forwarded as ``average`` to precision/recall/F1.

    Returns
    -------
    dict
        Keys ``input_text``, ``classifier`` (``str`` of the estimator),
        ``tokenizer`` (tokenizer function name), ``vect_type``, ``average_mtd``,
        ``accuracy``, ``precision``, ``recall`` and ``f1``.

    Raises
    ------
    ValueError
        If ``tokenizer`` is a string other than ``'lemma'``/``'porter'`` or
        ``vect_type`` is not ``'count'``/``'tfidf'``. Checked before any
        expensive work is started.
    """
    input_text, classifier, tokenizer, vect_type, average_mtd = param_list

    ##########################################################
    ################ Define tokenizer ################
    # Build the stemmer/lemmatizer once per call rather than once per token.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    def porter_tokenizer(str_input): # '''Pay attention to the input: this is string input, not token!'''
        return [stemmer.stem(token) for token in word_tokenize(str_input)]

    # Lemmatize using POS tags, assume to improve accuracy.
    # Ref:
    #   - https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
    #   - https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    def get_wordnet_pos(treebank_tag):
        # Map a Penn Treebank tag to a WordNet POS constant; anything unrecognised is treated as a noun.
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        if treebank_tag.startswith('V'):
            return wordnet.VERB
        if treebank_tag.startswith('R'):
            return wordnet.ADV
        return wordnet.NOUN

    def lemma_tokenizer(str_input): # '''Pay attention to the input: this is string input, not token!'''
        tagged_tokens = nltk.pos_tag(word_tokenize(str_input))
        return [lemmatizer.lemmatize(word=word, pos=get_wordnet_pos(pos)) for word, pos in tagged_tokens]

    named_tokenizers = {'lemma': lemma_tokenizer, 'porter': porter_tokenizer}
    if isinstance(tokenizer, str):
        if tokenizer not in named_tokenizers:
            raise ValueError("tokenizer must be 'lemma', 'porter' or a callable, got {!r}".format(tokenizer))
        tokenizer = named_tokenizers[tokenizer]
    ################ Define tokenizer ################
    ##########################################################

    # Both vectorizers take the same arguments, so only the class differs.
    vectorizer_classes = {'count': CountVectorizer, 'tfidf': TfidfVectorizer}
    if vect_type not in vectorizer_classes:
        raise ValueError("vect_type must be 'count' or 'tfidf', got {!r}".format(vect_type))

    ##########################################################
    ################ Prepare dataframe for ML ################
    # Build training and validation inputs/targets from the module-level split.
    x_train = trainDF[input_text]
    y_train = trainDF['broad_cat']
    x_valid = valDF[input_text]
    y_valid = valDF['broad_cat']
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Tokens are stemmed (Porter) or lemmatized (WordNet) by the selected tokenizer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see:
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    # NOTE(review): the built-in English stop-word list is not stemmed/lemmatized, so it may
    # not line up with the custom tokenizer's output -- confirm this is intended.
    vectorizer = vectorizer_classes[vect_type](stop_words='english',
                                               tokenizer=tokenizer,
                                               analyzer='word'
                                              )
    # Learn the vocabulary and encode the training text in one pass (tokenization is the
    # expensive step, so avoid running it twice), then encode the validation text.
    x_train_vect = vectorizer.fit_transform(x_train)
    x_valid_vect = vectorizer.transform(x_valid)
    ######### Text Vectorization and Transformation ##########
    ##########################################################

    classifier.fit(x_train_vect, y_train)
    predictions = classifier.predict(x_valid_vect)
    return {'input_text':input_text,
            'classifier':str(classifier),
            # Fall back to repr for tokenizers without a __name__ (e.g. None).
            'tokenizer':getattr(tokenizer, '__name__', repr(tokenizer)),
            'vect_type':vect_type,
            'average_mtd':average_mtd,
            'accuracy':metrics.accuracy_score(y_true=y_valid, y_pred=predictions),
            'precision':metrics.precision_score(y_pred=predictions, y_true=y_valid, average=average_mtd),
            'recall':metrics.recall_score(y_pred=predictions, y_true=y_valid, average=average_mtd),
            'f1':metrics.f1_score(y_pred=predictions, y_true=y_valid, average=average_mtd)
           }

### Naive Bayes

In [7]:
# Generate a list of parameters.
# Full grid: 6 text columns x 2 Naive Bayes variants x 2 tokenizers x 2 vectorizers x 2 averaging methods.
# Each entry is [input_text, classifier, tokenizer, vect_type, average_mtd]; a fresh pair of
# classifier instances is created for every text column.
param_llist=[
    [input_text, classifier, tokenizer, vect_type, average_mtd]
    for input_text in ['mission', 'prgrm_dsc', 'mission_prgrm', 'mission_spellchk', 'prgrm_dsc_spellchk', 'mission_prgrm_spellchk']
    for classifier in [naive_bayes.MultinomialNB(), naive_bayes.ComplementNB()]
    for tokenizer in ['lemma', 'porter']
    for vect_type in ['count', 'tfidf']
    for average_mtd in ['macro', 'weighted']
]

In [8]:
# Evaluate every Naive Bayes configuration in parallel, one result dict per configuration.
# The context manager shuts the 24 worker processes down once map() has returned,
# so they do not linger for the rest of the kernel session.
with Pool(24) as p:
    df_performance_nb=pd.DataFrame(p.map(func_naive_bayes, param_llist))

In [15]:
# Five best Naive Bayes configurations by validation accuracy.
df_performance_nb.sort_values(by='accuracy', ascending=False).head(5)

Unnamed: 0,accuracy,average_mtd,classifier,f1,input_text,precision,recall,tokenizer,vect_type
91,0.778145,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.773984,mission_prgrm_spellchk,0.774852,0.778145,lemma_tokenizer,tfidf
90,0.778145,macro,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.699526,mission_prgrm_spellchk,0.74274,0.685534,lemma_tokenizer,tfidf
42,0.777497,macro,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.695233,mission_prgrm,0.750631,0.676116,lemma_tokenizer,tfidf
43,0.777497,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.772665,mission_prgrm,0.775101,0.777497,lemma_tokenizer,tfidf
95,0.775522,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.771589,mission_prgrm_spellchk,0.772726,0.775522,porter_tokenizer,tfidf


```Python
# Manually check if acc is correct.
In :func_naive_bayes(2)
    df_performance['accuracy']
Out:0    0.340417
    Name: accuracy, dtype: float64
In :t=pd.DataFrame([classifier.predict(x_valid_vect), y_valid]).T.rename(columns={0:'a', 1:'b'})
    len(t[t.a==t.b])/len(t)
Out:0.34041666666666665
''' Looks correct, scale the computing '''
```

### Random forest.

In [16]:
# Generate a list of parameters.
# Full grid: 6 text columns x 1 random forest x 2 tokenizers x 2 vectorizers x 2 averaging methods.
# Each entry is [input_text, classifier, tokenizer, vect_type, average_mtd]; a fresh
# classifier instance is created for every text column.
param_llist=[
    [input_text, classifier, tokenizer, vect_type, average_mtd]
    for input_text in ['mission', 'prgrm_dsc', 'mission_prgrm', 'mission_spellchk', 'prgrm_dsc_spellchk', 'mission_prgrm_spellchk']
    for classifier in [ensemble.RandomForestClassifier()]
    for tokenizer in ['lemma', 'porter']
    for vect_type in ['count', 'tfidf']
    for average_mtd in ['macro', 'weighted']
]

In [17]:
# Evaluate every random-forest configuration in parallel, one result dict per configuration.
# The context manager shuts the 24 worker processes down once map() has returned,
# so they do not pile up on top of workers from earlier cells.
with Pool(24) as p:
    df_performance_rf=pd.DataFrame(p.map(func_naive_bayes, param_llist))

In [29]:
# Stack the random-forest and Naive Bayes results (random forest first, index renumbered),
# then rank all configurations by validation accuracy, best first.
df_performance=pd.concat([df_performance_rf, df_performance_nb], ignore_index=True)
df_performance=df_performance.sort_values(by='accuracy', ascending=False)
# Ten best configurations overall.
df_performance.head(10)

Unnamed: 0,accuracy,average_mtd,classifier,f1,input_text,precision,recall,tokenizer,vect_type
139,0.778145,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.773984,mission_prgrm_spellchk,0.774852,0.778145,lemma_tokenizer,tfidf
138,0.778145,macro,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.699526,mission_prgrm_spellchk,0.74274,0.685534,lemma_tokenizer,tfidf
90,0.777497,macro,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.695233,mission_prgrm,0.750631,0.676116,lemma_tokenizer,tfidf
91,0.777497,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.772665,mission_prgrm,0.775101,0.777497,lemma_tokenizer,tfidf
143,0.775522,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.771589,mission_prgrm_spellchk,0.772726,0.775522,porter_tokenizer,tfidf
142,0.775522,macro,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.69987,mission_prgrm_spellchk,0.744213,0.684626,porter_tokenizer,tfidf
94,0.773709,macro,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.692758,mission_prgrm,0.746558,0.67312,porter_tokenizer,tfidf
95,0.773709,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.769014,mission_prgrm,0.771355,0.773709,porter_tokenizer,tfidf
88,0.768626,macro,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.691887,mission_prgrm,0.729649,0.682032,lemma_tokenizer,count
89,0.768626,weighted,"ComplementNB(alpha=1.0, class_prior=None, fit_...",0.76482,mission_prgrm,0.766744,0.768626,lemma_tokenizer,count


### Test on Universal Testing Dataset