In [1]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.corpus import wordnet
import nltk
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
from multiprocessing import Pool

In [2]:
train_file_path='../../dataset/df_ntee_universal/train/'
file_list=os.listdir(train_file_path)
df_train=pd.DataFrame()
for file in file_list:
    df_train=pd.concat([df_train, pd.read_pickle(train_file_path+file, compression='gzip')])
len(df_train)

154424

In [3]:
df_train['mission_prgrm']=df_train['mission']+' '+df_train['prgrm_dsc']
df_train['mission_prgrm_spellchk']=df_train['mission_spellchk']+' '+df_train['prgrm_dsc_spellchk'] # Using spell-checked.
len(df_train['mission_prgrm_spellchk']), len(df_train['NTEE1'].drop_duplicates())

(154424, 25)

In [4]:
# Check if the sampling criteria can be satisfied.
small_num=0
while small_num<100: # Make sure each category in training dataset has at least 100 records.
    trainDF, valDF = model_selection.train_test_split(df_train, test_size=.2)
    small_num=trainDF.groupby('NTEE1').count().sort_values('EIN').iloc[0]['EIN']

# See the composition by broad category.
print(trainDF.groupby('NTEE1').count()['EIN'], '\n'*2, valDF.groupby('NTEE1').count()['EIN'])

NTEE1
A    13657
B    20728
C     2655
D     3379
E     7152
F     1832
G     4010
H      364
I     2372
J     3841
K     1610
L     4763
M     3704
N    12266
O     1377
P     7292
Q     1592
R      858
S    11659
T     1645
U      795
V      282
W     6687
X     3650
Y     5369
Name: EIN, dtype: int64 

 NTEE1
A    3353
B    5099
C     668
D     860
E    1863
F     469
G    1043
H     103
I     575
J     931
K     399
L    1179
M     989
N    3194
O     354
P    1888
Q     395
R     206
S    2800
T     387
U     205
V      68
W    1670
X     916
Y    1271
Name: EIN, dtype: int64


### Prepare parrallel envionment.

In [5]:
def func_naive_bayes(param_list):
    global trainDF, valDF
    input_text, classifier, tokenizer, vect_type, average_mtd = param_list

    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    # Build training and testing data frame.
    x_train=trainDF[input_text]
    y_train=trainDF['NTEE1']
    x_valid=valDF[input_text]
    y_valid=valDF['NTEE1']
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ################ Define tokenizer ################

    def porter_tokenizer(str_input): # '''Pay attention to the input: this is string input, not token!'''
        tokens = word_tokenize(str_input)
        return [PorterStemmer().stem(token) for token in tokens]
    
    # Lemmatize using POS tags, assume to improve accuracy.
    # Ref: 
    #   - https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
    #   - https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def lemma_tokenizer(str_input): # '''Pay attention to the input: this is string input, not token!'''
        tokens=word_tokenize(str_input)
        return [WordNetLemmatizer().lemmatize(word=word, pos=get_wordnet_pos(pos)) for word, pos in nltk.pos_tag(tokens)]
            
    if tokenizer=='lemma':
        tokenizer=lemma_tokenizer
    elif tokenizer=='porter':
        tokenizer=porter_tokenizer
    ################ Define tokenizer ################
    ##########################################################
    
    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use Porter Stemmer.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    
    if vect_type=='count':
        ##### Token counts #####
        # create the transform
        vectorizer = CountVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab.
        vectorizer.fit(x_train)
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)
    elif vect_type=='tfidf':
        ##### TF-IDF #####
        # create the transform
        vectorizer = TfidfVectorizer(stop_words='english', 
                                     tokenizer=tokenizer, 
                                     analyzer='word'
                                    )
        # tokenize and build vocab
        vectorizer.fit(x_train)
        # Encode document: transform the training and validation data using count vectorizer object
        x_train_vect =  vectorizer.transform(x_train)
        x_valid_vect =  vectorizer.transform(x_valid)

    ######### Text Vectorization and Transformation ##########
    ##########################################################
    
    classifier.fit(x_train_vect, y_train)
    predictions = classifier.predict(x_valid_vect)
    return {'input_text':input_text,
            'classifier':str(classifier), 
            'tokenizer':tokenizer.__name__, 
            'vect_type':vect_type, 
            'average_mtd':average_mtd,
            'accuracy':metrics.accuracy_score(predictions, y_valid), 
            'precision':metrics.precision_score(y_pred=predictions, y_true=y_valid, average=average_mtd),
            'recall':metrics.recall_score(y_pred=predictions, y_true=y_valid, average=average_mtd),
            'f1':metrics.f1_score(y_pred=predictions, y_true=y_valid, average=average_mtd)
           }

In [6]:
# Generate a list of parameters.
param_llist=[]
for input_text in ['mission', 'prgrm_dsc', 'mission_prgrm', 'mission_spellchk', 'prgrm_dsc_spellchk', 'mission_prgrm_spellchk']:
    for classifier in [naive_bayes.MultinomialNB(), naive_bayes.ComplementNB()]:
        for tokenizer in ['lemma', 'porter']:
            for vect_type in ['count', 'tfidf']:
                for average_mtd in ['macro', 'weighted']:
                    param_llist+=[[input_text, classifier, tokenizer, vect_type, average_mtd]]

In [None]:
p=Pool(24)
df_performance=pd.DataFrame(p.map(func_naive_bayes, param_llist))

In [10]:
df_performance[df_performance.average_mtd=='macro'].groupby(['classifier', 'tokenizer', 'vect_type']).describe()[['accuracy','f1']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,f1,f1,f1,f1,f1,f1,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
classifier,tokenizer,vect_type,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",lemma_tokenizer,count,30.0,0.241269,0.002266,0.235833,0.240069,0.241569,0.242604,0.244778,30.0,0.093399,0.002126,0.089127,0.092017,0.093258,0.095179,0.097384
"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",lemma_tokenizer,tfidf,30.0,0.238564,0.002922,0.232667,0.236396,0.237958,0.240625,0.245556,30.0,0.108314,0.001699,0.104967,0.107377,0.10808,0.109486,0.112052
"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",porter_tokenizer,count,30.0,0.240762,0.002503,0.235306,0.239028,0.240431,0.242806,0.245556,30.0,0.092774,0.0023,0.087243,0.091331,0.093029,0.094016,0.098436
"ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)",porter_tokenizer,tfidf,30.0,0.238947,0.002754,0.233333,0.236813,0.238861,0.240194,0.245972,30.0,0.108256,0.001617,0.105023,0.107222,0.108431,0.108919,0.112142
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",lemma_tokenizer,count,30.0,0.263319,0.002883,0.257361,0.261604,0.263056,0.265132,0.269222,30.0,0.16316,0.002632,0.157605,0.161699,0.163116,0.164495,0.167805
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",lemma_tokenizer,tfidf,30.0,0.165811,0.001591,0.163,0.164382,0.165625,0.167153,0.169306,30.0,0.011404,0.000103,0.011213,0.011322,0.0114,0.011481,0.01162
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",porter_tokenizer,count,30.0,0.263534,0.002993,0.25725,0.261354,0.26375,0.265222,0.26925,30.0,0.163105,0.002602,0.158064,0.161268,0.163216,0.164914,0.168329
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)",porter_tokenizer,tfidf,30.0,0.166251,0.001728,0.161611,0.165111,0.166361,0.167458,0.168722,30.0,0.011448,0.00012,0.011165,0.011385,0.011459,0.011527,0.011642


### Random forest.

In [None]:
for classifier in [ensemble.RandomForestClassifier()]:
    for tokenizer in ['lemma', 'porter']:
        for vect_type in ['count', 'tfidf']:
            for average_mtd in ['macro', 'weighted']:
                dview['classifier']=classifier
                dview['tokenizer']=tokenizer
                dview['vect_type']=vect_type
                dview['average_mtd']=average_mtd
                t=func_naive_bayes.map(range(30))

In [12]:
df_performance=pd.concat(dview.gather('df_performance'), ignore_index=True)
df_performance.sample(10)

Unnamed: 0,trial,classifier,tokenizer,vect_type,average_mtd,accuracy,precision,recall,f1
235,9,"RandomForestClassifier(bootstrap=True, class_w...",lemma_tokenizer,tfidf,weighted,0.278,0.279592,0.278,0.255527
344,14,"ComplementNB(alpha=1.0, class_prior=None, fit_...",lemma_tokenizer,count,macro,0.235833,0.190699,0.105259,0.092207
95,3,"RandomForestClassifier(bootstrap=True, class_w...",porter_tokenizer,tfidf,weighted,0.278028,0.280232,0.278028,0.254919
403,16,"RandomForestClassifier(bootstrap=True, class_w...",lemma_tokenizer,tfidf,weighted,0.277972,0.278541,0.277972,0.254734
94,3,"RandomForestClassifier(bootstrap=True, class_w...",porter_tokenizer,tfidf,macro,0.274389,0.266772,0.147704,0.160976
132,5,"ComplementNB(alpha=1.0, class_prior=None, fit_...",porter_tokenizer,count,macro,0.237528,0.182862,0.103287,0.090467
567,23,"ComplementNB(alpha=1.0, class_prior=None, fit_...",porter_tokenizer,tfidf,weighted,0.235889,0.237583,0.235889,0.205232
531,22,"MultinomialNB(alpha=1.0, class_prior=None, fit...",lemma_tokenizer,tfidf,weighted,0.167333,0.043979,0.167333,0.04807
246,10,"MultinomialNB(alpha=1.0, class_prior=None, fit...",porter_tokenizer,tfidf,macro,0.165833,0.006634,0.04,0.01138
215,8,"RandomForestClassifier(bootstrap=True, class_w...",porter_tokenizer,tfidf,weighted,0.281472,0.285078,0.281472,0.259149
