In [1]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import pandas as pd
import datetime, os
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training set: every entry of the dataset directory is read as a
# gzip-compressed pickle and the resulting frames are stacked into `df_train`.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling --
# only point TRAIN_DIR at files you trust.
TRAIN_DIR = '../../dataset/df_train.pkl.gz/'
# Sort the listing so the row order of df_train does not depend on the
# (arbitrary) order in which the OS returns directory entries.
file_list = sorted(os.listdir(TRAIN_DIR))
# Read all parts first and concatenate once; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
train_parts = [pd.read_pickle(os.path.join(TRAIN_DIR, file_name), compression='gzip')
               for file_name in file_list]
# pd.concat raises on an empty list, so fall back to an empty frame.
df_train = pd.concat(train_parts) if train_parts else pd.DataFrame()
len(df_train)

229472

### Try Mission Statements - MultinomialNB - LemmaTokenizer - Count.

In [None]:
# Experiment: repeated random sub-sampling validation of Multinomial Naive Bayes
# on lemmatized token counts of the mission statements. Each trial draws a
# fresh sample, splits it 70/30, and records accuracy plus macro-averaged
# precision and recall on the validation split.

N_TRIALS = 100          # number of independent sample/split/fit repetitions
SAMPLE_SIZE = 100000    # rows drawn from the labeled data per trial
MIN_CLASS_SIZE = 100    # every NTEE1 category must have at least this many rows


##########################################################
################ Define tokenizer ################
# Source: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/
def stemming_tokenizer(str_input):
    """Tokenize ``str_input`` with NLTK and Porter-stem every token.

    Not used by this cell's vectorizers (they use ``LemmaTokenizer``); kept
    so that it stays available as an alternative tokenizer.

    Parameters
    ----------
    str_input : str
        Raw document text.

    Returns
    -------
    list of str
        One stemmed token per token produced by ``word_tokenize``.
    """
    # One stemmer per call instead of one per token.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in word_tokenize(str_input)]


# Source: https://scikit-learn.org/stable/modules/feature_extraction.html
class LemmaTokenizer(object):
    """Callable tokenizer: NLTK word tokenization followed by WordNet lemmatization.

    Instances are passed as the ``tokenizer`` argument of the vectorizers below.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of the document string ``doc``."""
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
################ Define tokenizer ################
##########################################################


def func_performance(classifier, x_train, y_train, x_valid, y_valid):
    """Fit ``classifier`` on the training split and score it on the validation split.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``
    x_train, y_train : training features and labels
    x_valid, y_valid : validation features and true labels

    Returns
    -------
    list
        ``[accuracy, macro precision, macro recall]`` on the validation split.
    """
    # fit the training dataset on the classifier
    classifier.fit(x_train, y_train)
    # predict the labels on validation dataset
    predictions = classifier.predict(x_valid)
    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round leaves accuracy unchanged but swaps precision and recall.
    return [metrics.accuracy_score(y_valid, predictions),
            metrics.precision_score(y_valid, predictions,
                                    average='macro',  # Use unweighted mean.
                                    ),
            metrics.recall_score(y_valid, predictions,
                                 average='macro',  # Use unweighted mean.
                                 )]


df_mission_MNB_lemma_count = pd.DataFrame(columns=['accuracy', 'precision', 'recall'])

# The row filter does not depend on the trial, so compute it once.
df_labeled = df_train[df_train.mission.notna() & df_train.NTEE1.notna()]

for trial in tqdm(range(0, N_TRIALS)):

    ##########################################################
    ################ Prepare dataframe for ML ################
    #### Sample ####
    small_num = 0
    # Redraw until the rarest category has at least MIN_CLASS_SIZE records
    # (measured, as before, by the count of non-null EIN values per category).
    while small_num < MIN_CLASS_SIZE:
        trainDF = df_labeled.sample(SAMPLE_SIZE)
        small_num = trainDF.groupby('NTEE1')['EIN'].count().min()
    #### Sample ####
    trainDF = trainDF.assign(text=trainDF['mission'].astype(str),
                             label=trainDF['NTEE1'].astype(str))
    # split the dataset into training and validation datasets
    x_train, x_valid, y_train, y_valid = model_selection.train_test_split(
        trainDF['text'], trainDF['label'], train_size=0.7, shuffle=True)
    ################ Prepare dataframe for ML ################
    ##########################################################

    ##########################################################
    ######### Text Vectorization and Transformation ##########
    # 1. Use the WordNet lemmatizer (LemmaTokenizer) for this experiment.
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see:
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.

    ##### Token counts #####
    count_vect = CountVectorizer(stop_words='english',
                                 tokenizer=LemmaTokenizer(),
                                 analyzer='word'
                                 )
    # Build the vocabulary from the training split only, so nothing from the
    # validation documents leaks into the features.
    x_train_vect_count = count_vect.fit_transform(x_train)
    x_valid_vect_count = count_vect.transform(x_valid)
    ##### Token counts #####

    ##### TF-IDF #####
    # NOTE(review): these TF-IDF matrices are not used by this cell's
    # classifier; they are kept because a later cell reads
    # x_train_vect_tfidf / x_valid_vect_tfidf. Drop them here if that cell
    # is removed -- they double the tokenization cost of every trial.
    tfidf_vect = TfidfVectorizer(stop_words='english',
                                 tokenizer=LemmaTokenizer(),
                                 analyzer='word'
                                 )
    # Vocabulary and IDF weights are likewise learned from the training split only.
    x_train_vect_tfidf = tfidf_vect.fit_transform(x_train)
    x_valid_vect_tfidf = tfidf_vect.transform(x_valid)
    ##### TF-IDF #####

    ######### Text Vectorization and Transformation ##########
    ##########################################################

    performance_result = func_performance(classifier=naive_bayes.MultinomialNB(),
                                          x_train=x_train_vect_count,
                                          y_train=y_train,
                                          x_valid=x_valid_vect_count,
                                          y_valid=y_valid
                                          )
    # Store into the frame created above (the original wrote to an undefined name).
    df_mission_MNB_lemma_count.loc[trial] = performance_result

  0%|          | 0/100 [00:00<?, ?it/s]

In [27]:
# Multinomial Naive Bayes on word-level TF-IDF vectors.
# `train_model` and `results` are not defined in this cell; they must already
# exist in the kernel. NOTE(review): judging by the recorded outputs below,
# train_model presumably returns [accuracy, precision, recall, elapsed time]
# -- verify against its definition.
run_label = "NB, TF IDF Vectors"
accuracy = train_model(classifier=naive_bayes.MultinomialNB(),
                       x_train=x_train_vect_tfidf,
                       y_train= y_train,
                       x_valid=x_valid_vect_tfidf,
                       y_valid=y_valid
                      )
# Record the run under the same label that is printed; the row was previously
# labelled "NB, Count Vectors" although the features are TF-IDF.
results.loc[len(results)] = [run_label, accuracy[0], accuracy[1], accuracy[2], accuracy[3]]
print(run_label + ": ", accuracy)


# Raw: MNB, Count Vectors:  [0.6732666666666667, 0.4503852200520778, 0.6661678975327818, datetime.timedelta(microseconds=489443)]
# Raw: CNB, Count Vectors:  [0.7013333333333334, 0.5275971009306734, 0.6170202787685387, datetime.timedelta(microseconds=521657)]
# Stemmed: MNB, Count Vectors:  [0.6601, 0.4243013182900721, 0.639930088219624, datetime.timedelta(microseconds=443571)]
# Stemmed: CNB, Count Vectors:  [0.7, 0.5320034132229355, 0.6383184621220114, datetime.timedelta(microseconds=504788)]
# Lemma: MNB, Count Vectors:  [0.6615, 0.42366552445612, 0.6783346552845068, datetime.timedelta(microseconds=572416)]
# Lemma: CNB, Count Vectors:  [0.7021333333333334, 0.5341465196920103, 0.6318440519847688, datetime.timedelta(microseconds=586597)]

# Raw: MNB, TF IDF Vectors:  [0.5859, 0.32327746269021723, 0.5485416630089535, datetime.timedelta(microseconds=536424)]
# Raw: CNB, TF IDF Vectors:  [0.6992333333333334, 0.5306246321804787, 0.6136847132057796, datetime.timedelta(microseconds=580513)]
# Stemmed: MNB, TF IDF Vectors:  [0.553, 0.2949552239378839, 0.536824161513825, datetime.timedelta(microseconds=453431)]
# Stemmed: CNB, TF IDF Vectors:  [0.6967666666666666, 0.5349583083797411, 0.6223488506468897, datetime.timedelta(microseconds=520307)]
# Lemma: MNB, TF IDF Vectors:  [0.5589666666666666, 0.297005157029138, 0.5293106710625075, datetime.timedelta(microseconds=529497)]
# Lemma: CNB, TF IDF Vectors:  [0.7012333333333334, 0.536867845262306, 0.6273414839488708, datetime.timedelta(microseconds=546401)]

NB, TF IDF Vectors:  [0.5583666666666667, 0.29982304712090974, 0.5961215483457072, datetime.timedelta(microseconds=498992)]


  'recall', 'true', average, warn_for)


**Looks like `Lemma-CNB-Count` produces the best results (highest accuracy, 0.7021, among the runs recorded above).**