In [1]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
#import seaborn as sns
import keras.layers as layers
from keras.models import Model
from keras import backend as K
import numpy as np
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk import word_tokenize
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

Using TensorFlow backend.


In [2]:
# Load every gzip-compressed pickle found in the training directory and
# combine them into a single DataFrame.
# NOTE(review): unpickling can execute arbitrary code -- only point this at trusted files.
train_dir = '../../dataset/df_train.pkl.gz/'
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the combined frame stable from run to run.
file_list = sorted(os.listdir(train_dir))
if not file_list:
    raise FileNotFoundError('No training files found in ' + train_dir)
# Read all parts first and concatenate once: concatenating inside the loop
# re-copies the accumulated frame on every iteration.
df_train = pd.concat(
    [pd.read_pickle(os.path.join(train_dir, file), compression='gzip') for file in file_list]
)
# Combined text field: the `mission` and `prgrm_dsc` columns joined by '; '.
df_train['mission_prgrm'] = df_train['mission'] + '; ' + df_train['prgrm_dsc']

SEED = 42                   # fixed seed so the sample and the split are reproducible
SAMPLE_SIZE = 40000         # number of rows drawn for this experiment
MIN_PER_CATEGORY = 100      # each NTEE1 category present in the sample needs this many records
MAX_SAMPLE_ATTEMPTS = 100   # bound the retries so an unsatisfiable constraint cannot loop forever

# Rows usable for training: both `mission` and `NTEE1` are present.
# The filter is the same for every attempt, so it is computed once.
eligible = df_train[df_train.mission.notna() & df_train.NTEE1.notna()]
if len(eligible) < SAMPLE_SIZE:
    raise ValueError(
        'Need at least %d eligible rows to sample from, found %d' % (SAMPLE_SIZE, len(eligible))
    )

for attempt in range(MAX_SAMPLE_ATTEMPTS):
    # Vary the seed per attempt; a constant seed would redraw the identical sample.
    trainDF = eligible.sample(SAMPLE_SIZE, random_state=SEED + attempt)
    # Smallest per-category count of non-null EIN values in this sample.
    small_num = trainDF.groupby('NTEE1')['EIN'].count().min()
    if small_num >= MIN_PER_CATEGORY:
        break
else:
    raise RuntimeError(
        'No sample with at least %d records per NTEE1 category after %d attempts'
        % (MIN_PER_CATEGORY, MAX_SAMPLE_ATTEMPTS)
    )

#### Sample ####
# NOTE(review): rows are filtered on `mission`, but the text comes from
# `mission_prgrm`; if that column is missing for a kept row, astype(str) turns
# it into the literal string 'nan' -- confirm this is acceptable.
trainDF = trainDF.assign(
    text=trainDF['mission_prgrm'].astype(str),
    label=trainDF['NTEE1'].astype(str),
)
# Split the dataset into training (70%) and validation (30%) sets.
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(
    trainDF['text'], trainDF['label'],
    train_size=0.7, shuffle=True, random_state=SEED,
)
################ Prepare dataframe for ML ################
##########################################################

##########################################################
################ Define tokenizer ################

def porter_tokenizer(str_input):
    """Split `str_input` with `word_tokenize` and Porter-stem every token.

    Args:
        str_input: The text to tokenize.

    Returns:
        A list with one stemmed token per token produced by `word_tokenize`,
        in the same order.
    """
    # Build the stemmer once per call instead of once per token.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in word_tokenize(str_input)]
    
# Lemmatize using POS tags, on the assumption that this improves accuracy.
# Ref: 
#   - https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
#   - https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; every other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def lemma_tokenizer(str_input):
    """Tokenize `str_input`, POS-tag the tokens, and lemmatize each one.

    Each token's tag from `nltk.pos_tag` is converted with `get_wordnet_pos`
    and passed to the lemmatizer as its `pos` argument.

    Args:
        str_input: The text to tokenize.

    Returns:
        A list with one lemma per token produced by `word_tokenize`, in the
        same order.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(word_tokenize(str_input))
    return [
        lemmatizer.lemmatize(word=word, pos=get_wordnet_pos(pos))
        for word, pos in tagged_tokens
    ]

# Tokenizer used for vectorization; `porter_tokenizer` is the stemming alternative.
tokenizer = lemma_tokenizer

################ Define tokenizer ################
##########################################################
    
##########################################################
######### Text Vectorization and Transformation ##########
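    # 1. Tokenize with the WordNet lemmatizer (`tokenizer=lemma_tokenizer` above); the Porter stemmer is defined as an alternative.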
    # 2. Use word level, character level does not make sense for current situation.
    # 3. Use count (freq) and tf-idf vectorizer. see: 
    # Bengfort, B., Bilbro, R., & Ojeda, T. (2018). Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning (1 edition). Beijing Boston Farnham Sebastopol Tokyo: O’Reilly Media.
    # Page: 67.
    

##### Token counts #####
# create the transform
# Word-level token counts, using the custom `tokenizer` chosen earlier.
# NOTE(review): scikit-learn warns that the built-in 'english' stop-word list
# may be inconsistent with a custom tokenizer, because the stop words are not
# passed through it -- consider supplying a stop list tokenized the same way.
vectorizer = CountVectorizer(
    stop_words='english',
    tokenizer=tokenizer,
    analyzer='word',
)
# Learn the vocabulary from the training split only, so no information from
# the validation documents leaks into the features; fit_transform also avoids
# tokenizing the training text twice.
x_train_vect = vectorizer.fit_transform(x_train)
# Encode the validation documents with the vocabulary learned above; tokens
# unseen during fitting are ignored.
x_valid_vect = vectorizer.transform(x_valid)


  'stop_words.' % sorted(inconsistent))


In [3]:
# Convert the vectorizer output to dense NumPy arrays.
# The names are rebound in place, so without the guard a second run of this
# cell would fail: an ndarray has no `toarray` method.
# NOTE(review): a dense documents-by-vocabulary matrix can be very large --
# confirm it fits in memory for the chosen sample size.
if hasattr(x_train_vect, 'toarray'):
    x_train_vect = x_train_vect.toarray()
if hasattr(x_valid_vect, 'toarray'):
    x_valid_vect = x_valid_vect.toarray()

In [4]:
# Display the training feature matrix (NumPy abbreviates the repr of large arrays).
x_train_vect

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])