In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import metrics

#for displaying result
import csv
from IPython.display import HTML, display
import tabulate

#these two lines change jupyter's variable display to put each variable on its own line
#that way, we can dump multiple variables from a single code cell (without each one overwriting the previous)
#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"



## Data Files

In [2]:
def loadAllDataFiles():
    """Write the CSV header row, then benchmark every enabled data file.

    For each path, the file is loaded and split by ``loadOneFile`` and the
    resulting train/test pieces are handed to ``runAllVectorizers``.
    """
    data_paths = [
        # "./data/FinAid_Labeled.csv",   (currently disabled)
        "./data/Career_Labeled.csv",
    ]

    # the header is written once, before any result rows
    dumpColumnTitles()

    for path in data_paths:
        splits = loadOneFile(path)      # (X_train, X_test, y_train, y_test)
        runAllVectorizers(*splits)


def loadOneFile(filename):
    """Read one labeled CSV and split it into train and test portions.

    The ``question`` column supplies the inputs and ``Intent_Number`` the
    labels. The split uses ``random_state=1`` so it is repeatable.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    frame = pd.read_csv(filename)
    questions = frame["question"]
    intents = frame["Intent_Number"]
    return tuple(train_test_split(questions, intents, random_state=1))


def dumpColumnTitles():
    """Write the CSV header row to the global output file ``f``.

    The five fixed vectorizer-setting columns come first, followed by one
    column per entry in the global ``models`` list (using its ``'name'``).
    """
    f.write("Tokenizer, Stop Words, nGram Range, Max Doc Frequency, Min Doc Frequency, ")
    # one title per model, each followed by the same ", " separator
    f.writelines(entry['name'] + ', ' for entry in models)
    f.write('\n')

##  Vectorizers

In [3]:
def runAllVectorizers(X_train, X_test, y_train, y_test):
    """Evaluate every vectorizer in the global ``vectorizers`` list.

    Each vectorizer is run by ``runOneVectorizer``, and the matrices it
    returns are scored by ``runAllModels``. A newline is then written to the
    global output file ``f`` to end that row. A progress message is
    overwritten in place using a carriage return.
    """
    for position, vect in enumerate(vectorizers, start=1):
        progress = "Running Vectorizer {} of {}".format(position, len(vectorizers))
        print (progress, end="\r")

        matrices = runOneVectorizer(vect, X_train, X_test, y_train, y_test)
        train_matrix, test_matrix = matrices
        runAllModels(train_matrix, test_matrix, y_train, y_test)

        # end of this vectorizer's CSV row
        f.write('\n')

    # blank out the progress message once everything has run
    print("                                                         ", end="\r")
    

def _tokenizerLabel(tokenizer):
    """Return a short, comma-free label for a tokenizer callable ("None" if unset)."""
    if tokenizer is None:
        return "None"
    # functions expose __name__; callable instances fall back to their class name
    label = getattr(tokenizer, '__name__', None) or type(tokenizer).__name__
    suffix = 'Tokenizer'
    # "LancasterTokenizer" -> "Lancaster"; a bare "Tokenizer" is left untouched
    if label.endswith(suffix) and len(label) > len(suffix):
        label = label[:-len(suffix)]
    return label.replace(',', '-')


def _stopWordsLabel(stop_words):
    """Return the stop-word setting as one CSV cell: "None", the preset name, or "Custom"."""
    if stop_words is None:
        return "None"
    if isinstance(stop_words, str):
        return stop_words
    # any collection (frozenset, set, list, ...) would inject commas if printed
    return "Custom"


def runOneVectorizer(vect,X_train, X_test, y_train, y_test):
    """Fit ``vect`` on the training text and log its settings to the CSV.

    Parameters
    ----------
    vect : vectorizer exposing ``fit_transform``/``transform`` and the
        attributes ``tokenizer``, ``stop_words``, ``ngram_range``, ``max_df``
        and ``min_df``.
    X_train, X_test : the raw training / testing documents.
    y_train, y_test : unused here; accepted so the call signature matches
        the other ``run*`` helpers.

    Returns
    -------
    (X_train_dtm, X_test_dtm) : the document-term matrices for both splits.

    Side effect: writes five comma-terminated cells (tokenizer, stop words,
    n-gram range, max_df, min_df) to the global output file ``f``.
    """
    # learn training data vocabulary, then use it to create a document-term matrix
    X_train_dtm = vect.fit_transform(X_train)

    # transform testing data (using fitted vocabulary) into a document-term matrix
    X_test_dtm = vect.transform(X_test)

    cells = [
        _tokenizerLabel(vect.tokenizer),
        _stopWordsLabel(vect.stop_words),
        # the tuple's own comma would split the cell, so "(1, 2)" becomes "(1- 2)"
        str(vect.ngram_range).replace(',', '-'),
        str(vect.max_df),
        str(vect.min_df),
    ]
    f.write(','.join(cells) + ',')

    return X_train_dtm, X_test_dtm


def duplicateVectorizer(vect):
    """Return a new, unfitted ``CountVectorizer`` configured like ``vect``.

    Every constructor parameter that ``vect`` reports through ``get_params``
    and that ``CountVectorizer`` also accepts is copied. That covers the five
    settings this notebook varies (stop_words, tokenizer, ngram_range,
    min_df, max_df) as well as any other non-default option such as
    ``lowercase`` or ``analyzer``, so the result is a full duplicate.

    Parameter values are shared by reference, not deep-copied.
    """
    newVect = CountVectorizer()
    source_params = vect.get_params(deep=False)
    # restrict to names the new vectorizer knows, so set_params cannot reject any
    shared_params = {
        name: source_params[name]
        for name in newVect.get_params(deep=False)
        if name in source_params
    }
    newVect.set_params(**shared_params)
    return newVect

## Models

In [4]:
def runAllModels(X_train_dtm, X_test_dtm, y_train, y_test):
    """Score every estimator in the global ``models`` list on the given matrices.

    Each entry's ``'model'`` object is passed to ``runOneModel`` in list order.
    """
    estimators = (entry['model'] for entry in models)
    for estimator in estimators:
        runOneModel(estimator, X_train_dtm, X_test_dtm, y_train, y_test)


def runOneModel(model,X_train_dtm, X_test_dtm, y_train, y_test):
    """Fit ``model``, predict the test split and log the accuracy.

    The accuracy from ``metrics.accuracy_score`` is written to the global
    output file ``f`` as one CSV cell followed by a comma.
    """
    # fit on the training document-term matrix
    # (swap in "%time model.fit(X_train_dtm, y_train)" to time this step)
    model.fit(X_train_dtm, y_train)

    # predicted class for every test document
    predictions = model.predict(X_test_dtm)

    accuracy = metrics.accuracy_score(y_test, predictions)
    f.write(str(accuracy) + ',')
    

## Stop Words

In [5]:
from sklearn.feature_extraction import text 
from string import punctuation

def customStopWords():
    """Return scikit-learn's English stop words plus every punctuation character."""
    # each character of string.punctuation becomes its own stop word
    punctuation_marks = frozenset(punctuation)
    return text.ENGLISH_STOP_WORDS | punctuation_marks

## Stemming

In [6]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

def stem_tokens(tokens, stemmer):
    """Return a list holding ``stemmer.stem(token)`` for each token, in order."""
    return [stemmer.stem(token) for token in tokens]

def tokenize(text, stemmer):
    """Strip punctuation from ``text``, word-tokenize it and stem every token.

    Punctuation characters are deleted (not replaced by a space) before
    ``word_tokenize`` runs. The tokens are then stemmed via ``stem_tokens``.
    """
    kept_chars = filter(lambda ch: ch not in punctuation, text)
    cleaned = "".join(kept_chars)
    return stem_tokens(word_tokenize(cleaned), stemmer)

def PorterTokenizer(text):
    """Tokenize ``text`` with ``tokenize`` using a newly created ``PorterStemmer``."""
    porter = PorterStemmer()
    return tokenize(text, porter)

def LancasterTokenizer(text):
    """Tokenize ``text`` with ``tokenize`` using a newly created ``LancasterStemmer``."""
    lancaster = LancasterStemmer()
    return tokenize(text, lancaster)

class LemmaTokenizer(object):
    """Callable tokenizer: word-tokenizes a document and lemmatizes each token.

    A single ``WordNetLemmatizer`` is created per instance and reused for
    every call.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        tokens = word_tokenize(doc)
        return list(map(lambda token: self.wnl.lemmatize(token), tokens))

## Initialize Vectorizers & Models, then Run

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz

#tweak the vectorizing settings here:
stopWords = customStopWords()

# three Lancaster-stemmed baselines that differ only in the largest n-gram size (1, 2, 3)
vectorizers = [
    CountVectorizer(tokenizer=LancasterTokenizer, stop_words=None,
                    ngram_range=(1, largest_ngram), min_df=1, max_df=1.0)
    for largest_ngram in (1, 2, 3)
]

#each of the loops below adds a copy of every existing vectorizer with a single property changed,
#so each loop doubles the number of vectorizers
#to try more than two values for a single setting, add the extra vectorizers manually to the initial append statements above (otherwise you will get duplicate entries)
    
""" leave out for now, have three values for this above    
count = len(vectorizers)
for i in range(count):
    newVect = duplicateVectorizer(vectorizers[i])
    newVect.ngram_range =(1,2)
    vectorizers.append(newVect)
"""
"""
count = len(vectorizers)
for i in range(count):
    newVect = duplicateVectorizer(vectorizers[i])
    newVect.stop_words='english'
    vectorizers.append(newVect)

count = len(vectorizers)
for i in range(count):
    newVect = duplicateVectorizer(vectorizers[i])
    newVect.min_df = 2
    vectorizers.append(newVect)

count = len(vectorizers)
for i in range(count):
    newVect = duplicateVectorizer(vectorizers[i])
    newVect.tokenizer = PorterTokenizer
    vectorizers.append(newVect)

count = len(vectorizers)
for i in range(count):
    newVect = duplicateVectorizer(vectorizers[i])
    newVect.max_df = 100
    vectorizers.append(newVect)
"""


# each entry pairs an estimator with the column title used for it in the CSV header
models = [
    {'model': MultinomialNB(), 'name': 'Naive Bayes'},
]
"""
models.append({'model':LogisticRegression(), 'name': 'Logistic Regression'})
models.append({'model':svm.SVC(), 'name': 'Linear SVC'})
models.append({'model':RandomForestClassifier(n_estimators = 50), 'name': 'Random Forest'})
models.append({'model':KNeighborsClassifier(), 'name': 'K Neighbors'})
models.append({'model':DecisionTreeClassifier(), 'name': 'Decision Tree'})
"""


"\nmodels.append({'model':LogisticRegression(), 'name': 'Logistic Regression'})\nmodels.append({'model':svm.SVC(), 'name': 'Linear SVC'})\nmodels.append({'model':RandomForestClassifier(n_estimators = 50), 'name': 'Random Forest'})\nmodels.append({'model':KNeighborsClassifier(), 'name': 'K Neighbors'})\nmodels.append({'model':DecisionTreeClassifier(), 'name': 'Decision Tree'})\n"

In [13]:
filename='./output/output.csv'

# The helper functions write to the module-level name ``f``, so the handle must
# be bound to exactly that name. The ``with`` block guarantees the file is
# flushed and closed even if one of the vectorizers or models raises.
with open(filename, 'w') as f:
    loadAllDataFiles()

# Read the results back through a separately named handle so ``f`` is not
# rebound to a read-only file. newline='' is what the csv module expects for
# files passed to csv.reader.
with open(filename, 'r', newline='') as results_file:
    data = list(csv.reader(results_file))
from sklearn.feature_extraction import text 

# render the benchmark rows as an HTML table in the notebook
display(HTML(tabulate.tabulate(data, tablefmt='html')))


                                                         

0,1,2,3,4,5,6
Tokenizer,Stop Words,nGram Range,Max Doc Frequency,Min Doc Frequency,Naive Bayes,
Lancaster,,(1- 1),1.0,1,0.446611909651,
Lancaster,,(1- 2),1.0,1,0.448665297741,
Lancaster,,(1- 3),1.0,1,0.433264887064,


## Scratchpad

In [None]:
def dumpData():
    """Print the vocabulary a default-tokenizer CountVectorizer learns from FinAid.

    Loads ./data/FinAid_Labeled.csv, splits it as the main pipeline does
    (``random_state=1``), fits a unigram ``CountVectorizer`` on the training
    questions and prints its feature names. Nothing is returned.
    """
    frame = pd.read_csv("./data/FinAid_Labeled.csv")

    # only used when the stop-word experiment below is switched back on
    stopWords = customStopWords()
    vect = CountVectorizer(ngram_range=(1, 1), min_df=1, max_df=1.0)

    # disabled experiments, kept for reference:
    #   vect.stop_words = stopWords
    #   vect.analyzer = 'word'
    #   vect.tokenizer = LancasterTokenizer()

    questions, intents = frame["question"], frame["Intent_Number"]
    X_train, X_test, y_train, y_test = train_test_split(questions, intents, random_state=1)

    # learn the vocabulary from the training questions only
    X_train_dtm = vect.fit_transform(X_train)
    print(vect.get_feature_names())

    # further inspection snippets, kept for reference:
    #   print(vect.token_pattern)
    #   print(vect.vocabulary)
    #   print(vect.tokenizer)
    #   print(vect.stop_words)
    #   X_test_dtm = vect.transform(X_test)
    #   pd.DataFrame(X_train_dtm.todense(), columns=vect.get_feature_names())
    
#dumpData()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a default CountVectorizer on one sample sentence, then show how two
# related sentences are encoded against that vocabulary.
vocab = ['The swimmer likes swimming so he swims. 1. ... "" 33']
vect = CountVectorizer().fit(vocab)

# the fitted vectorizer is bound to ``vect``; the original referred to an
# undefined name ``vec`` here, which raised NameError on a fresh kernel
sentence1 = vect.transform(['The swimmer likes swimming.'])
sentence2 = vect.transform(['The swimmer swims.'])

print('Vocabulary: %s' %vect.get_feature_names())
print('Sentence 1: %s' %sentence1.toarray())
print('Sentence 2: %s' %sentence2.toarray())


def xtokenize(text, stemmer=None):
    """Word-tokenize ``text`` (punctuation kept) and stem every token.

    Parameters
    ----------
    text : the document to tokenize.
    stemmer : object with a ``stem(token)`` method. When omitted, a new
        ``PorterStemmer`` is used. The original body read an undefined global
        ``stemmer`` and raised NameError.

    Returns the list of stemmed tokens.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    return stem_tokens(tokens, stemmer)
######## 


def xxxtokenize(text, stemmer=None):
    """Word-tokenize ``text``, drop punctuation-only tokens and stem the rest.

    Parameters
    ----------
    text : the document to tokenize.
    stemmer : object with a ``stem(token)`` method. When omitted, a new
        ``PorterStemmer`` is used. The original body read an undefined global
        ``stemmer`` and raised NameError.

    Returns the list of stemmed tokens.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    # A token is dropped when every character is punctuation. The previous
    # ``token not in punctuation`` was a substring test against the punctuation
    # string, so tokens such as "..." or "''" were kept.
    tokens = [tok for tok in tokens if not all(ch in punctuation for ch in tok)]
    return stem_tokens(tokens, stemmer)



#vect = CountVectorizer(tokenizer=tokenize, stop_words='english') 

# refit the current vectorizer on the sample vocabulary
vect.fit(vocab)

# encode the two probe sentences against the freshly learned vocabulary
sentence1, sentence2 = [
    vect.transform([probe])
    for probe in ('The swimmer likes swimming. "" 1. 2 33', 'The swimmer swims.')
]

print('Vocabulary: %s' % vect.get_feature_names())
print('Sentence 1: %s' % sentence1.toarray())
print('Sentence 2: %s' % sentence2.toarray())
