In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import metrics

#for displaying result
import csv
from IPython.display import HTML, display
import tabulate

#these two lines changes jupyter's variable display to put each variable on its own line
#that way, we can dump mulitple variables from a single code cell (without them overwriting the previous)
#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"



## Data Files

In [2]:
def loadAllDataFiles():
    filenames=[]
#    filenames.append("./data/FinAid_Labeled.csv")
    filenames.append("./data/Career_Labeled.csv")
    dumpColumnTitles()
    for file in filenames:
        X_train, X_test, y_train, y_test = loadOneFile(file)
        runAllVectorizers(X_train, X_test, y_train, y_test)


def loadOneFile(filename):
    df = pd.read_csv(filename)

    X = df.question
    y = df.Intent_Number
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    
    return X_train, X_test, y_train, y_test


def dumpColumnTitles():
    f.write("Stop Words, nGram Range, Max Doc Frequency, Min Doc Frequency, ")
    for model in models:
        f.write(model['name'] + ', ')
    f.write ('\n')

##  Vectorizers

In [3]:
def runAllVectorizers(X_train, X_test, y_train, y_test):
    i = 0
    for vect in vectorizers:
        i += 1
        print ("Running Vectorizer {} of {}".format(i, len(vectorizers)), end="\r")
        X_train_dtm, X_test_dtm = runOneVectorizer(vect,X_train, X_test, y_train, y_test)
        runAllModels(X_train_dtm, X_test_dtm, y_train, y_test)
        f.write('\n')
    print("                                                         ", end="\r")
    

def runOneVectorizer(vect,X_train, X_test, y_train, y_test):
    # learn training data vocabulary, then use it to create a document-term matrix
    X_train_dtm = vect.fit_transform(X_train)

    # transform testing data (using fitted vocabulary) into a document-term matrix
    X_test_dtm = vect.transform(X_test)

    if vect.stop_words is None:
        f.write("None")
    else:
        f.write(str(vect.stop_words))
    f.write(',')
    f.write(str(vect.ngram_range).replace(',','-'))
    f.write(',')
    f.write(str(vect.max_df))
    f.write(',')
    f.write(str(vect.min_df))
    f.write(',')
    
    return X_train_dtm, X_test_dtm


## Models

In [4]:
def runAllModels(X_train_dtm, X_test_dtm, y_train, y_test):
    for model in models:
        runOneModel(model['model'],X_train_dtm, X_test_dtm, y_train, y_test)


def runOneModel(model,X_train_dtm, X_test_dtm, y_train, y_test):
    # train the model using X_train_dtm (timing it with an IPython "magic command")
    #%time model.fit(X_train_dtm, y_train)
    model.fit(X_train_dtm, y_train)

    # make class predictions for X_test_dtm
    y_pred_class = model.predict(X_test_dtm)

    # calculate accuracy of class predictions
    f.write(str(metrics.accuracy_score(y_test, y_pred_class)))
    f.write(',')    
    

## Initialize Vectorizers & Models, then Run

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

#tweak the vectorizing settings here:
vectorizers=[]
vectorizers.append(CountVectorizer(stop_words= None,     ngram_range=(1, 1), min_df=1, max_df=1.0))
vectorizers.append(CountVectorizer(stop_words='english', ngram_range=(1, 1), min_df=1, max_df=1.0))
vectorizers.append(CountVectorizer(stop_words= None,     ngram_range=(1, 2), min_df=1, max_df=1.0))
vectorizers.append(CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=1, max_df=1.0))
vectorizers.append(CountVectorizer(stop_words= None,     ngram_range=(1, 1), min_df=2, max_df=1.0))
vectorizers.append(CountVectorizer(stop_words='english', ngram_range=(1, 1), min_df=2, max_df=1.0))
vectorizers.append(CountVectorizer(stop_words= None,     ngram_range=(1, 2), min_df=2, max_df=1.0))
vectorizers.append(CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2, max_df=1.0))
vectorizers.append(CountVectorizer(stop_words= None,     ngram_range=(1, 1), min_df=1, max_df= 30))
vectorizers.append(CountVectorizer(stop_words='english', ngram_range=(1, 1), min_df=1, max_df= 30))
vectorizers.append(CountVectorizer(stop_words= None,     ngram_range=(1, 2), min_df=1, max_df= 30))
vectorizers.append(CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=1, max_df= 30))
vectorizers.append(CountVectorizer(stop_words= None,     ngram_range=(1, 1), min_df=2, max_df= 30))
vectorizers.append(CountVectorizer(stop_words='english', ngram_range=(1, 1), min_df=2, max_df= 30))
vectorizers.append(CountVectorizer(stop_words= None,     ngram_range=(1, 2), min_df=2, max_df= 30))
vectorizers.append(CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2, max_df= 30))

models = []
models.append({'model':MultinomialNB(), 'name': 'Naive Bayes'})
models.append({'model':LogisticRegression(), 'name': 'Logistic Regression'})
models.append({'model':svm.SVC(), 'name': 'Linear SVC'})
models.append({'model':RandomForestClassifier(n_estimators = 50), 'name': 'Random Forest'})

In [55]:
filename='output.csv'
f = open(filename, 'w')
loadAllDataFiles()
f.close()

with open(filename, 'r') as f:
    data = list(csv.reader(f))

display(HTML(tabulate.tabulate(data, tablefmt='html')))


                                                         

0,1,2,3,4,5,6,7,8
Stop Words,nGram Range,Max Doc Frequency,Min Doc Frequency,Naive Bayes,Logistic Regression,Linear SVC,Random Forest,
,(1- 1),1.0,1,0.441478439425,0.662217659138,0.223819301848,0.640657084189,
english,(1- 1),1.0,1,0.459958932238,0.616016427105,0.223819301848,0.583162217659,
,(1- 2),1.0,1,0.485626283368,0.656057494867,0.223819301848,0.621149897331,
english,(1- 2),1.0,1,0.515400410678,0.625256673511,0.223819301848,0.573921971253,
,(1- 1),1.0,2,0.492813141684,0.659137577002,0.224845995893,0.630390143737,
english,(1- 1),1.0,2,0.477412731006,0.613963039014,0.223819301848,0.588295687885,
,(1- 2),1.0,2,0.530800821355,0.650924024641,0.223819301848,0.632443531828,
english,(1- 2),1.0,2,0.542094455852,0.621149897331,0.223819301848,0.591375770021,
,(1- 1),30,1,0.29568788501,0.364476386037,0.223819301848,0.345995893224,
