In [1]:
# for Python 2: use print only as a function
#from __future__ import print_function

import pandas as pd

## 1. Load data files and check them out
this has the questions that were answered by the bot (intentions are the labels)

In [2]:
# example text for model training (SMS messages)
df = pd.read_csv("./data/FinAid_Labeled.csv")


In [3]:
df.shape

(743, 2)

In [4]:
df.columns.values

array(['question', 'Intent_Number'], dtype=object)

In [5]:
df.head()

Unnamed: 0,question,Intent_Number
0,I need to view my semester charges,1
1,How do I view my bill?,1
2,When do I need to apply for Financial Aid,2
3,When can I apply for fasfa for 2017?,2
4,when do i need to apply for financial aid for ...,2


In [6]:
df.describe(include=['object'])


Unnamed: 0,question
count,743
unique,741
top,Hello
freq,2


In [7]:
df.columns


Index(['question', 'Intent_Number'], dtype='object')

## 2. prepare single row of data to be ready for vectorizing

In [8]:
text=df['question'][55]
text

'How do I get my money for my books?'

In [9]:
from nltk.tokenize import word_tokenize, sent_tokenize
words = [word_tokenize(text) ]
print(words)


[['How', 'do', 'I', 'get', 'my', 'money', 'for', 'my', 'books', '?']]


In [10]:
from nltk.corpus import stopwords
from string import punctuation

customstopwords=set(stopwords.words('english')+list(punctuation))
wordsWOStopwords = [word for word in word_tokenize(text) if word not in customstopwords]
print(wordsWOStopwords)


['How', 'I', 'get', 'money', 'books']


In [11]:
from nltk.stem.lancaster import LancasterStemmer
st=LancasterStemmer()
stemmedwords=[st.stem(word) for word in wordsWOStopwords]
print (stemmedwords)

['how', 'i', 'get', 'money', 'book']


### put that all in a single method

#### globals for parseQuestion method

In [12]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.lancaster import LancasterStemmer

customstopwords=set(stopwords.words('english')+list(punctuation))
st=LancasterStemmer()

In [13]:
def parseQuestion(question):
    wordsWOStopwords = [word for word in word_tokenize(question) if word not in customstopwords]
    stemmedwords=[st.stem(word) for word in wordsWOStopwords]
    return( " ".join( stemmedwords ))   
    #return stemmedwords

In [14]:
print(parseQuestion(df['question'][77]))

what fin aid email


## 3. Loop through whole dataset parsing each row

In [15]:
df.question[77]

'What is financial aid email?'

In [16]:
# Get the number of questions based on the dataframe column size
num_questions = df.question.size

# Initialize an empty list to hold the clean reviews
all_questions = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
for i in range( 0, num_questions ):
    all_questions.append( parseQuestion( df.question[i] ) )

print (all_questions)

['i nee view semest charg', 'how i view bil', 'when i nee apply fin aid', 'when i apply fasf 2017', 'nee apply fin aid fal 2017 semest', 'when last day apply fil fafs next semest', 'i nee re-apply fin aid when deadlin', 'spring break', 'when', '', 'when fafs becom avail', 'can i get fin aid', 'los fin aid get academ prob', 'hi if acad3mic prob get fin aid', 'wil i get fin aid i academ suspend', 'can i get fin aid next semest', 'fsa id', 'how much aid i expect', 'how much fin aid i receiv', 'wher i see much fin aid i reciev click ban self-service', 'how much fin aid i get', 'fafs ask cous inform 2015 marry tim', 'what fin aid process look lik', 'how submit reconsid request', 'wher i find reconsid form fin aid', 'thi reconsid form', 'nee fin reconsid form upd fin inform pleas', 'when deadlin reconsid request', 'i look reconsid form', 'gradready', 'sum school fin aid deadlin', 'apply sum fin aid receiv', 'when sum fin aid apply bil', 'when sum fin aid apply bil', 'fin aid apply deadlin su

## 4. Create bag of words

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(all_questions)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

In [18]:
print (train_data_features.shape)


(743, 507)


### -------------------------------------------------------------------------------------------------------------------------------

## experiment with knn classifier
### got it trained, if I gave it a full vector (507 features) it should predict a classification

In [63]:
# import the class
from sklearn.neighbors import KNeighborsClassifier

# instantiate the model (with the default parameters)
knn = KNeighborsClassifier()

# fit the model with data (occurs in-place)
knn.fit(train_data_features, df["Intent_Number"])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [None]:
knn.predict([[1,2,3,4,5]])

### -------------------------------------------------------------------------------------------------------------------------------



In [19]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print (vocab)

['1040', '1098', '1098t', '14', '18', '2015', '2016', '2017', '2018', '529', '5506', '687', '704', 'abl', 'abroad', 'acad3mic', 'academ', 'acceiv', 'access', 'account', 'act', 'addit', 'address', 'adio', 'adjust', 'admit', 'affect', 'agr', 'aid', 'alic', 'already', 'also', 'altern', 'am', 'amount', 'analys', 'and', 'anoth', 'answ', 'ap', 'aply', 'apply', 'appoint', 'approv', 'april', 'ar', 'ask', 'assocy', 'at', 'athlet', 'attend', 'autom', 'av', 'avail', 'award', 'away', 'baby', 'back', 'bal', 'ban', 'bank', 'becom', 'begin', 'belong', 'bet', 'bil', 'body', 'book', 'boost', 'break', 'breakdown', 'budget', 'build', 'bul', 'busy', 'button', 'buy', 'by', 'ca', 'cal', 'calc', 'camp', 'campus', 'can', 'cancel', 'cert', 'certain', 'chang', 'charg', 'charlotte', 'check', 'child', 'chuck', 'class', 'click', 'clos', 'cniditoin', 'cod', 'col', 'colleg', 'collerg', 'com', 'commun', 'complet', 'comput', 'condit', 'confus', 'consid', 'contact', 'cool', 'coolest', 'cop', 'cost', 'could', 'councel',

In [20]:
import numpy as np

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print (count, tag)

1 1040
2 1098
3 1098t
1 14
1 18
1 2015
2 2016
10 2017
2 2018
1 529
1 5506
1 687
1 704
2 abl
1 abroad
1 acad3mic
8 academ
14 acceiv
2 access
14 account
2 act
1 addit
34 address
1 adio
1 adjust
3 admit
1 affect
1 agr
254 aid
1 alic
1 already
1 also
4 altern
1 am
4 amount
1 analys
2 and
3 anoth
3 answ
9 ap
1 aply
74 apply
3 appoint
2 approv
1 april
7 ar
2 ask
1 assocy
1 at
1 athlet
6 attend
1 autom
2 av
9 avail
32 award
1 away
1 baby
1 back
5 bal
1 ban
1 bank
2 becom
2 begin
1 belong
1 bet
13 bil
1 body
9 book
1 boost
1 break
1 breakdown
1 budget
5 build
3 bul
3 busy
2 button
2 buy
1 by
2 ca
5 cal
1 calc
5 camp
4 campus
17 can
3 cancel
2 cert
1 certain
19 chang
1 charg
1 charlotte
7 check
1 child
3 chuck
7 class
2 click
2 clos
1 cniditoin
8 cod
3 col
4 colleg
1 collerg
3 com
1 commun
3 complet
1 comput
2 condit
1 confus
1 consid
22 contact
6 cool
3 coolest
3 cop
18 cost
2 could
1 councel
6 counsel
1 cours
1 cous
10 cov
5 credit
2 cur
1 dac
5 dat
2 day
1 de
33 deadlin
4 decid
1 declin
1 de

## 4. Start analyzing

### Random Forest

### Initialize only for labeled data
dont run this step for unlabeled

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, df["Intent_Number"] )

In [22]:
forest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [23]:
# Apply the classifier we trained to the same data, just as a test
forest.predict(train_data_features)[0:10]

array([  1,   1,   2,   2,   2,   2,   2,   3,   3, 144])

In [24]:
# View the predicted probabilities of the first 10 observations
forest.predict_proba(train_data_features)[0:100]

array([[ 0.61      ,  0.        ,  0.01666667, ...,  0.        ,
         0.02      ,  0.        ],
       [ 0.56      ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.73      ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.05178571, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.06333333],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [33]:
train_data_features[742]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

## Do some predicting with unlabeled data

In [34]:
# Read the unlabeled data
df_unlabeled = pd.read_csv("./data/FinAid_Unlabeled.csv")

# Verify that there are 25,000 rows and 2 columns
print (df_unlabeled.shape)


(1366, 1)


In [44]:
# Create an empty list and append the clean reviews one by one
num_questions_unlabeled = len(df_unlabeled["question"])
questions_unlabeled = [] 

for i in range(0,num_questions_unlabeled):
    if( (i+1) % 100 == 0 ):
        print ("Question {} of {}".format(i+1, num_questions_unlabeled))
    questions_unlabeled.append( parseQuestion(df_unlabeled["question"][i]) )
    

Question 100 of 1366
Question 200 of 1366
Question 300 of 1366
Question 400 of 1366
Question 500 of 1366
Question 600 of 1366
Question 700 of 1366
Question 800 of 1366
Question 900 of 1366
Question 1000 of 1366
Question 1100 of 1366
Question 1200 of 1366
Question 1300 of 1366


In [47]:
print(len(questions_unlabeled))
questions_unlabeled[434]

1366


'how long tak receiv answ fin ap'

In [50]:
# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(questions_unlabeled)
test_data_features = test_data_features.toarray()

test_data_features.shape

(1366, 507)

In [51]:

# Use the random forest to make intention number  predictions
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "question" column and a "intent" column
output = pd.DataFrame( data={"question":df_unlabeled["question"], "intent":result} )

# Use pandas to write the comma-separated output file
output.to_csv( "Bag_of_Words_model.csv", index=False, quoting=3 )

In [54]:
output.tail()

Unnamed: 0,intent,question
1361,50,Am I edible for grants after the march 1st faf...
1362,55,Whos is the contact to send a letter requestin...
1363,126,What is included in the room and board fees
1364,135,I would like to set up an appointment to come ...
1365,55,I just submitted my FAFSA application on the F...


From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):

> Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect **numerical feature vectors with a fixed size** rather than the **raw text documents with variable length**.

We will use [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) to "convert text into a matrix of token counts":

### This does all the bag-of-words and parsing work for you automatically
#### 1. Import

In [66]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer

#### 2. Instantiate

In [256]:
# import and instantiate CountVectorizer (with the default parameters)
vect = CountVectorizer()

#### 3. Fit

In [257]:
# learn the 'vocabulary' of the training data (occurs in-place)
#all fit does is learn the vocab
vect.fit(df['question'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [258]:
# examine the fitted vocabulary
# shows you the vocab it learned
vect.get_feature_names()

['1040',
 '1098',
 '1098t',
 '14',
 '18',
 '2015',
 '2016',
 '2017',
 '2018',
 '529',
 '5506',
 '687',
 '704',
 'able',
 'about',
 'abroad',
 'acad3mic',
 'academic',
 'academically',
 'accept',
 'accepted',
 'access',
 'account',
 'accounting',
 'accounts',
 'actual',
 'actually',
 'additional',
 'address',
 'adios',
 'adjust',
 'admission',
 'admissions',
 'admit',
 'affected',
 'after',
 'again',
 'agreement',
 'aid',
 'aide',
 'aids',
 'alicia',
 'all',
 'already',
 'also',
 'alternative',
 'am',
 'amount',
 'an',
 'analytics',
 'and',
 'another',
 'answer',
 'answered',
 'answers',
 'any',
 'aply',
 'app',
 'appeal',
 'application',
 'applied',
 'apply',
 'applying',
 'appointment',
 'approved',
 'april',
 'are',
 'as',
 'ask',
 'asked',
 'associate',
 'at',
 'athletic',
 'attend',
 'attendance',
 'automated',
 'avail',
 'available',
 'average',
 'award',
 'awarded',
 'awarding',
 'awards',
 'away',
 'babies',
 'back',
 'balance',
 'bank',
 'banner',
 'be',
 'because',
 'become',


#### 4. Transform
role of tranform is to transform your data into a document-term-matrix

In [259]:
# transform training data into a 'document-term matrix'
simple_train_dtm = vect.transform(df['question'])
simple_train_dtm

<743x664 sparse matrix of type '<class 'numpy.int64'>'
	with 4712 stored elements in Compressed Sparse Row format>

can see above we got a matrix 743x664.  This means there are 743 questions, and a total fo 664 unique words in the corpus.  (TODO: why different word count than manual BoW above?)

In [260]:
# convert sparse matrix to a dense matrix
simple_train_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## This is our bag of words for training data:
aka document term matrix for labeled data

In [261]:
# examine the vocabulary and document-term matrix together
#this is our 'X' for training the models
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,1040,1098,1098t,14,18,2015,2016,2017,2018,529,...,worksheet,workshop,workstudy,would,year,yearly,yet,you,your,zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):

> In this scheme, features and samples are defined as follows:

> - Each individual token occurrence frequency (normalized or not) is treated as a **feature**.
> - The vector of all the token frequencies for a given document is considered a multivariate **sample**.

> A **corpus of documents** can thus be represented by a matrix with **one row per document** and **one column per token** (e.g. word) occurring in the corpus.

> We call **vectorization** the general process of turning a collection of text documents into numerical feature vectors. This specific strategy (tokenization, counting and normalization) is called the **Bag of Words** or "Bag of n-grams" representation. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document.

In [262]:
# check the type of the document-term matrix
type(simple_train_dtm)

scipy.sparse.csr.csr_matrix

In [263]:
# examine the sparse matrix contents
print(simple_train_dtm)

  (0, 125)	1
  (0, 377)	1
  (0, 379)	1
  (0, 528)	1
  (0, 601)	1
  (0, 634)	1
  (1, 97)	1
  (1, 195)	1
  (1, 302)	1
  (1, 377)	1
  (1, 634)	1
  (2, 38)	1
  (2, 61)	1
  (2, 195)	1
  (2, 250)	1
  (2, 255)	1
  (2, 379)	1
  (2, 601)	1
  (2, 644)	1
  (3, 7)	1
  (3, 61)	1
  (3, 117)	1
  (3, 235)	1
  (3, 255)	2
  (3, 644)	1
  :	:
  (738, 78)	1
  (738, 591)	1
  (738, 609)	1
  (738, 643)	1
  (739, 104)	1
  (739, 324)	1
  (739, 591)	1
  (739, 609)	1
  (739, 642)	1
  (740, 53)	1
  (740, 99)	1
  (740, 201)	1
  (740, 283)	1
  (740, 286)	1
  (740, 387)	1
  (740, 544)	1
  (741, 71)	1
  (741, 324)	1
  (741, 609)	1
  (741, 618)	1
  (741, 642)	1
  (742, 54)	1
  (742, 75)	1
  (742, 324)	1
  (742, 596)	1


From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):

> As most documents will typically use a very small subset of the words used in the corpus, the resulting matrix will have **many feature values that are zeros** (typically more than 99% of them).

> For instance, a collection of 10,000 short text documents (such as emails) will use a vocabulary with a size in the order of 100,000 unique words in total while each document will use 100 to 1000 unique words individually.

> In order to be able to **store such a matrix in memory** but also to **speed up operations**, implementations will typically use a **sparse representation** such as the implementations available in the `scipy.sparse` package.

## Now lets load up the test data
this is the unlabeled questions, we will try to predict these

In [264]:
# example text for model testing
simple_test = df_unlabeled['question']

In order to **make a prediction**, the new observation must have the **same features as the training observations**, both in number and meaning.

transform the data into dtm.  No need to fit here, that is done by training data and we are using that vocab to predict these questions

In [265]:
# transform testing data into a document-term matrix (using existing vocabulary)
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [266]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,1040,1098,1098t,14,18,2015,2016,2017,2018,529,...,worksheet,workshop,workstudy,would,year,yearly,yet,you,your,zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Summary:**

- `vect.fit(train)` **learns the vocabulary** of the training data
- `vect.transform(train)` uses the **fitted vocabulary** to build a document-term matrix from the training data
- `vect.transform(test)` uses the **fitted vocabulary** to build a document-term matrix from the testing data (and **ignores tokens** it hasn't seen before)

**Notes**

- drop unknown words from test dtm because they have no predictive value (since they arent known to the model)

## Part 3: Reading a text-based dataset into pandas

In [250]:
# read file into pandas using a relative path
#path = 'data/sms.tsv'
#sms = pd.read_table(path, header=None, names=['label', 'message'])

#swap in our df
sms = df
sms.rename(columns = {'label': 'Intent_Number'}, inplace=True)
sms.columns

Index(['question', 'Intent_Number'], dtype='object')

In [251]:
# alternative: read file into pandas from a URL
# url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
# sms = pd.read_table(url, header=None, names=['label', 'message'])

In [252]:
# examine the shape
sms.shape

(743, 2)

In [253]:
# examine the first 10 rows
sms.head(10)

Unnamed: 0,question,Intent_Number
0,I need to view my semester charges,1
1,How do I view my bill?,1
2,When do I need to apply for Financial Aid,2
3,When can I apply for fasfa for 2017?,2
4,when do i need to apply for financial aid for ...,2
5,When is the last day to apply fill out the faf...,2
6,I need to re-apply for financial aid. When is ...,2
7,when is spring break,3
8,When can,3
9,when,3


In [156]:
# examine the class distribution
sms.Intent_Number.value_counts()

144    132
55     113
90      21
49      21
135     20
50      14
42      12
32      11
58      11
128      9
56       9
59       8
149      8
150      8
72       8
157      8
51       7
141      7
80       7
105      7
121      6
60       6
101      6
27       6
22       6
9        6
11       5
34       5
39       5
2        5
      ... 
156      1
109      1
106      1
104      1
88       1
91       1
92       1
93       1
94       1
96       1
73       1
69       1
131      1
67       1
41       1
130      1
127      1
48       1
124      1
122      1
52       1
53       1
118      1
117      1
116      1
115      1
114      1
65       1
66       1
123      1
Name: Intent_Number, Length: 157, dtype: int64

In [158]:
# convert label to a numerical variable
#sms['label_num'] = sms.label.map({'ham':0, 'spam':1})

#dont need this step, our intentions are already numeric

In [159]:
# check that the conversion worked
sms.head(10)

Unnamed: 0,question,Intent_Number
0,I need to view my semester charges,1
1,How do I view my bill?,1
2,When do I need to apply for Financial Aid,2
3,When can I apply for fasfa for 2017?,2
4,when do i need to apply for financial aid for ...,2
5,When is the last day to apply fill out the faf...,2
6,I need to re-apply for financial aid. When is ...,2
7,when is spring break,3
8,When can,3
9,when,3


### Create X and y for use with model
> Question is our data set for training, and intent_number is our response.  We can just use column values as X and Y, they will work with CountVectorizer.  Ok that X is one dimensional for now, CountVectorizer.transform will convert it to two dimensional

In [254]:
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
X = sms.question
y = sms.Intent_Number
print(X.shape)
print(y.shape)

(743,)
(743,)


In [255]:
# split X and y into training and testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(557,)
(186,)
(557,)
(186,)


## Part 4: Vectorizing our dataset

In [180]:
# instantiate the vectorizer
vect = CountVectorizer()

- ###  try other instantiations

In [340]:
vect = CountVectorizer(stop_words='english', ngram_range=(1, 3),min_df=2)
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [341]:
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

next cell does same thing as previous cell and is more efficient (and more typical)

In [342]:
# equivalently: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)

In [343]:
# examine the document-term matrix
X_train_dtm

<557x413 sparse matrix of type '<class 'numpy.int64'>'
	with 2577 stored elements in Compressed Sparse Row format>

testing data is same width because it uses the training data feature set (runs test looking for same words that were in training data)

In [344]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<186x413 sparse matrix of type '<class 'numpy.int64'>'
	with 711 stored elements in Compressed Sparse Row format>

## Part 5: Building and evaluating a model

We will use [multinomial Naive Bayes](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html):

> The multinomial Naive Bayes classifier is suitable for classification with **discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [345]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [346]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

CPU times: user 3.19 ms, sys: 1.41 ms, total: 4.6 ms
Wall time: 3.81 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [347]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

### yikes, only predicts correctly 27% of the time

In [348]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.32795698924731181

## Confusion Matrix
- each column corresponds to a feature predicted value
- each row corresponds to a feature actual value
- every prediction fits somewhere on that table since every prediction has a predicted value and an actual value
- "true positive", "false positive" terminology really only makes sense when there are only 2 classes

In [349]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [350]:
# print message text for the false positives (ham incorrectly classified as spam)


In [351]:
# print message text for the false negatives (spam incorrectly classified as ham)


In [352]:
# example false negative
X_test[3132]

KeyError: 3132

### this shows probability that each row is in each class
btw, naive bayes not precise with probabilities. Good classifier, not as good at predictive probabilities

In [353]:
nb.predict_proba(X_test_dtm)

array([[  8.91153270e-04,   2.75565605e-03,   2.57866526e-03, ...,
          8.84711224e-04,   8.78331119e-04,   5.08400465e-03],
       [  1.79533214e-03,   7.18132855e-03,   5.38599641e-03, ...,
          1.79533214e-03,   1.79533214e-03,   1.07719928e-02],
       [  8.45018609e-07,   1.67231795e-04,   2.44516875e-06, ...,
          8.38910065e-07,   8.32860256e-07,   4.82080769e-06],
       ..., 
       [  1.79533214e-03,   7.18132855e-03,   5.38599641e-03, ...,
          1.79533214e-03,   1.79533214e-03,   1.07719928e-02],
       [  1.79533214e-03,   7.18132855e-03,   5.38599641e-03, ...,
          1.79533214e-03,   1.79533214e-03,   1.07719928e-02],
       [  6.98512714e-11,   5.37947398e-09,   3.54126432e-10, ...,
          1.35051763e-10,   6.52832377e-11,   3.31235002e-10]])

### show same thing, just for first class

In [354]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]  #[all rows:column 1]
y_pred_prob

array([  2.75565605e-03,   7.18132855e-03,   1.67231795e-04,
         5.20528544e-03,   6.80106710e-03,   8.34200576e-03,
         6.16787731e-03,   8.47212264e-03,   4.75356605e-03,
         1.38597569e-02,   4.18961434e-03,   7.18132855e-03,
         2.13283476e-08,   1.13535327e-07,   7.44670698e-03,
         7.18132855e-03,   7.18132855e-03,   7.70802832e-03,
         1.79596709e-08,   7.08129485e-03,   1.56685882e-03,
         7.18132855e-03,   1.67231795e-04,   2.10093272e-04,
         7.80073766e-03,   5.17078201e-03,   2.58718225e-03,
         9.19813526e-05,   7.99376919e-03,   3.35498214e-03,
         2.09873723e-06,   6.13786269e-03,   6.28193166e-04,
         3.57047111e-03,   5.61953112e-03,   3.63635660e-03,
         7.18132855e-03,   8.61482042e-04,   5.66473878e-03,
         6.69193053e-03,   9.43322849e-03,   7.12825629e-06,
         5.66473878e-03,   5.17078201e-03,   8.17688177e-05,
         7.46031268e-03,   2.79039405e-03,   4.90177205e-03,
         1.53827987e-03,

In [355]:
# calculate AUC  (area under the curve)
metrics.roc_auc_score(y_test, y_pred_prob)

ValueError: multiclass format is not supported

## Part 6: Comparing models

### Logistic Regression

We will compare multinomial Naive Bayes with [logistic regression](http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression):

> Logistic regression, despite its name, is a **linear model for classification** rather than regression. Logistic regression is also known in the literature as logit regression, maximum-entropy classification (MaxEnt) or the log-linear classifier. In this model, the probabilities describing the possible outcomes of a single trial are modeled using a logistic function.

In [356]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [357]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)

CPU times: user 72.7 ms, sys: 2.76 ms, total: 75.4 ms
Wall time: 81.9 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [358]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [359]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([ 0.00441734,  0.00561398,  0.00616018,  0.00461496,  0.00513524,
        0.00551242,  0.00468016,  0.00566849,  0.00508452,  0.01140142,
        0.00459646,  0.00561398,  0.00418529,  0.00669058,  0.00538839,
        0.00561398,  0.00561398,  0.00552025,  0.00272419,  0.00550533,
        0.00481968,  0.00561398,  0.00616018,  0.00544048,  0.00512549,
        0.00476259,  0.00472122,  0.00384599,  0.00512075,  0.00553264,
        0.00319095,  0.00505816,  0.00627413,  0.00451949,  0.00483045,
        0.00474324,  0.00561398,  0.00442014,  0.00501416,  0.00496637,
        0.01188418,  0.00292861,  0.00501416,  0.00476259,  0.00594122,
        0.00512063,  0.01482964,  0.00446316,  0.00401875,  0.00487955,
        0.00536885,  0.00442014,  0.00616018,  0.01034607,  0.00400406,
        0.00603052,  0.01555315,  0.00604261,  0.00448492,  0.01215469,
        0.00503692,  0.00469965,  0.00519469,  0.00509907,  0.00561398,
        0.00561398,  0.00344822,  0.00555048,  0.00220865,  0.04

In [360]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.40322580645161288

### Better!

In [361]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

ValueError: multiclass format is not supported

## Linear SVC

In [362]:
# import and instantiate a logistic regression model
from sklearn import svm
linearSVC = svm.SVC()

In [363]:
# train the model using X_train_dtm
%time linearSVC.fit(X_train_dtm, y_train)

CPU times: user 81.9 ms, sys: 3.34 ms, total: 85.3 ms
Wall time: 91.7 ms


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [364]:
# make class predictions for X_test_dtm
y_pred_class = linearSVC.predict(X_test_dtm)

In [365]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = linearSVC.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

AttributeError: predict_proba is not available when  probability=False

In [366]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.22043010752688172

### Worse!

In [367]:
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 50) 


In [368]:
# train the model using X_train_dtm
%time forest.fit(X_train_dtm, y_train)

CPU times: user 149 ms, sys: 18 ms, total: 167 ms
Wall time: 168 ms


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [369]:
# make class predictions for X_test_dtm
y_pred_class = forest.predict(X_test_dtm)

In [370]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = forest.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.02,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.04,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,

In [371]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.38709677419354838

### best one yet

## Part 7: Examining a model for further insight

We will examine the our **trained Naive Bayes model** to calculate the approximate **"spamminess" of each token**.

In [None]:
# store the vocabulary of X_train
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)

In [None]:
# examine the first 50 tokens
print(X_train_tokens[0:50])

In [None]:
# examine the last 50 tokens
print(X_train_tokens[-50:])

In [None]:
# Naive Bayes counts the number of times each token appears in each class
nb.feature_count_

In [None]:
# rows represent classes, columns represent tokens
nb.feature_count_.shape

In [None]:
# number of times each token appears across all HAM messages
ham_token_count = nb.feature_count_[0, :]
ham_token_count

In [None]:
# number of times each token appears across all SPAM messages
spam_token_count = nb.feature_count_[1, :]
spam_token_count

In [None]:
# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({'token':X_train_tokens, 'ham':ham_token_count, 'spam':spam_token_count}).set_index('token')
tokens.head()

In [None]:
# examine 5 random DataFrame rows
tokens.sample(5, random_state=6)

In [None]:
# Naive Bayes counts the number of observations in each class
nb.class_count_

Before we can calculate the "spamminess" of each token, we need to avoid **dividing by zero** and account for the **class imbalance**.

In [None]:
# add 1 to ham and spam counts to avoid dividing by 0
tokens['ham'] = tokens.ham + 1
tokens['spam'] = tokens.spam + 1
tokens.sample(5, random_state=6)

In [None]:
# convert the ham and spam counts into frequencies
tokens['ham'] = tokens.ham / nb.class_count_[0]
tokens['spam'] = tokens.spam / nb.class_count_[1]
tokens.sample(5, random_state=6)

In [None]:
# calculate the ratio of spam-to-ham for each token
tokens['spam_ratio'] = tokens.spam / tokens.ham
tokens.sample(5, random_state=6)

In [None]:
# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('spam_ratio', ascending=False)

In [None]:
# look up the spam_ratio for a given token
tokens.loc['dating', 'spam_ratio']

## Part 8: Practicing this workflow on another dataset

Please open the **`exercise.ipynb`** notebook (or the **`exercise.py`** script).

## Part 9: Tuning the vectorizer (discussion)

Thus far, we have been using the default parameters of [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html):


TODO - BILL, try these

In [None]:
# show default parameters for CountVectorizer
vect

However, the vectorizer is worth tuning, just like a model is worth tuning! Here are a few parameters that you might want to tune:

- **stop_words:** string {'english'}, list, or None (default)
    - If 'english', a built-in stop word list for English is used.
    - If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
    - If None, no stop words will be used.

In [None]:
# remove English stop words
vect = CountVectorizer(stop_words='english')

- **ngram_range:** tuple (min_n, max_n), default=(1, 1)
    - The lower and upper boundary of the range of n-values for different n-grams to be extracted.
    - All values of n such that min_n <= n <= max_n will be used.

In [None]:
# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))

- **max_df:** float in range [0.0, 1.0] or int, default=1.0
    - When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).
    - If float, the parameter represents a proportion of documents.
    - If integer, the parameter represents an absolute count.

In [None]:
# ignore terms that appear in more than 50% of the documents
vect = CountVectorizer(max_df=0.5)

- **min_df:** float in range [0.0, 1.0] or int, default=1
    - When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold. (This value is also called "cut-off" in the literature.)
    - If float, the parameter represents a proportion of documents.
    - If integer, the parameter represents an absolute count.

In [None]:
# only keep terms that appear in at least 2 documents
vect = CountVectorizer(min_df=2)

**Guidelines for tuning CountVectorizer:**

- Use your knowledge of the **problem** and the **text**, and your understanding of the **tuning parameters**, to help you decide what parameters to tune and how to tune them.
- **Experiment**, and let the data tell you the best approach!