# Working with Text Data in scikit-learn

## Agenda

1. Model building in scikit-learn (refresher)
2. Representing text as numerical data
3. Reading the SMS data
4. Vectorizing the SMS data
5. Building a Naive Bayes model
6. Comparing Naive Bayes with logistic regression
7. Calculating the "spamminess" of each token
8. Creating a DataFrame from individual text files

In [1]:
# use print only as a function  ## Don't need to run this if in Python 3
from __future__ import print_function

## Part 1: Create the DTM with an ETL process

In [28]:
__author__ = 'swe03'

import numpy as np
import pandas as pd

desired_width = 250
pd.set_option('display.width',desired_width)

In [29]:
# Read in the csv files created in SQL Assistant, downloaded locally, and saved to C:\NLP_Files

co_notes = pd.read_csv("C:\\NLP_Files\\cancelled_ord_notes2.csv")
#co_notes.head(2)
#type(co_notes)

done_notes = pd.read_csv("C:\\NLP_Files\\done_ord_notes2.csv")
#done_notes



In [30]:
# Add the Target column and value
co_notes['order'] = 0
#co_notes.head(1)
#type(co_notes["order"])  # This is a Series 
#print(co_notes["order"].dtype) # This is an int56
#type(co_notes["CUST_ORD_NBR"])  # This is a Series

done_notes['order'] = 1
#done_notes

In [31]:
# I would rather use SQL for the data manipulation so.... installed the pandasql package via the Conda CLI interface 
# (i.e., Anaconda Prompt) 

from pandasql import PandaSQL 
pdsql = PandaSQL()
#type(pdsql)

## These are just a few illustrative examples of using the SQL functionality
#pdsql("SELECT * from done_notes limit 5;",locals())
#pdsql("""SELECT d.cust_ord_nbr, d.svc_line_nbr, d.crt_dt, d.stat_eff_ts, d.svc_typ_cd, d.svc_stat_cd, d.user_aud_id, d."order" from done_notes d """,locals())
#all_notes = pdsql("""SELECT d.cust_ord_nbr, d.svc_line_nbr, d.crt_dt, d.stat_eff_ts, d.svc_typ_cd, d.svc_stat_cd, d.user_aud_id, d."order",
#                   c."order" From done_notes d Join co_notes c On  d.cust_ord_nbr = c.cust_ord_nbr and d.str_loc_id   = d.str_loc_id""", locals())

all_notes = pd.concat([done_notes, co_notes])    ## Concatenate the two dataframes

## Create a compound unique key for Customer and Location
all_notes["CUST_LOC"] = all_notes.CUST_ORD_NBR.astype(str).str.cat(all_notes.STR_LOC_ID.astype(str), sep='_')
#all_notes

## Create some indexes
#all_notes_i = all_notes.set_index(['CUST_ORD_NBR','STR_LOC_ID'])  ## Hierarchical Index
all_notes_i = all_notes.set_index(['CUST_LOC'])                   ## Single Index with combined Columns(created above)
#idx = all_notes_i.index
all_notes_i





Unnamed: 0_level_0,CUST_ORD_NBR,SVC_LINE_NBR,SVC_NOTE_NBR,STR_LOC_ID,LAST_UPD_SNSH_ID,LAST_UPD_TS,SRC_CRT_TS,CRT_USER_AUD_ID,TKLR_BIN_CD,FLLW_UP_DT,...,SRC_LAST_UPD_TS,SVC_NOTE_TXT,PREV_TKLR_BIN_CD,TBIN_UPD_USER_ID,TBIN_UPD_TS,CRT_SRC_MOD_ID,UPD_SRC_MOD_ID,ASSG_ASSOC_USER_ID,ACTV_FLG,order
CUST_LOC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
660721_258,660721,40,1,258,4657408,13May2016 18:46:52.584,23Apr2016 17:53:38.001,JLH1971,4,,...,13May2016 8:40:43.555,2nd Call to Reschedule - 7 days- Left the cust...,-1,,,,,,Y,1
660721_258,660721,40,2,258,4657408,13May2016 18:46:52.647,23Apr2016 17:53:58.001,JLH1971,4,,...,13May2016 8:40:43.555,2nd Call to Reschedule - 7 days- See previous ...,-1,,,,,,Y,1
664440_258,664440,1,2,258,4673426,18May2016 7:50:57.715,28Apr2016 12:53:55.001,SYSCOM,4,,...,17May2016 15:00:56.227,** FROM VNDR ATTN:S0101 RESCHEDULED - DUE TO O...,-1,,,SYSCOM,,,Y,1
660721_258,660721,40,3,258,4657408,13May2016 18:46:52.659,29Apr2016 12:49:06.001,TAM5409,4,,...,13May2016 8:40:43.555,2nd Call to Reschedule - 7 days- See previous ...,-1,,,,,,Y,1
660721_258,660721,40,4,258,4657408,13May2016 18:46:52.648,29Apr2016 12:49:30.001,TAM5409,4,,...,13May2016 8:40:43.555,2nd Call to Reschedule - 7 days- See previous ...,-1,,,,,,Y,1
664440_258,664440,1,3,258,4673426,18May2016 7:50:57.717,29Apr2016 18:06:10.001,SYSCOM,4,,...,17May2016 15:00:56.227,** FROM VNDR ATTN:S0101 PRODUCT SHIPPED; EXPEC...,-1,,,SYSCOM,,,Y,1
664440_258,664440,2,1,258,4673426,18May2016 7:50:57.716,03May2016 13:53:27.001,INSTLR,5,03May2016,...,17May2016 15:00:56.227,**FROM D23 SVC PROV:Carpet was received damage...,-1,,,sv,,,Y,1
664440_258,664440,2,2,258,4673426,18May2016 7:50:57.711,03May2016 14:19:24.001,CMV407,3,03May2016,...,17May2016 15:00:56.227,Thank you!!,-1,,,sv,,,Y,1
660721_258,660721,21,2,258,4657408,13May2016 18:46:52.799,03May2016 18:25:23.001,SYSCOM,4,,...,13May2016 8:40:43.555,** FROM VNDR ATTN:S2136 PRODUCT SHIPPED; EXPEC...,-1,,,SYSCOM,,,Y,1
664440_258,664440,2,3,258,4673426,18May2016 7:50:57.718,04May2016 13:42:05.001,INSTLR,5,04May2016,...,17May2016 15:00:56.227,"**FROM D23 SVC PROV:no prob : XML,SYSTEM",-1,,,sv,,,Y,1


## Create the First and Last flags to control main processing step

In [32]:
### Select the relevant set of variables
all_notes_2 = pdsql("""SELECT cust_loc,"order",svc_note_txt from all_notes""",locals())
#all_notes_2.head(10)

### Create the first and last flags
all_notes_f = all_notes_2.groupby(['CUST_LOC']).first()
all_notes_f['first_flg'] = 1
#Flag_FRecs.reset_index(inplace=True)
##Flag_FRecs.ix[:2,['post_visid','date_time','first_flg','last_flg']]

all_notes_l = all_notes_2.groupby(['CUST_LOC']).last()
all_notes_l['last_flg'] = 1
#Flag_LRecs.reset_index(inplace=True)
##Flag_LRecs.ix[:2,['post_visid','date_time','first_flg','last_flg']]

#print(all_notes_f)
#print(all_notes_l)

dataframes = [all_notes_f, all_notes_l]  # This is a List to just create the flag_all dataframe 
flag_all = pd.concat(dataframes)  # This is a dataframe
flag_all

Unnamed: 0_level_0,SVC_NOTE_TXT,first_flg,last_flg,order
CUST_LOC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
589107_258,"Claudia, contacted Jason and customer is sched...",1.0,,0
649373_258,Appliance Ordering System - This Order Status ...,1.0,,0
657612_258,"**FROM D30 SVC PROV:LEFT MESSAGE : XML,SYSTEM",1.0,,0
657866_258,** READ BEFORE SALE ** Warning: Stairs may be ...,1.0,,0
659201_258,Add trip charge labor,1.0,,0
660721_258,2nd Call to Reschedule - 7 days- Left the cust...,1.0,,1
661105_258,** READ BEFORE SALE ** Warning: Inform custome...,1.0,,0
661227_258,Measure has been confirmed with vendor.,1.0,,0
662261_258,** FROM VNDR ATTN:S0101 PRODUCT SHIPPED; EXPEC...,1.0,,0
662866_258,**FROM D30 SVC PROV:Phantom was given a Chicag...,1.0,,0


In [33]:
Final_Recs = pdsql("""Select a.*, b.first_flg, b.last_flg   
       From all_notes_2 a   
       left outer join flag_all b
       On  a.cust_loc = b.cust_loc
       and a.svc_note_txt = b.svc_note_txt""", locals())
       
Final_Recs

Unnamed: 0,CUST_LOC,order,SVC_NOTE_TXT,first_flg,last_flg
0,660721_258,1,2nd Call to Reschedule - 7 days- Left the cust...,1,
1,660721_258,1,2nd Call to Reschedule - 7 days- See previous ...,,
2,664440_258,1,** FROM VNDR ATTN:S0101 RESCHEDULED - DUE TO O...,1,
3,660721_258,1,2nd Call to Reschedule - 7 days- See previous ...,,
4,660721_258,1,2nd Call to Reschedule - 7 days- See previous ...,,
5,664440_258,1,** FROM VNDR ATTN:S0101 PRODUCT SHIPPED; EXPEC...,,
6,664440_258,1,**FROM D23 SVC PROV:Carpet was received damage...,,
7,664440_258,1,Thank you!!,,
8,660721_258,1,** FROM VNDR ATTN:S2136 PRODUCT SHIPPED; EXPEC...,,
9,664440_258,1,"**FROM D23 SVC PROV:no prob : XML,SYSTEM",,1


In [None]:
#print(type(all_notes_2))
#print(all_notes_2)
#all_notes_t = all_notes_2.T
#all_notes_t
#all_notes_t["all_txt"] = [' '.join(row) for row in all_notes_t[all_notes_t.columns[:]].values]   ## Did not work
#all_notes_t

In [34]:
temp_list = []
svcs_notes = pd.DataFrame()

                     

# Loop through the records to create the ADS
n = 0
for index, row in Final_Recs.iterrows():  # if you don't include index then "TypeError: tuple indices must be integers or slices, not str"
    #print('top o the for statement')
    #print('evar4 =', (row['evar4 ']))
    #print ('first_flg=', CR_merge['first_flg_y'][n])
    
    if row['first_flg'] == 1 :  # This is to start the build of the Cust_Loc single notes variable
        temp_list = []
        temp_list.append(row['SVC_NOTE_TXT'])
    elif row['last_flg'] == 1:
        #print('in the last flag and cust_loc = ', row['CUST_LOC'])
        temp_list.append(row['SVC_NOTE_TXT'])
        temp_dict = {'CUST_LOC': row['CUST_LOC'], 'order': row['order'], 'all_notes':[temp_list]   }
        df1 = pd.DataFrame(temp_dict)
        svcs_notes = svcs_notes.append(df1,ignore_index=True)
        temp_list = []
    else:
        temp_list.append(row['SVC_NOTE_TXT'])
        
svcs_notes.to_csv("C:\\NLP_Files\\svcs_notes.csv")

##type(svcs_notes)  This is a dataframe

## Instructive
## df1 = DataFrame({'test_set': [test_set]})
## print str(list_of_ints).strip('[]')    ## where list_of_ints was a lIST dtype
## svcs_notes.all_notes.apply(str)

In [35]:
#temp_list
type(svcs_notes)
##pd.options.display.max_colwidth = 500
svcs_notes

Unnamed: 0,CUST_LOC,all_notes,order
0,664440_258,[** FROM VNDR ATTN:S0101 RESCHEDULED - DUE TO ...,1
1,666793_258,[** FROM VNDR ATTN:S0101 PRODUCT SHIPPED; EXPE...,1
2,660721_258,[Cust. said she already picked up these items ...,1
3,667997_258,[Appliance Ordering System - This Order Status...,1
4,667997_258,[Appliance Ordering System - This Order Status...,1
5,669201_258,[David/SSD Supervisor is getting with Dayzetta...,1
6,669704_258,[Appliance Ordering System - This Order Was Su...,1
7,669704_258,[Appliance Ordering System - This Order Status...,1
8,662866_258,[** READ BEFORE SALE ** Warning: Inform custom...,0
9,665180_258,[Mike-why did you do this as a ship to store o...,0


## Part 1: Model building in scikit-learn (refresher)

In order to **build a model**, the features must be **numeric**, and every observation must have the **same features in the same order**.

In order to make a **prediction**, the new observation must have the **same features as the training observations**, both in number and meaning.

In [17]:
# store the feature matrix (X) and response vector (y).  The pdsql function automatically creates a DataFrame
# This was just trying to see if I could load the List into a Feature Matrix.  The pdsql failed since the list object
# svc_note_txt is an unsupported type for pdsql
## X = pdsql("""SELECT svc_note_txt,cust_loc from out""",locals())
## y = pdsql("""SELECT "order" from out""",locals())

## Part 2: Representing text as numerical data

In [8]:
# example text for model training (SMS messages)
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']

From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):

> Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect **numerical feature vectors with a fixed size** rather than the **raw text documents with variable length**.

We will use [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) to "convert text into a matrix of token counts":

In [9]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [10]:
# learn the 'vocabulary' of the training data (occurs in-place)
vect.fit(simple_train)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [11]:
# examine the fitted vocabulary
vect.get_feature_names()

[u'cab', u'call', u'me', u'please', u'tonight', u'you']

In [12]:
# transform training data into a 'document-term matrix'
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm

<3x6 sparse matrix of type '<type 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [13]:
# convert sparse matrix to a dense matrix
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [14]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):

> In this scheme, features and samples are defined as follows:

> - Each individual token occurrence frequency (normalized or not) is treated as a **feature**.
> - The vector of all the token frequencies for a given document is considered a multivariate **sample**.

> A **corpus of documents** can thus be represented by a matrix with **one row per document** and **one column per token** (e.g. word) occurring in the corpus.

> We call **vectorization** the general process of turning a collection of text documents into numerical feature vectors. This specific strategy (tokenization, counting and normalization) is called the **Bag of Words** or "Bag of n-grams" representation. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document.

In [15]:
# print the sparse matrix
print(simple_train_dtm)

  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):

> As most documents will typically use a very small subset of the words used in the corpus, the resulting matrix will have **many feature values that are zeros** (typically more than 99% of them).

> For instance, a collection of 10,000 short text documents (such as emails) will use a vocabulary with a size in the order of 100,000 unique words in total while each document will use 100 to 1000 unique words individually.

> In order to be able to **store such a matrix in memory** but also to **speed up operations**, implementations will typically use a **sparse representation** such as the implementations available in the `scipy.sparse` package.

In [16]:
# example text for model testing
simple_test = ["please don't call me"]

In order to make a **prediction**, the new observation must have the **same features as the training observations**, both in number and meaning.

In [17]:
# transform testing data into a document-term matrix (using existing vocabulary)
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 1, 1, 1, 0, 0]], dtype=int64)

In [18]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


**Summary:**

- `vect.fit(train)` **learns the vocabulary** of the training data
- `vect.transform(train)` uses the **fitted vocabulary** to build a document-term matrix from the training data
- `vect.transform(test)` uses the **fitted vocabulary** to build a document-term matrix from the testing data (and **ignores tokens** it hasn't seen before)

# Part 3: Reading the Home Services Data data
## This is where I deviate from the MLtext2 class since I'm using Home Depot data

In [11]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [12]:
## Creating a new csv above (in creating the ADS section) solved a problem when the vect.fit below was initially trying to read 
## the dataframe created above (also in the creating the ADS section) 
svcs_notes2 = pd.read_csv("C:\\NLP_Files\\svcs_notes.csv")

In [13]:
# examine the shape
svcs_notes2.shape

(2, 4)

In [14]:
# examine the first 10 rows
print(svcs_notes2.head(5))
type(svcs_notes2)
svcs_notes2.all_notes.dtype  # this is '0'
#svcs_notes.order.dtype   # this is 'int64'

   Unnamed: 0     CUST_LOC                                          all_notes  order
0           0  643,037_258  ['Customer came in to check on the ETA of the ...      1
1           1  631,853_258  ['11/12 Chassidy will be coming into the store...      0


dtype('O')

In [15]:
# examine the class distribution
svcs_notes2.order.value_counts()

1    1
0    1
Name: order, dtype: int64

In [62]:
# convert label to a numerical variable (only if the dependent variable is non-numeric.... so irrelevant for me)
##sms['label_num'] = sms.label.map({'ham':0, 'spam':1})

In [16]:
# required way to define X and y for use with COUNTVECTORIZER
X = svcs_notes2.all_notes
y = svcs_notes2.order
print(X.shape)
print(y.shape)

(2,)
(2,)


In [17]:
# split X and y into training and testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(X_train)

(1,)
(1,)
1    ['11/12 Chassidy will be coming into the store...
Name: all_notes, dtype: object


## Part 4: Vectorizing the Services data

In [18]:
# instantiate the vectorizer
vect = CountVectorizer()

In [19]:
# learn training data vocabulary, then use it to create a document-term matrix
#type(X_train)
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [20]:
# alternative: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)

In [23]:
# examine the fitted vocabulary
vect.get_feature_names()

['01',
 '02',
 '03',
 '08',
 '10',
 '11',
 '12',
 '13',
 '14',
 '14chassidy',
 '15',
 '16',
 '17',
 '20',
 '2015',
 '21491503',
 '21492074',
 '270',
 '28',
 '2nd',
 '6964',
 '784',
 '800',
 'able',
 'about',
 'action',
 'add',
 'added',
 'advised',
 'afternoon',
 'also',
 'an',
 'and',
 'andno',
 'answered',
 'are21491465',
 'around',
 'asking',
 'at',
 'avery',
 'awaiting',
 'back',
 'bal',
 'be',
 'beam',
 'because',
 'beenon',
 'both',
 'brian',
 'busted',
 'but',
 'call',
 'called',
 'calling',
 'can',
 'cancelled',
 'charge',
 'chassidy',
 'checking',
 'clearance',
 'clopay',
 'coming',
 'company',
 'cont',
 'contact',
 'cos',
 'cost',
 'could',
 'created',
 'creating',
 'cust',
 'customer',
 'd30',
 'days',
 'dead',
 'different',
 'do',
 'door',
 'drill',
 'due',
 'ella',
 'email',
 'emaill',
 'error',
 'extra',
 'feet',
 'find',
 'finding',
 'first',
 'for',
 'from',
 'garage',
 'gdo',
 'gdos',
 'get',
 'go',
 'going',
 'had',
 'had2nd',
 'have',
 'having',
 'hd',
 'he',
 'head'

In [24]:
# examine the document-term matrix
X_train_dtm

<1x229 sparse matrix of type '<class 'numpy.int64'>'
	with 229 stored elements in Compressed Sparse Row format>

In [25]:
# convert sparse matrix to a dense matrix
X_train_dtm.toarray()

array([[ 3,  3,  1,  2,  2, 10,  5,  7,  1,  1,  5,  3,  3,  1,  1,  2,  1,
         1,  1,  5,  1,  1,  1,  1,  3,  1,  2,  1,  1,  1,  2,  1, 10,  2,
         2,  2,  1,  1,  3,  2,  1,  4,  1,  9,  8,  2,  2,  2,  1,  1,  3,
         7,  4,  2,  1,  2,  6,  6,  2,  1,  2,  2,  1,  2,  1,  2,  1,  2,
         2,  1,  2,  7,  8,  5,  2,  1,  2,  4,  1,  2,  8,  2,  1,  1,  1,
         1,  1,  1,  2,  9,  8,  5,  6,  2,  1,  1,  1,  3,  2,  2,  1,  5,
         2,  3, 12,  1,  3,  1,  1,  3,  8,  1,  2,  3, 11,  1,  1,  5,  1,
         2,  1,  1,  3,  1,  3,  1,  3,  2,  4,  2,  2,  2,  1,  3,  1,  1,
         1,  4,  1,  2,  2,  1,  3,  1,  1,  1,  3,  1,  3,  1,  9,  3,  1,
         2,  1,  2,  1,  2,  1,  1,  2,  3,  3,  6,  2,  2,  1,  1,  8,  2,
         2,  1,  1,  3,  1,  1,  1,  3,  2,  2,  4,  2,  2,  6,  2,  1,  1,
         1,  3,  3,  1,  8,  7,  1,  3,  3,  4, 11, 37,  5,  1,  1,  1,  1,
        32,  2,  2,  1,  3,  5,  2,  1,  1,  2,  3,  3,  1,  2,  1,  1,  7,
         3, 

In [26]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,01,02,03,08,10,11,12,13,14,14chassidy,...,which,will,with,work,working,would,wouldn,xml,yesterday,you
0,3,3,1,2,2,10,5,7,1,1,...,1,7,3,1,1,6,1,7,2,3


In [27]:
# transform testing data (using fitted vocabulary) into a document-term matrix
# Notice, of course, that you are not doing a "fit" here since it is creating a DTM for the Test data set
X_test_dtm = vect.transform(X_test)
#X_test_dtm
#print(X_test_dtm)

## Part 5: Building a Naive Bayes model

We will use [Multinomial Naive Bayes](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html):

> The multinomial Naive Bayes classifier is suitable for classification with **discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [34]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [35]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

Wall time: 4 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [36]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [37]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.98851399856424982

In [38]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[1203,    5],
       [  11,  174]])

In [39]:
# print message text for the false positives (meaning they were incorrectly classified as spam)
X_test[y_test < y_pred_class]

574               Waiting for your call.
3375             Also andros ice etc etc
45      No calls..messages..missed calls
3415             No pic. Please re-send.
1988    No calls..messages..missed calls
Name: message, dtype: object

In [40]:
# print message text for the false negatives (meaning they were incorrectly classified as ham)
X_test[y_test > y_pred_class]

3132    LookAtMe!: Thanks for your purchase of a video...
5       FreeMsg Hey there darling it's been 3 week's n...
3530    Xmas & New Years Eve tickets are now on sale f...
684     Hi I'm sue. I am 20 years old and work as a la...
1875    Would you like to see my XXX pics they are so ...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
4298    thesmszone.com lets you send free anonymous an...
4949    Hi this is Amy, we will be sending you a free ...
2821    INTERFLORA - It's not too late to order Inter...
2247    Hi ya babe x u 4goten bout me?' scammers getti...
4514    Money i have won wining number 946 wot do i do...
Name: message, dtype: object

In [41]:
# what do you notice about the false negatives?
X_test[3132]

"LookAtMe!: Thanks for your purchase of a video clip from LookAtMe!, you've been charged 35p. Think you can do better? Why not send a video in a MMSto 32323."

In [42]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([  2.87744864e-03,   1.83488846e-05,   2.07301295e-03, ...,
         1.09026171e-06,   1.00000000e+00,   3.98279868e-09])

In [43]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

0.98664310005369604

## Part 6: Comparing Naive Bayes with logistic regression

In [44]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [45]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)

Wall time: 50 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [47]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([ 0.01269556,  0.00347183,  0.00616517, ...,  0.03354907,
        0.99725053,  0.00157706])

In [48]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.9877961234745154

In [49]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

0.99368176123143015

## Part 7: Calculating the "spamminess" of each token

In [50]:
# store the vocabulary of X_train
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)

7456

In [51]:
# examine the first 50 tokens
print(X_train_tokens[0:50])

[u'00', u'000', u'008704050406', u'0121', u'01223585236', u'01223585334', u'0125698789', u'02', u'0207', u'02072069400', u'02073162414', u'02085076972', u'021', u'03', u'04', u'0430', u'05', u'050703', u'0578', u'06', u'07', u'07008009200', u'07090201529', u'07090298926', u'07123456789', u'07732584351', u'07734396839', u'07742676969', u'0776xxxxxxx', u'07781482378', u'07786200117', u'078', u'07801543489', u'07808', u'07808247860', u'07808726822', u'07815296484', u'07821230901', u'07880867867', u'0789xxxxxxx', u'07946746291', u'0796xxxxxx', u'07973788240', u'07xxxxxxxxx', u'08', u'0800', u'08000407165', u'08000776320', u'08000839402', u'08000930705']


In [52]:
# examine the last 50 tokens
print(X_train_tokens[-50:])

[u'yer', u'yes', u'yest', u'yesterday', u'yet', u'yetunde', u'yijue', u'ym', u'ymca', u'yo', u'yoga', u'yogasana', u'yor', u'yorge', u'you', u'youdoing', u'youi', u'youphone', u'your', u'youre', u'yourjob', u'yours', u'yourself', u'youwanna', u'yowifes', u'yoyyooo', u'yr', u'yrs', u'ything', u'yummmm', u'yummy', u'yun', u'yunny', u'yuo', u'yuou', u'yup', u'zac', u'zaher', u'zealand', u'zebra', u'zed', u'zeros', u'zhong', u'zindgi', u'zoe', u'zoom', u'zouk', u'zyada', u'\xe8n', u'\u3028ud']


In [53]:
# Naive Bayes counts the number of times each token appears in each class
nb.feature_count_

array([[  0.,   0.,   0., ...,   1.,   1.,   1.],
       [  5.,  23.,   2., ...,   0.,   0.,   0.]])

In [54]:
# rows represent classes, columns represent tokens
nb.feature_count_.shape

(2L, 7456L)

In [55]:
# number of times each token appears across all HAM messages
ham_token_count = nb.feature_count_[0, :]
ham_token_count

array([ 0.,  0.,  0., ...,  1.,  1.,  1.])

In [56]:
# number of times each token appears across all SPAM messages
spam_token_count = nb.feature_count_[1, :]
spam_token_count

array([  5.,  23.,   2., ...,   0.,   0.,   0.])

In [57]:
# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({'token':X_train_tokens, 'ham':ham_token_count, 'spam':spam_token_count}).set_index('token')

In [58]:
# examine 5 random DataFrame rows
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
very,64,2
nasty,1,1
villa,0,1
beloved,1,0
textoperator,0,2


Before we can use this to calculate the **"spamminess" of each token**, we need to avoid **dividing by zero** and account for the **class imbalance**.

In [59]:
# add 1 to ham and spam counts to avoid dividing by 0
tokens['ham'] = tokens.ham + 1
tokens['spam'] = tokens.spam + 1
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
very,65,3
nasty,2,2
villa,1,2
beloved,2,1
textoperator,1,3


In [60]:
# Naive Bayes counts the number of observations in each class
nb.class_count_

array([ 3617.,   562.])

In [61]:
# convert the ham and spam counts into frequencies
tokens['ham'] = tokens.ham / nb.class_count_[0]
tokens['spam'] = tokens.spam / nb.class_count_[1]
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
very,0.017971,0.005338
nasty,0.000553,0.003559
villa,0.000276,0.003559
beloved,0.000553,0.001779
textoperator,0.000276,0.005338


In [62]:
# calculate the ratio of spam-to-ham for each token
tokens['spam_ratio'] = tokens.spam / tokens.ham
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
very,0.017971,0.005338,0.297044
nasty,0.000553,0.003559,6.435943
villa,0.000276,0.003559,12.871886
beloved,0.000553,0.001779,3.217972
textoperator,0.000276,0.005338,19.307829


In [63]:
# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('spam_ratio', ascending=False)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
claim,0.000276,0.158363,572.798932
prize,0.000276,0.135231,489.131673
150p,0.000276,0.087189,315.361210
tone,0.000276,0.085409,308.925267
guaranteed,0.000276,0.076512,276.745552
18,0.000276,0.069395,251.001779
cs,0.000276,0.065836,238.129893
www,0.000553,0.129893,234.911922
1000,0.000276,0.056940,205.950178
awarded,0.000276,0.053381,193.078292


In [64]:
# look up the spam_ratio for a given token
tokens.loc['dating', 'spam_ratio']

83.667259786476862

## Part 8: Creating a DataFrame from individual text files

In [65]:
# use glob to create a list of ham filenames
import glob
ham_filenames = glob.glob('../data/ham_files/*.txt')
ham_filenames

['../data/ham_files\\email1.txt',
 '../data/ham_files\\email3.txt',
 '../data/ham_files\\email5.txt']

In [66]:
# read the contents of the ham files into a list (each list element is one email)
ham_text = []
for filename in ham_filenames:
    with open(filename) as f:
        ham_text.append(f.read())
ham_text

['This is a ham email.\nIt has 2 lines.\n',
 'This is another ham email.\n',
 'This is yet another ham email.\n']

In [67]:
# repeat this process for the spam files
spam_filenames = glob.glob('../data/spam_files/*.txt')
spam_text = []
for filename in spam_filenames:
    with open(filename) as f:
        spam_text.append(f.read())
spam_text

['This is a spam email.\n', 'This is another spam email.\n']

In [68]:
# combine the ham and spam lists
all_text = ham_text + spam_text
all_text

['This is a ham email.\nIt has 2 lines.\n',
 'This is another ham email.\n',
 'This is yet another ham email.\n',
 'This is a spam email.\n',
 'This is another spam email.\n']

In [69]:
# create a list of labels (ham=0, spam=1)
all_labels = [0]*len(ham_text) + [1]*len(spam_text)
all_labels

[0, 0, 0, 1, 1]

In [70]:
# convert the lists into a DataFrame
pd.DataFrame({'label':all_labels, 'message':all_text})

Unnamed: 0,label,message
0,0,This is a ham email.\nIt has 2 lines.\n
1,0,This is another ham email.\n
2,0,This is yet another ham email.\n
3,1,This is a spam email.\n
4,1,This is another spam email.\n
