In [None]:
import sklearn
from sklearn.gaussian_process import GaussianProcessRegressor


In [1]:
#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
import pandas as pd
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'

sms = pd.read_table(url, header=None, names=['label', 'message'])
sms.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
sms['label_num'] = sms.label.map({'ham':0, 'spam':1})
X = sms.message

y = sms.label_num
print(X.shape)
print(y.shape)
    

(5572,)
(5572,)


In [3]:
#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

        

(4179,)
(1393,)
(4179,)
(1393,)


In [4]:
#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_train_dtm
        

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [5]:
#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
X_test_dtm = vect.transform(X_test)
X_test_dtm


<1393x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

In [6]:
#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)
y_pred_class


array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [7]:
#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)
            

0.9885139985642498

In [8]:
#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
metrics.confusion_matrix(y_test, y_pred_class)
        

array([[1203,    5],
       [  11,  174]], dtype=int64)

In [9]:

#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
X_test[y_test < y_pred_class]


574               Waiting for your call.
3375             Also andros ice etc etc
45      No calls..messages..missed calls
3415             No pic. Please re-send.
1988    No calls..messages..missed calls
Name: message, dtype: object

In [10]:
#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
X_test[y_test > y_pred_class]
    

3132    LookAtMe!: Thanks for your purchase of a video...
5       FreeMsg Hey there darling it's been 3 week's n...
3530    Xmas & New Years Eve tickets are now on sale f...
684     Hi I'm sue. I am 20 years old and work as a la...
1875    Would you like to see my XXX pics they are so ...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
4298    thesmszone.com lets you send free anonymous an...
4949    Hi this is Amy, we will be sending you a free ...
2821    INTERFLORA - It's not too late to order Inter...
2247    Hi ya babe x u 4goten bout me?' scammers getti...
4514    Money i have won wining number 946 wot do i do...
Name: message, dtype: object

In [11]:
X_test[(y_test==0) & (y_pred_class==1)]
    


574               Waiting for your call.
3375             Also andros ice etc etc
45      No calls..messages..missed calls
3415             No pic. Please re-send.
1988    No calls..messages..missed calls
Name: message, dtype: object

In [12]:
X_test[(y_test==1) & (y_pred_class==0)]


3132    LookAtMe!: Thanks for your purchase of a video...
5       FreeMsg Hey there darling it's been 3 week's n...
3530    Xmas & New Years Eve tickets are now on sale f...
684     Hi I'm sue. I am 20 years old and work as a la...
1875    Would you like to see my XXX pics they are so ...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
4298    thesmszone.com lets you send free anonymous an...
4949    Hi this is Amy, we will be sending you a free ...
2821    INTERFLORA - It's not too late to order Inter...
2247    Hi ya babe x u 4goten bout me?' scammers getti...
4514    Money i have won wining number 946 wot do i do...
Name: message, dtype: object

In [14]:
X_new = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C\\'s apply 08452810075over18\\'s', 'I HAVE A DATE ON SUNDAY WITH WILL!!"]
X_new_dtm = vect.transform(X_new)
nb.predict(X_new_dtm)


array([1], dtype=int64)

In [19]:
#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)
X_train_tokens[-50:]
X_train_tokens[20010:20030]
nb.feature_count_
print(nb.feature_count_.shape)
ham_token_count = nb.feature_count_[0, :]
spam_token_count = nb.feature_count_[1, :]
tokens = pd.DataFrame({'token':X_train_tokens, 'ham':ham_token_count, 'spam':spam_token_count}).set_index('token')
print(tokens.head())
tokens.sample(5, random_state=6)
tokens['ham'] = tokens.ham + 1
tokens['spam'] = tokens.spam + 1
tokens.sample(5, random_state=6)
tokens['ham'] = tokens.ham / nb.class_count_[0]
tokens['spam'] = tokens.spam / nb.class_count_[1]
tokens.sample(5, random_state=6)
tokens['spam_ratio'] = tokens.spam / tokens.ham
tokens.sample(5, random_state=6)
tokens.sort_values('spam_ratio', ascending=False)
tokens.sort_values('spam_ratio', ascending=True)

tokens.loc['claim', 'spam_ratio']
tokens.loc['prize', 'spam_ratio']
tokens.loc['tone', 'spam_ratio']
tokens.loc['guaranteed', 'spam_ratio']
tokens.loc['free', 'spam_ratio']
tokens.loc['text', 'spam_ratio']
tokens.loc['mobile', 'spam_ratio']
# tokens.loc['reply', 'spam_ratio']


(2, 7456)
              ham  spam
token                  
00            0.0   5.0
000           0.0  23.0
008704050406  0.0   2.0
0121          0.0   1.0
01223585236   0.0   1.0


58.5670818505338

In [24]:
tokens.to_excel('tokens.xlsx', encoding='utf-8')

In [3]:
#https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
import pandas as pd
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'

sms = pd.read_table(url, header=None, names=['label', 'message'])
sms.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
