In [1]:
import pandas as pd
import numpy as np
sms_data=pd.read_csv("sms_raw_NB.csv",encoding='ISO-8859-1')
sms_data.head()

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000..."
4,spam,okmail: Dear Dave this is your final notice to...


In [2]:
sms_data.shape

(5559, 2)

In [3]:
sms_data.groupby('type').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4812,4503,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [4]:
sms_data.type.value_counts()

ham     4812
spam     747
Name: type, dtype: int64

In [5]:
#here 'type' is our target variable.. so first convert the type variable in to numerical format

sms_data["type_N"]=sms_data.type.apply(lambda x:1 if x=='spam' else 0)

In [6]:
sms_data.head()

Unnamed: 0,type,text,type_N
0,ham,Hope you are having a good week. Just checking in,0
1,ham,K..give back my thanks.,0
2,ham,Am also doing in cbe only. But have to pay.,0
3,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000...",1
4,spam,okmail: Dear Dave this is your final notice to...,1


In [30]:
#define X and y (from the SMS data) for use with COUNTVECTORIZER
X=sms_data.text
y=sms_data.type_N

In [31]:
X.head()

0    Hope you are having a good week. Just checking in
1                              K..give back my thanks.
2          Am also doing in cbe only. But have to pay.
3    complimentary 4 STAR Ibiza Holiday or å£10,000...
4    okmail: Dear Dave this is your final notice to...
Name: text, dtype: object

In [10]:
# split X and y into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4169,)
(1390,)
(4169,)
(1390,)


# Vectorizing dataset

 use CountVectorizer to "convert text into a matrix of token counts"

In [11]:
# import  CountVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
vect=CountVectorizer()

In [12]:
# learn the 'vocabulary' of the training data
vect.fit(X_train)

CountVectorizer()

In [13]:
## transform training data into a 'document-term matrix'
X_train_dtm=vect.transform(X_train)
X_train_dtm

<4169x7453 sparse matrix of type '<class 'numpy.int64'>'
	with 54759 stored elements in Compressed Sparse Row format>

In [14]:
# convert sparse matrix to a dense matrix
X_train_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Building and evaluating a model

In [15]:
from sklearn.naive_bayes import MultinomialNB 
model=MultinomialNB()

model.fit(X_train_dtm,y_train)

MultinomialNB()

In [16]:
# make class predictions for X_test_dtm

#first convert test data into document term matrix
X_test_dtm=vect.transform(X_test)


y_pred=model.predict(X_test_dtm)
print(y_pred)

[0 0 0 ... 0 1 0]


In [17]:
## calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test,y_pred)

0.983453237410072

In [18]:
## print the confusion matrix
metrics.confusion_matrix(y_test,y_pred)

array([[1185,    5],
       [  18,  182]], dtype=int64)

# IMBALANCE DATA HANDLING

Imbalace data is a common problems in classification problem, Imbalance data can affect our model accuracy big time.

in our problen we have to classify ham text and spam text

here ham =4812  and spam=747 , in this case data is imbalance, large number of ham data, less number of spam data. 

However, if the data set in imbalance then In such cases, we get a pretty high accuracy just by predicting the majority class, but  fail to capture the minority class

so here we have to apply different mechanism to handle imbalance data and improve the overall accuracy

A widely adopted technique for dealing with highly unbalanced datasets is called resampling. It consists of removing samples from the majority class (under-sampling) and/or adding more examples from the minority class (over-sampling)

In [33]:
sms_data.type.value_counts()

ham     4812
spam     747
Name: type, dtype: int64

In [34]:
vect.fit(X)

## transform data into a 'document-term matrix'
X_dtm=vect.transform(X)
X_dtm

<5559x8698 sparse matrix of type '<class 'numpy.int64'>'
	with 73387 stored elements in Compressed Sparse Row format>

AttributeError: 'int' object has no attribute 'lower'

In [21]:
#Under sampling technique using NearMiss
#NearMiss is an under-sampling technique. Instead of resampling the Minority class, using a distance, 
#this will make the majority class equal to the minority class.

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

In [36]:
smk = SMOTETomek(random_state=42)
X_new,y_new=smk.fit_sample(X,y)

ValueError: could not convert string to float: 'Hope you are having a good week. Just checking in'

In [40]:
X_train_dtm

<4169x7453 sparse matrix of type '<class 'numpy.int64'>'
	with 54759 stored elements in Compressed Sparse Row format>