In [None]:
# Toy corpus: three short messages used to demonstrate CountVectorizer
simple_train = [
    'call you tonight',
    'Call me a cab',
    'please call me.. please',
]

In [None]:

from sklearn.feature_extraction.text import CountVectorizer

# Instantiate a CountVectorizer with default settings; it is fitted on the
# toy corpus in the cells that follow.
vect = CountVectorizer()

In [None]:
# Fit the vectorizer: learn the vocabulary of the toy training corpus.
# fit() updates `vect` in place, so the result is not reassigned.
vect.fit(simple_train)

In [None]:
# Examine the fitted vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
vect.get_feature_names_out()

In [None]:
# Transform the training data into a document-term matrix (held as a sparse matrix)
simple_train_dtm = vect.transform(simple_train)
# Print the sparse representation
print(simple_train_dtm)

In [None]:
# Dense view of the same document-term matrix
simple_train_dtm.toarray()

In [None]:
# Show the sparse and the dense representation of the document-term matrix,
# each preceded by a heading line.
print('sparse matrix', simple_train_dtm, sep='\n')
print('dense matrix', simple_train_dtm.toarray(), sep='\n')

In [None]:
import pandas as pd
from IPython.display import HTML, display

# Inject CSS that restyles h1/h2/h3 headings in the notebook.
# Only the last expression of a cell is rendered automatically, so the HTML
# object must be passed to display() explicitly; otherwise it is discarded and
# the stylesheet never takes effect.
display(HTML("""
<style>
h1,h2,h3 {
	margin: 1em 0 0.5em 0;
	font-weight: 600;
	font-family: 'Titillium Web', sans-serif;
	position: relative;  
	font-size: 36px;
	line-height: 40px;
	padding: 15px 15px 15px 2.5%;
	color: #1E8449;
	box-shadow: 
		inset 0 0 0 1px rgba(246,38,100, 1), 
		inset 0 0 5px rgba(246,3,100, 1),
		inset -285px 0 35px #D5F5E3;
	border-radius: 0 10px 0 15px;
	background: #fff
    
}
</style>
"""))
# Examine the vocabulary and document-term matrix together:
# one row per document, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())

In [None]:
# Load the SMS Spam Collection dataset
import warnings as wr
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings raised by pandas / seaborn / scikit-learn.
wr.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns

# The Kaggle spam.csv file contains bytes that are not valid UTF-8, so the
# default decoding raises UnicodeDecodeError; decode it as latin-1 instead.
sms = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv", encoding="latin-1")
sms.head()

In [None]:
# (rows, columns) of the raw dataset
sms.shape

In [None]:
# Heatmap of the null mask: shows where the dataset has missing values
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(sms.isnull(), yticklabels=False, cbar=True, cmap='mako', ax=ax)

In [None]:
# Number of missing values in each column
sms.isnull().sum()

In [None]:
# Drop the three unnamed columns; the rest of the notebook only uses
# v1 (label) and v2 (message text).
# errors="ignore" keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the columns are already gone.
sms = sms.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
sms.head()

In [None]:
# Class distribution: number of messages per label
sms["v1"].value_counts()

In [None]:

# Bar chart of the class distribution.
# Pass the column through x=: recent seaborn releases no longer accept the
# data vector as the first positional argument of countplot.
sns.countplot(x=sms["v1"])

In [None]:
# Convert the label to a numerical variable (ham -> 0, spam -> 1).
# The identity entries (0 -> 0, 1 -> 1) make the cell idempotent: without them,
# re-running the cell after the labels are already numeric maps every value to NaN.
sms['v1'] = sms['v1'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

In [None]:
# Preview the frame after the label conversion
sms.head()

In [None]:
# Separate the two classes (0 = ham, 1 = spam) so they can be balanced
ham = sms.loc[sms["v1"] == 0]
spam = sms.loc[sms["v1"] == 1]

In [None]:
# Shapes of the ham and spam subsets
ham.shape,spam.shape

In [None]:
# Undersample ham to the number of spam rows so both classes have equal size.
# random_state pins the sample; without it every run trains on a different
# subset and the reported metrics are not reproducible.
ham = ham.sample(n=spam.shape[0], random_state=5)
ham.shape

In [None]:
# Stack spam and the undersampled ham into one balanced frame with a fresh index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
data = pd.concat([spam, ham], ignore_index=True)
print("Shape :", data.shape)
data.head()

In [None]:
# Bar chart of the class distribution after balancing.
# Pass the column through x=: recent seaborn releases no longer accept the
# data vector as the first positional argument of countplot.
sns.countplot(x=data["v1"])

In [None]:
# Features are the message text (v2); the target is the numeric label (v1)
X, y = data["v2"], data["v1"]

# Model building

In [None]:
# Split X and y into training and testing sets.
# With no explicit sizes, train_test_split uses 75% for training and 25% for testing.
# random_state=5 fixes the shuffle so the split is reproducible.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
# One shape per line: X_train, X_test, y_train, y_test
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
# Fresh vectorizer for the SMS data; this rebinds `vect`, replacing the
# instance that was fitted on the toy corpus.
vect = CountVectorizer()

In [None]:
# Learn the vocabulary from the training messages and build their
# document-term matrix in a single step.
X_train_dtm = vect.fit_transform(X_train)

In [None]:
# Inspect the training document-term matrix
X_train_dtm

In [None]:
# Transform the testing data into a document-term matrix using the vocabulary
# already fitted on the training data (transform only, no re-fit).
X_test_dtm = vect.transform(X_test)
X_test_dtm

# The number of columns matches X_train_dtm because the same fitted vocabulary is reused.
# NOTE(review): an earlier note quoted 7456 columns; the exact count depends on
# the sampled training set, so confirm it against the output above.

In [None]:
# Import the Multinomial Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

# Instantiate a Multinomial Naive Bayes model with default parameters
nb = MultinomialNB()

In [None]:
# Train the model on the training document-term matrix and its labels

nb.fit(X_train_dtm, y_train)

In [None]:
# Make class predictions for the test document-term matrix
y_pred_class = nb.predict(X_test_dtm)

In [None]:
# Accuracy of the Naive Bayes class predictions on the test set
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

In [None]:

# Confusion matrix of true vs. predicted labels for the Naive Bayes model
cf_matrix=metrics.confusion_matrix(y_test, y_pred_class)
cf_matrix

In [None]:
import matplotlib.pyplot as plt
# Plot the confusion matrix.
# confusion_matrix puts true labels on the rows and predicted labels on the
# columns; label the axes so the figure can be read on its own.
plt.figure(figsize=(8,5))
ax = sns.heatmap(cf_matrix, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label',
       title='Multinomial Naive Bayes confusion matrix');

In [None]:
# Message text of the false positives (ham incorrectly classified as spam):
# predicted 1 while the true label is 0.
false_positive = (y_pred_class == 1) & (y_test == 0)
X_test[false_positive]

In [None]:
# Message text of the false negatives (spam incorrectly classified as ham):
# predicted 0 while the true label is 1.
false_negative = (y_pred_class == 0) & (y_test == 1)
X_test[false_negative]

In [None]:
# Calculate AUC for Naive Bayes from the second column of predict_proba
# (the predicted probability of label 1, i.e. spam)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
metrics.roc_auc_score(y_test, y_pred_prob)

# Logistic Regression

In [None]:
# Import the logistic regression classifier
from sklearn.linear_model import LogisticRegression

# Instantiate a logistic regression model with default parameters
logreg = LogisticRegression()

In [None]:
# Train the model on the same training document-term matrix used for Naive Bayes
logreg.fit(X_train_dtm, y_train)

In [None]:
# Make class predictions for the test document-term matrix.
# Rebinds y_pred_class, so the Naive Bayes predictions are no longer available under this name.
y_pred_class = logreg.predict(X_test_dtm)

In [None]:
# Predicted probability of label 1 for each test message
# (the original note describes these probabilities as well calibrated).
# Rebinds y_pred_prob, replacing the Naive Bayes probabilities.
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

In [None]:
# Accuracy of the logistic regression class predictions on the test set
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# AUC of the logistic regression model, from its predicted probabilities
metrics.roc_auc_score(y_test, y_pred_prob)

In [None]:
# Vectorizer variant 1: remove English stop words
vect1 = CountVectorizer(stop_words='english')

# Fit on the training messages to inspect the resulting matrix
X_train_1 = vect1.fit_transform(X_train)

X_train_1


In [None]:
# Vectorizer variant 2: include both 1-grams and 2-grams

# 2-grams let the model tell apart phrases such as "Happy", "Not Happy" and "Very Happy"
vect2 = CountVectorizer(ngram_range=(1, 2))

# Fit on the training messages to inspect the resulting matrix
X_train_2 = vect2.fit_transform(X_train)

X_train_2

In [None]:
# Vectorizer variant 3: ignore terms that appear in more than 50% of the documents
vect3 = CountVectorizer(max_df=0.5)

# Fit on the training messages to inspect the resulting matrix
X_train_3 = vect3.fit_transform(X_train)

X_train_3

In [None]:
# Vectorizer variant 4: only keep terms that appear in at least 2 documents
vect4 = CountVectorizer(min_df=2)

# Fit on the training messages to inspect the resulting matrix
X_train_4 = vect4.fit_transform(X_train)

X_train_4

In [None]:
# Combine all four options explored above: stop-word removal, 1- and 2-grams,
# a minimum document frequency of 2 and a maximum document frequency of 50%.
vect_combined= CountVectorizer(stop_words='english',ngram_range=(1, 2),min_df=2,max_df=0.5)

In [None]:
# Fit the combined vectorizer on the training messages, then reuse the fitted
# vocabulary to transform the test messages (no re-fit on test data).
X_train_c = vect_combined.fit_transform(X_train)
X_test_c = vect_combined.transform(X_test)

X_train_c

In [None]:
# Retrain Multinomial Naive Bayes on the combined-vectorizer features
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so instantiate and train in one expression
nb = MultinomialNB().fit(X_train_c, y_train)

# Predict the test set, then tabulate true vs. predicted labels
y_pred_class = nb.predict(X_test_c)
nb_cf_matrix = metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# Plot the confusion matrix of the model trained on the combined-vectorizer features.
# confusion_matrix puts true labels on the rows and predicted labels on the
# columns; label the axes so the figure can be read on its own.
plt.figure(figsize=(8,5))
ax = sns.heatmap(nb_cf_matrix, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label',
       title='Multinomial Naive Bayes confusion matrix (combined vectorizer)');

### Thanks