In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('/content/drive/My Drive/SpamSMS.csv',encoding = 'latin-1')

In [None]:

df.head()

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df.drop(labels = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis = 1)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
plt.figure(figsize=(8,5))
sns.countplot(x='v1', data=df)
plt.xlabel('SMS Classification')
plt.ylabel('Count')

Data Cleaning


In [None]:
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
### cleaning the SMS ###

corpus = []
ps = PorterStemmer()

for i in range(0,df.shape[0]) : 
  #cleaning the special characters
  v2 = re.sub(pattern = '[^a-zA-Z]',repl=' ',string=df.v2[i])
  #converting the entire sms to lower case
  v2 = v2.lower()
  #Tokenizing the review by words
  words = v2.split()
  #removing the stop words
  words = [word for word in words if word not in set(stopwords.words('english'))]
  #stemming the words
  words = [ps.stem(word) for word in words]
  #joining the stemmed words
  v2 = ' '.join(words)
  #building a corpus of sms
  corpus.append(v2)

In [None]:
corpus[0:10]

In [None]:
### creating the bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2500)
X = cv.fit_transform(corpus).toarray()

In [None]:
### extracting depending variable from the dataset
y = pd.get_dummies(df['v1'])
y = y.iloc[:, 1].values

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=0)

In [None]:
from sklearn.naive_bayes import MultinomialNB
classifire = MultinomialNB()
classifire.fit(X_train,y_train)

In [None]:
y_pred = classifire.predict(X_test)

In [None]:
###Accuracy,Precission and Recall
from sklearn.metrics import accuracy_score,precision_score,recall_score
score1 = accuracy_score(y_test,y_pred)
score2 = precision_score(y_test,y_pred)
score3 = recall_score(y_test,y_pred)
print("Accuracy Score is : {}%".format(round(score1*100,2)))
print("Precision Score is : {}%".format(round(score2,2)))
print("Recall Score is : {}%".format(round(score3,2)))

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)

In [None]:
cm

In [None]:
### plotting of confusion matrix
plt.figure(figsize=(10,7))
sns.heatmap(cm,annot=True)
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')

In [None]:
### Hyperparameter tunning the Naive Bayes Classifier
best_accuracy = 0.0
alpha_val = 0.0
for i in np.arange(0.0,1.1,0.1) : 
  temp_classifier = MultinomialNB(alpha=i)
  temp_classifier.fit(X_train,y_train)
  temp_y_pred = temp_classifier.predict(X_test)
  score = accuracy_score(y_test,temp_y_pred)
  print("Accuracy score for alpha= {} is: {}%".format(round(i,1),round(score*100,2)))
  if score > best_accuracy :
    best_accuracy = score
    alpha_val = i
print("The best accuracy is {}% with alpha value {}".format(round(best_accuracy*100,2),round(alpha_val,1)))

In [None]:
classifire = MultinomialNB(alpha = 0.8)
classifire.fit(X_train,y_train)

Predictions


In [None]:
def predict_spam(sample_sms) :
  sample_sms = re.sub(pattern='[a-zA-Z_]',repl=' ',string=sample_sms)
  sample_sms = sample_sms.lower()
  sample_sms_words = sample_sms.split()
  sample_sms_words = [word for word in sample_sms_words if not word in set(stopwords.words('english'))]
  ps = PorterStemmer()
  final_sms = [ps.stem(word) for word in sample_sms_words]
  final_sms = ' '.join(final_sms)
  temp = cv.transform([final_sms]).toarray()
  return classifire.predict(temp)

In [None]:
#### predicting values

sample_sms = 'Hiii!!!Ananya this side.'
if predict_spam(sample_sms) : 
  print("This is a SPAM SMS!")
else : 
  print("This is a normal SMS!")


In [None]:
sample_sms = 'IMPORTANT - you can win $100 by taking part in this mega event!!To  start the process please reply YES or to opt out text STOP'
if predict_spam(sample_sms) : 
  print("This is a SPAM sms!")
else :
  print("This is a normal sms!")


In [79]:
sample_message = 'You have still not claimed the compensation you are due for the accident you had. To start the process please reply YES. To opt out text STOP.'

if predict_spam(sample_message):
  print('This is a SPAM message!')
else:
  print('This is a normal message.')

This is a normal message.
