In [None]:
# Create an ML model and a web app that predicts whether an entered message is spam or not spam

# create the same model using 4 different methods 
# 1. svm 
# 2. svm + pipeline
# 3. naive bayes 
# 4. naive bayes + pipeline 

# we have to use the model which gives best accuracy 

In [None]:
import pandas as pd

In [None]:
# Read the tab-separated spam/ham dataset directly from its GitHub raw URL.
DATA_URL = "https://raw.githubusercontent.com/arib168/data/main/spam.tsv"
df = pd.read_table(DATA_URL)
df

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,160,8
5568,ham,Will ü b going to esplanade fr home?,36,1
5569,ham,"Pity, * was in mood for that. So...any other s...",57,7
5570,ham,The guy did some bitching but I acted like i'd...,125,1


In [None]:
# Print column names, non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
 2   length   5572 non-null   int64 
 3   punct    5572 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 174.2+ KB


In [None]:
# Inputs are the raw message strings and targets are the 'ham'/'spam' labels.
# Text input stays one-dimensional (one string per sample); numerical feature
# data would have to be two-dimensional instead.
x, y = df['message'].values, df['label'].values

In [None]:
# Show the full text of one message (row label 5567) to see what the raw data looks like.
df.loc[5567, 'message']

'This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.'

In [None]:
# Count the messages per class to see how balanced the two labels are.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
# Hold out part of the data for evaluation; random_state=0 makes the split repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced —
# consider stratify=y (this would change every score recorded below).
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)

In [None]:
# Size of the training portion (1-D array of raw message strings).
x_train.shape

(4179,)

In [None]:
# Size of the held-out test portion.
x_test.shape

(1393,)

In [None]:
# tokenization - splitting the sentence into words
# vectorization - after splitting, counting how many times each word has been repeated 

In [None]:
# Feature extraction: turn each message into bag-of-words counts with CountVectorizer.

from sklearn.feature_extraction.text import CountVectorizer
# English stop words are removed before counting.
vect = CountVectorizer(stop_words='english')
# The vocabulary is learned from the training messages only ...
x_train_vect = vect.fit_transform(x_train)
# ... and the test messages are encoded with that same vocabulary (transform, no refit).
x_test_vect = vect.transform(x_test)

In [None]:
# METHOD 1: support vector classifier with default hyperparameters,
# to be trained on the pre-vectorized features.
from sklearn.svm import SVC 
model1 = SVC()

In [None]:
model1.fit(x_train_vect,y_train)  # fitting needs both the vectorized inputs and their labels

SVC()

In [None]:
# Predict labels for the vectorized test messages.
y_pred1 = model1.predict(x_test_vect)
y_pred1  # predicted labels

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [None]:
y_test  # actual labels, shown for a visual comparison with y_pred1

array(['ham', 'spam', 'ham', ..., 'spam', 'ham', 'ham'], dtype=object)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score expects (y_true, y_pred): pass the ground truth first.
# Accuracy is symmetric, so the score is unchanged, but the correct order matters
# as soon as an asymmetric metric (precision, recall, ...) is used instead.
accuracy_score(y_test, y_pred1)

0.9813352476669059

In [None]:
# METHOD 2
# using the sklearn pipeline for svc
# pipeline is used to combine two estimators/two different processes together 
# The two estimators combined here are SVC and CountVectorizer(for the pipeline)
# Why should we use the pipeline?
# Ans: it removes the need to call fit and transform on each step individually

In [None]:
# METHOD 2: the pipeline chains CountVectorizer and SVC, so a single fit() call
# vectorizes the raw training messages and trains the classifier, and predict()
# vectorizes and classifies raw test messages in one step.
# NOTE(review): this CountVectorizer() uses default settings, whereas the METHOD 1
# vectorizer was created with stop_words='english'. The accuracy difference between
# METHOD 1 and METHOD 2 may therefore come from the features, not from the pipeline — confirm.

from sklearn.pipeline import make_pipeline
model2 = make_pipeline(CountVectorizer(),SVC())
model2.fit(x_train,y_train)

y_pred2 = model2.predict(x_test)
y_pred2

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [None]:
y_test  # actual labels, shown for a visual comparison with y_pred2

array(['ham', 'spam', 'ham', ..., 'spam', 'ham', 'ham'], dtype=object)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score expects (y_true, y_pred): pass the ground truth first.
# Accuracy is symmetric, so the score itself does not change.
accuracy_score(y_test, y_pred2)

0.9834888729361091

In [None]:
# METHOD 3: multinomial Naive Bayes classifier with default hyperparameters,
# to be trained on the pre-vectorized word counts.
from sklearn.naive_bayes import MultinomialNB
model3 = MultinomialNB()

In [None]:
# Train on the same count features that METHOD 1 used.
model3.fit(x_train_vect,y_train)

MultinomialNB()

In [None]:
# Predict labels for the vectorized test messages.
y_pred3 = model3.predict(x_test_vect)
y_pred3

array(['ham', 'spam', 'ham', ..., 'spam', 'ham', 'ham'], dtype='<U4')

In [None]:
y_test  # actual labels, shown for a visual comparison with y_pred3

array(['ham', 'spam', 'ham', ..., 'spam', 'ham', 'ham'], dtype=object)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score expects (y_true, y_pred): pass the ground truth first.
# Accuracy is symmetric, so the score itself does not change.
accuracy_score(y_test, y_pred3)

0.9863603732950467

In [None]:
#METHOD 4 - MultinomialNB pipeline

In [None]:
# METHOD 4: CountVectorizer and MultinomialNB chained in one pipeline that is
# fitted on, and predicts from, the raw message strings.
# NOTE(review): this CountVectorizer() uses default settings, whereas the METHOD 3
# features were built with stop_words='english'. The accuracy difference between
# METHOD 3 and METHOD 4 may come from the features, not from the pipeline — confirm.
from sklearn.pipeline import make_pipeline  
model4 = make_pipeline(CountVectorizer(),MultinomialNB())
model4.fit(x_train,y_train)
y_pred4 = model4.predict(x_test)
y_pred4

array(['ham', 'spam', 'ham', ..., 'spam', 'ham', 'ham'], dtype='<U4')

In [None]:
y_test  # actual labels, shown for a visual comparison with y_pred4

array(['ham', 'spam', 'ham', ..., 'spam', 'ham', 'ham'], dtype=object)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score expects (y_true, y_pred): pass the ground truth first.
# Accuracy is symmetric, so the score itself does not change.
accuracy_score(y_test, y_pred4)

0.9885139985642498

In [None]:
# SVC Model Accuracy                  - 0.9813352476669059
# SVC pipeline Accuracy               - 0.9834888729361091
# MultinomialNB Model Accuracy        - 0.9863603732950467
# MultinomialNB pipeline Accuracy     - 0.9885139985642498

In [None]:
# model persistence (pickle - multilinear regression)
# serialization and deserialization steps

In [None]:
# we are going to persist with the best model out of the 4 models created (i.e - use the model with the highest accuracy)

In [None]:
# Serialization: save the fitted MultinomialNB pipeline (model4) to disk.
import joblib 
joblib.dump(model4,'spam-ham') # writes a file named 'spam-ham' in the current working directory

['spam-ham']

In [None]:
# Deserialization: load the saved pipeline back from the 'spam-ham' file.
import joblib 
text_model = joblib.load('spam-ham')

In [None]:
# Sanity check: the reloaded pipeline takes a list of raw strings and returns one label per string.
text_model.predict(["free tickets sold"])   # model prediction of the output

array(['spam'], dtype='<U4')

In [None]:
# CREATING THE WEB APPLICATION USING STREAMLIT FOR THE SPAM HAM PREDICTION 

In [None]:
pip install streamlit --quiet

In [None]:
import streamlit as st

In [None]:
%%writefile demo.py 
# Minimal Streamlit script used as a smoke test: it only renders a page title.
import streamlit as st 

st.title("HELLO WORLD!")

Writing demo.py


In [None]:
# Start the Streamlit server for demo.py in the background (&) and open a
# localtunnel to port 8501 so the app gets a public URL.
!streamlit run demo.py &npx localtunnel --port 8501

2022-06-03 16:56:13.263 INFO    numexpr.utils: NumExpr defaulting to 2 threads.
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.2:8501[0m
[34m  External URL: [0m[1mhttp://34.125.62.243:8501[0m
[0m
[K[?25hnpx: installed 22 in 4.478s
your url is: https://solid-teams-prove-34-125-62-243.loca.lt
[34m  Stopping...[0m
^C


In [None]:
%%writefile app.py 
import streamlit as st 
import joblib 

st.title("SPAM HAM CLASSIFIER")   #title for the webapp
text_model = joblib.load('/content/spam-ham') #loading the joblib model to use for predicting the output 
ip = st.text_input("Enter the message :")     #Input message 
op = text_model.predict([ip])                 # use the model for predicting the output
if st.button('PREDICT'):                      # create a button called as predict, and if that button is clicked, then display the output 
  st.title(op[0])  #print the output 

Writing app.py


In [None]:
# Start the Streamlit server for app.py in the background (&) and open a
# localtunnel to port 8501 so the app gets a public URL.
!streamlit run app.py &npx localtunnel --port 8501

2022-06-03 16:56:26.599 INFO    numexpr.utils: NumExpr defaulting to 2 threads.
[K[?25hnpx: installed 22 in 2.204s
your url is: https://calm-memes-like-34-125-62-243.loca.lt
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.2:8501[0m
[34m  External URL: [0m[1mhttp://34.125.62.243:8501[0m
[0m
[34m  Stopping...[0m
^C
