## Spam Email Check

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Location of the spam dataset. Set the SPAM_CSV_PATH environment variable to point
# at your own copy of spam.csv; when it is unset, the original author's local path
# is used so existing behaviour is unchanged.
SPAM_CSV_PATH = os.environ.get(
    "SPAM_CSV_PATH",
    "C:\\Users\\dhruv\\Downloads\\spam.csv",
)

df = pd.read_csv(SPAM_CSV_PATH)
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Per-class summary of the messages (count, unique, top, freq) -- shows how many
# rows are spam versus ham.
messages_by_category = df.groupby('Category')
messages_by_category.describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


### Convert the string labels into integers (spam = 1, ham = 0) for computation

In [4]:
# Encode the label as an integer target: 1 where Category is 'spam', 0 otherwise.
# A vectorised comparison replaces the row-by-row lambda; int64 matches the dtype
# the original .apply() produced.
df['Spam'] = df['Category'].eq('spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


#### Split the data into training and test sets

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the messages for evaluation.
# - random_state pins the shuffle, so "Restart Kernel -> Run All" reproduces the
#   same split and therefore the same scores and predictions.
# - stratify keeps the spam/ham ratio the same in both parts, which matters because
#   spam is only about 13% of the rows (747 of 5572).
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.Spam,
    test_size=0.3,
    random_state=42,
    stratify=df.Spam,
)

### Convert the Message column into token-count vectors with CountVectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vocabulary on the training messages and encode each message as a row of
# token counts (fit_transform returns a sparse matrix).
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)

# Preview the first 10 rows. Slice the sparse matrix *before* converting to a dense
# array so only those 10 rows are materialised, not the whole training matrix.
X_train_count[:10].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Train a Multinomial Naive Bayes classifier

In [7]:
from sklearn.naive_bayes import MultinomialNB

# Fit a Multinomial Naive Bayes classifier on the token-count matrix built in the
# vectorizer cell, using the 0/1 Spam column as the target.
model = MultinomialNB()
model.fit(X=X_train_count, y=y_train)

#### Try the model on new emails

In [8]:
# Two hand-written subject lines used to try out the trained classifier.
promo_subject = 'Dhruv, Last Day to Unlock Your Early Entry & Win INR 10K'
job_alert_subject = 'Data Scientist”: Thermo Fisher Scientific - Scientist III, Data Sciences'

emails = [promo_subject, job_alert_subject]

In [9]:
# Step 1: encode the new emails with the already-fitted CountVectorizer `v`
#         (transform only -- the vocabulary learned from the training set is reused).
email_count = v.transform(raw_documents=emails)

# Step 2: classify the count vectors with the Multinomial Naive Bayes model.
model.predict(X=email_count)

array([1, 0])

### 1 stands for spam and 0 for ham

#### For convenience, combine vectorization and classification in a Pipeline

In [10]:
from sklearn.pipeline import Pipeline

# Chain the two stages so raw text can be passed straight to fit/predict:
# first the count vectorizer, then the Naive Bayes classifier.
pipeline_steps = [
    ('Vectorizer', CountVectorizer()),
    ('NB', MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)


In [11]:
# Display the pipeline so its steps can be inspected.
clf

In [12]:
# Train the pipeline on the raw training messages, then score it on the held-out
# test split. fit() returns the pipeline itself, so the two calls can be chained.
clf.fit(X_train, y_train).score(X_test, y_test)

0.9856459330143541

In [13]:
# The pipeline accepts raw strings directly: it vectorizes and classifies in one call.
clf.predict(X=emails)

array([1, 0])

In [14]:
import joblib

# Persist the fitted pipeline to disk. Both the vectorizer and the classifier are
# steps of `clf`, so this single file contains everything needed to predict later.
MODEL_FILENAME = 'spam_model_pipeline.pkl'
joblib.dump(clf, MODEL_FILENAME)

['spam_model_pipeline.pkl']