# Imports and data loading

In [7]:
import pandas as pd
# Read the dataset from a CSV file located next to the notebook.
df = pd.read_csv("spam.csv")
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Analyze the data
## Each message is labelled as either spam or ham (not spam)

In [9]:
# Column name -> dtype mapping of the loaded frame.
schema = df.dtypes
print(schema)

Category    object
Message     object
dtype: object


In [6]:
# Per-category summary statistics; use it to check the class balance
# and to spot duplicated messages within each category.
df.groupby("Category").describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


# Add a binary `spam` label column (1 = spam, 0 = ham)

In [31]:
# Binary target: 1 when the category is "spam", 0 otherwise.
# A vectorized comparison replaces the row-by-row Python lambda; int64 is
# pinned explicitly so the label dtype matches what the lambda produced.
df["spam"] = (df["Category"] == "spam").astype("int64")

# Split the dataset into train and test

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the messages for evaluation.
# random_state pins the shuffle so that re-running the notebook reproduces
# the same split (and therefore the same scores); stratify keeps the
# spam/ham ratio equal in both partitions, which matters because the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.spam,
    test_size=0.25,
    random_state=42,
    stratify=df.spam,
)

# Vectorize the email text into numbers

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only and encode each
# message as a vector of token counts.
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
# Preview just the first rows: converting the whole sparse matrix to a dense
# array allocates (n_messages x vocabulary_size) integers only for display.
X_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Applying multinomial Naive Bayes to the dataset

In [20]:
from sklearn.naive_bayes import MultinomialNB

# Train the classifier on the token-count features of the training split.
model = MultinomialNB()
model.fit(X=X_train_count, y=y_train)

# Predicting whether an incoming email is spam or not
## Here the second and third emails are predicted as spam

In [24]:
# Hand-written sample messages used as a quick sanity check of the model.
emails = [
    "Hi Arun, lets go out for a walk",
    "Upto 20% discount on shopping clothes",
    "Free ticket to Australia",
]
# Encode with the already-fitted vectorizer (transform only, no refit),
# then predict: 1 means spam, 0 means not spam.
email_count = v.transform(emails)
model.predict(email_count)

array([0, 1, 1], dtype=int64)

# checking the accuracy

In [25]:
# Encode the held-out messages with the vocabulary learned on the training
# split (transform only, never refit on test data).
# The result is stored in its own variable: assigning to `X_test.count`
# would overwrite the `count` attribute of the X_test Series instead.
X_test_count = v.transform(X_test)
# Mean accuracy of the classifier on the held-out split.
model.score(X_test_count, y_test)

0.9842067480258435

# An alternative approach to achieving the above using a scikit-learn Pipeline

In [27]:
from sklearn.pipeline import Pipeline

# Bundle vectorization and classification into one estimator so that raw
# text can be passed straight to fit / predict / score.
clf = Pipeline(
    steps=[
        ("vector", CountVectorizer()),
        ("nb", MultinomialNB()),
    ]
)

In [28]:
# Fit both pipeline steps on the raw training messages and their labels.
clf.fit(X_train,y_train)

In [29]:
# Raw strings go in directly; the pipeline vectorizes them before predicting.
clf.predict(emails)

array([0, 1, 1], dtype=int64)

In [30]:
# Accuracy of the pipeline on the held-out raw messages.
clf.score(X_test,y_test)

0.9842067480258435