# Spam Classification

### Importing the libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score

### Loading dataset 

In [7]:
data = pd.read_csv('spam.csv',encoding='latin-1')

### Data exploration

In [11]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [13]:
data.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [15]:
data = data[['v1','v2']]

In [17]:
data.columns = ['label','message']

### Replaceing the numarical value for label

In [20]:
data.replace({'label':{'ham':0,'spam':1}},inplace=True)

  data.replace({'label':{'ham':0,'spam':1}},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.replace({'label':{'ham':0,'spam':1}},inplace=True)


In [22]:
data

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


### Using Re

In [25]:
import re

In [27]:
def clean_text(message):
    message = message.lower()
    message = re.sub(r'[^\w\s]','',message)
    message = re.sub(r'\d+','',message)
    return message

In [29]:
data['message'] = data['message'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['message'] = data['message'].apply(clean_text)


In [31]:
data

Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,1,this is the nd time we have tried contact u u...
5568,0,will ì_ b going to esplanade fr home
5569,0,pity was in mood for that soany other suggest...
5570,0,the guy did some bitching but i acted like id ...


### Vectorization

In [34]:
vector = CountVectorizer(stop_words='english')

### Splitting into feature and target data

In [37]:
X = vector.fit_transform(data['message'])

In [38]:
y = data['label']

In [41]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

### Implementation of algorithm

In [44]:
nb = MultinomialNB()

In [46]:
nb.fit(X_train,y_train)

In [48]:
y_pred = nb.predict(X_test)

In [50]:
y_pred

array([1, 0, 1, ..., 0, 0, 1], dtype=int64)

### Accuracy Score

In [53]:
accuracy = accuracy_score(y_test,y_pred)

In [55]:
accuracy

0.9704035874439462

### Confusion Matrix

In [58]:
cm = confusion_matrix(y_test,y_pred)

In [60]:
cm

array([[945,  20],
       [ 13, 137]], dtype=int64)