# Spam Filtering

<p>Importing the required libraries</p>

In [1]:
import pandas as pd # to read and manipulate the data 
import numpy as np # to do mathematical operations
import re
import string


# scikit-learn to do preprocessing and processing of data
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

### Reading the spam.csv file

In [2]:
data = pd.read_csv('spam.csv', encoding='latin-1') # reading the csv file

In [3]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
data.shape # no.of rows and columns in the data

(5572, 5)

### Dropping unnecessary columns and keeping only 'text' and 'label'


In [5]:
data = data[['v1', 'v2']]
data.columns = ['label', 'text']

In [6]:
data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Pre-Processing the text data
    - Lowercasing
    - removing numbers, puctuations and special characters and white spaces

In [7]:
def preprocess_text(text):
    text = text.lower()
    
    text = re.sub(r'\d+', '', text) # regular expresion to match pattern(digits) and replace the string
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    text = text.strip()
    return text


In [8]:
data['text'] = data['text'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(preprocess_text)


In [9]:
data

Unnamed: 0,label,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the nd time we have tried contact u u...
5568,ham,will ì b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggest...
5570,ham,the guy did some bitching but i acted like id ...


### Converting the Label to spam:1 and ham:0
#### Endocing the Labels

In [10]:
data['label'] = data['label'].map({'spam': 1, 'ham': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = data['label'].map({'spam': 1, 'ham': 0})


In [11]:
data

Unnamed: 0,label,text
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,1,this is the nd time we have tried contact u u...
5568,0,will ì b going to esplanade fr home
5569,0,pity was in mood for that soany other suggest...
5570,0,the guy did some bitching but i acted like id ...


### Spliting the dataset to training and testing using scikit-learn
80% training data and 20% testing data

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

In [13]:
X_train

1978    no im in the same boat still here at my moms c...
3989    bank of granite issues strongbuy explosive pic...
3935        they r giving a second chance to rahul dengra
4078             o i played smash bros  ltgt  religiously
4086    private your  account statement for  shows  un...
                              ...                        
3772    i came hostel i m going to sleep plz call me u...
5191                                 sorry ill call later
5226                prabhaim sorydarealyfrm heart im sory
5390                           nt joking seriously i told
860                     in work now going have in few min
Name: text, Length: 4457, dtype: object

In [14]:
X_train.shape

(4457,)

In [15]:
y_train.shape

(4457,)

In [16]:
X_test.shape

(1115,)

In [17]:
y_test.shape

(1115,)

# Vectorize the text data (Bag of Words)
- Vectorization transforms text into numerical vectors that represent the content in a format that models can understand.
- <b>Bag of Words (BoW):</b> This method creates a vocabulary of all unique words in the text corpus and represents each document as a vector of word counts (or binary values indicating the presence of words).

In [18]:
count_vectorizer = CountVectorizer(stop_words='english')
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# Transform counts to frequencies using TF-IDF
- <b>Term Frequency-Inverse Document Frequency (TF-IDF):</b> This method not only counts the occurrences of words but also adjusts these counts based on how common or rare the words are across all documents. This helps to highlight words that are more significant in distinguishing between documents.

In [19]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Train the data on Naive Bayes classifier(classifing spam and ham)

In [20]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)


# Make predictions on the test set


In [21]:
y_pred = nb_classifier.predict(X_test_tfidf)


In [22]:
y_pred

array([0, 0, 0, ..., 0, 0, 1])

# Evaluate the model


In [24]:
print("Confusion Matrix:\n",confusion_matrix(y_test, y_pred))
print("\nClassification Report:",classification_report(y_test, y_pred))
print("\nAccuracy:", accuracy_score(y_test,y_pred))

Confusion Matrix:
 [[965   0]
 [ 35 115]]

Classification Report:               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115


Accuracy: 0.968609865470852
