## Importing dependencies

In [1]:
import pandas as pd # for data interpretation
import numpy as np # for math operations
import matplotlib.pyplot as plt # for data representation if needed
import re # regular expression for searching texts in paragraphs
from nltk.corpus import stopwords # stopwords is a collection of useless words that our data might not really need 
from nltk.stem.porter import PorterStemmer # stem the word and only keeps the root word or the important part of the word
from sklearn.feature_extraction.text import TfidfVectorizer # vectorizes the texts on the basis of frequency of the time they are used 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download("stopwords")
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aayushmalaviya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df = pd.read_csv("mail_data.csv")
df.head()
df.shape

(5572, 2)

In [20]:
df["Category"] = df["Category"].map({"spam":1 , "ham":0})
df.head()
df.isnull().sum()


Category
0    4825
1     747
Name: count, dtype: int64

## 1 -> Spam
## 0 -> Not Spam 

In [21]:
df["Category"].value_counts()

Category
0    4825
1     747
Name: count, dtype: int64

## Applying processing and stemming to Message 

In [26]:
port_stem = PorterStemmer()

In [27]:
def stemming(message):
    message_new = re.sub('[^a-zA-Z]',' ',message) ## getting only those words which have letters in it
    message_new = message_new.lower() ## making all the content in small case
    message_new = message_new.split() ## converting it to a list
    message_new = [port_stem.stem(word) for word in message_new if not word in stopwords.words("english")]
    message_new = " ".join(message_new)
    return message_new

In [28]:
df["Message"] = df["Message"].apply(stemming)

In [30]:
X= df["Message"]
Y= df["Category"]
df.head()

Unnamed: 0,Category,Message
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri wkli comp win fa cup final tkt st m...
3,0,u dun say earli hor u c alreadi say
4,0,nah think goe usf live around though


## Performing feature extraxtion to Convert Message column to numerical values or certain features 

In [45]:
vector1 = TfidfVectorizer()

In [46]:
vector1.fit(df["Message"])
X = vector1.transform(df["Message"])


# Training the model 

In [47]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,stratify = Y,random_state =2)


In [49]:
model=LogisticRegression()
model.fit(X_train,Y_train)
pred = model.predict(X_train)

## Prediction Model accuracy on the training set

In [51]:
accuracy1 = accuracy_score(pred,Y_train)
print(accuracy1)

0.9715054969710568


In [54]:
accuracy2 = accuracy_score(model.predict(X_test),Y_test)
print(accuracy2)

0.9650224215246637
