## Email Spam Detection System

#### Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#### Importing the dataset

In [2]:
df = pd.read_csv('spam.csv',encoding='Latin')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [6]:
df.shape

(5572, 5)

In [7]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [8]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [9]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

#### Dropping Unnamed: 1,Unnamed: 2,Unnamed: 3 columns

In [10]:
df.drop('Unnamed: 2',axis=1,inplace=True)
df.drop('Unnamed: 3',axis=1,inplace=True)
df.drop('Unnamed: 4',axis=1,inplace=True)

In [11]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Let's rename the column

In [12]:
df1 = df.rename(columns={'v1':'Category','v2':'Message'})

In [13]:
df1

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [14]:
data = df1.where((pd.notnull(df1)),'')

In [15]:
data.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [17]:
data.shape

(5572, 2)

#### If the Category is spam:0,ham:1

In [18]:
data.loc[data['Category']=='spam','Category',]=0
data.loc[data['Category']=='ham','Category',]=1

In [19]:
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


#### Defining X(indepedent feature) and Y(dependant feature)

In [20]:
X = data['Message']
y = data['Category']

#### Print X

In [21]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [22]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

#### Print y

#### Spliting/dividing dataset into
1. Train dataset
2. Test dataset

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

#### Distribution of dataset for X

In [24]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5572,)
(4457,)
(1115,)


#### Distribution of dataset for y

In [25]:
print(y.shape)
print(y_train.shape)
print(y_test.shape)

(5572,)
(4457,)
(1115,)


#### Feature extraction
##### Transforming data into feature vector

In [26]:
!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [27]:
!pip install scikit-learn



In [28]:


!pip show scikit-learn

Name: scikit-learn
Version: 1.2.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: bigframes, fastai, imbalanced-learn, librosa, mlxtend, qudida, sklearn-pandas, yellowbrick


In [29]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
x_train_features = feature_extraction.fit_transform(X_train)
x_test_features = feature_extraction.transform(X_test)
Y_train, Y_test = y_train.astype('int'), y_test.astype('int')

In [30]:
print(X_train)

1978    No I'm in the same boat. Still here at my moms...
3989    (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935       They r giving a second chance to rahul dengra.
4078       O i played smash bros  &lt;#&gt;  religiously.
4086    PRIVATE! Your 2003 Account Statement for 07973...
                              ...                        
3772    I came hostel. I m going to sleep. Plz call me...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860                   In work now. Going have in few min.
Name: Message, Length: 4457, dtype: object


In [31]:
print(x_train_features)

  (0, 4520)	0.4658046386365619
  (0, 3210)	0.348722265231364
  (0, 7415)	0.348722265231364
  (0, 1706)	0.3431839629173582
  (0, 4416)	0.4528381701109944
  (0, 1371)	0.4658046386365619
  (1, 0)	0.2654936554684193
  (1, 1649)	0.3059746053542906
  (1, 6440)	0.2953742837684993
  (1, 4533)	0.3059746053542906
  (1, 419)	0.28715203556385105
  (1, 4292)	0.2953742837684993
  (1, 5005)	0.1937920260229529
  (1, 2661)	0.3059746053542906
  (1, 1533)	0.2015782058421696
  (1, 6296)	0.269833648032668
  (1, 3631)	0.2804339696184593
  (1, 3140)	0.3059746053542906
  (1, 1187)	0.26161139982801973
  (2, 2190)	0.5102109014477275
  (2, 5351)	0.5102109014477275
  (2, 1674)	0.35156722029872034
  (2, 5770)	0.3962151014046925
  (2, 3061)	0.44585171875646595
  (3, 5484)	0.4829129976175997
  :	:
  (4451, 5740)	0.3358090891373877
  (4451, 4686)	0.3478605253385091
  (4452, 3402)	0.4536077050510107
  (4452, 3423)	0.4833413012939851
  (4452, 1579)	0.3576443319642905
  (4452, 1781)	0.3311324953642251
  (4452, 5998)	0.3

### Let's train the model

In [32]:
model = LogisticRegression()

In [33]:
model.fit(x_train_features, Y_train)

In [34]:
prediction_on_training_data = model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [35]:
print('Accuracy on training data :',accuracy_on_training_data)

Accuracy on training data : 0.9694862014808167


In [36]:
predictions_on_test_data = model.predict(x_test_features)
accuaracy_on_test_data = accuracy_score(Y_test, predictions_on_test_data)

In [37]:
print('Accuracy on test data: ',accuaracy_on_test_data)

Accuracy on test data:  0.9524663677130045


### Observation: The accuarcy on training data is almost similar to test data

In [51]:
input_your_mail = ["Get rich quick! Make $$$ now!!! Click here to claim your prize!,Urgent: Your account needs attention. Please verify your credentials.,Congratulations! You've won a free vacation. Claim your prize by replying to this email.,Reminder: Your appointment is scheduled for tomorrow. Please confirm.,Exclusive offer for our valued customers: 50% off all purchases this weekend. Shop now!,Hello, how are you? Let's catch up soon.,Congratulations! You've been selected as the winner of our lottery. Claim your prize by clicking the link below!,"]
input_data_feature = feature_extraction.transform(input_your_mail)
prediction = model.predict(input_data_feature)
print(prediction)
if(prediction[0]==1):
    print("Ham mail")

else:
    print("Spam mail")


[0]
Spam mail


In [45]:
prediction = model.predict(input_data_feature)

In [46]:
print(prediction)

[0]
