In [91]:
#Email Spam Detector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

EMAIL SPAM DETECTOR

In [92]:
df=pd.read_csv("mail_data.csv")

In [93]:
#data cleaning
data=df[df.notnull().any(axis=1)]

In [94]:
data.head(6)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...


In [95]:
#train_test_split
#ham-1
#spam-0
data.loc[data['Category']=="spam",'Category']=0
data.loc[data['Category']=="ham",'Category']=1

In [96]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [97]:
#data conversion from object to int
data["Catagory"]=data["Category"].astype(int)

In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
 2   Catagory  5572 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 130.7+ KB


In [102]:
X=data["Message"]
Y=data["Category"]

In [119]:
X.shape
Y.shape

(5572,)

In [120]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=3)

In [121]:
print(X_train.shape)
print(X_test.shape)
print(X.shape)

(4457,)
(1115,)
(5572,)


In [122]:
#converting x test to vector values
feature_extract_model=TfidfVectorizer(min_df=1,stop_words="english",lowercase=True)

X_train_features=feature_extract_model.fit_transform(X_train)
X_test_features=feature_extract_model.transform(X_test)

In [109]:
for i in range(0,2):
    print(X_test_features[i])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16 stored elements and shape (1, 7431)>
  Coords	Values
  (0, 1)	0.2381316303003606
  (0, 9)	0.2852706805264544
  (0, 14)	0.26797874471323896
  (0, 20)	0.30668032384591537
  (0, 306)	0.23975986557206702
  (0, 405)	0.2381316303003606
  (0, 1041)	0.28016206931555726
  (0, 1082)	0.2451068436245027
  (0, 1361)	0.25132445289897426
  (0, 1405)	0.3176863938914351
  (0, 1549)	0.2646498848307188
  (0, 4386)	0.18353336340308998
  (0, 5213)	0.1988547357502182
  (0, 5373)	0.2365698724638063
  (0, 6920)	0.20571591693537986
  (0, 7271)	0.1940327008179069
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8 stored elements and shape (1, 7431)>
  Coords	Values
  (0, 3491)	0.496093956101028
  (0, 4418)	0.3457696891316818
  (0, 4729)	0.22965776503163893
  (0, 6214)	0.3621564482127515
  (0, 6507)	0.26731535902873493
  (0, 6588)	0.3298937975962767
  (0, 6732)	0.42473488678029325
  (0, 7368)	0.29957800964520975


In [123]:
Y_train=Y_train.astype(int)
Y_test=Y_test.astype(int)

In [124]:
#implementing Logistic Regression
model=LogisticRegression()

In [125]:
model.fit(X_train_features,Y_train)

In [126]:
print(Y_train.dtype)
print(np.unique(Y_train))

int64
[0 1]


In [129]:
#accuracy on train set
predictions_train=model.predict(X_train_features)
accuracy_train=accuracy_score(predictions_train,Y_train)
print("Accuracy on train set; " ,accuracy_train)

Accuracy on train set;  0.9676912721561588


In [128]:
X_copy=X.copy()
X_copy["Predictions"]=predictions_train
print(X_copy.info())

<class 'pandas.core.series.Series'>
Index: 5573 entries, 0 to Predictions
Series name: Message
Non-Null Count  Dtype 
--------------  ----- 
5573 non-null   object
dtypes: object(1)
memory usage: 87.1+ KB
None


In [130]:
#accuracy on test set
predictions_test=model.predict(X_test_features)
accuracy_test=accuracy_score(predictions_test,Y_test)
print("Accuracy on train set: ",accuracy_test)

Accuracy on train set:  0.9668161434977578


In [131]:
input_mail=["""FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv."""]

feature_vector=feature_extract_model.transform(input_mail)

prediction=model.predict(feature_vector)

if(prediction[0]==1):
    print("It is a Ham Mail")
else:
    print("It is a Spam Mail")

It is a Ham Mail
