<a href="https://colab.research.google.com/github/Ayushi1245/CODSOFT/blob/main/Spam_mail_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the dependencies

In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, r2_score


Data collection and Pre-Processing

In [21]:
#loading data from csv file to a pandas Dataframe
raw_mail_data = pd.read_csv('/content/spam.csv', encoding='latin-1', engine = "python", usecols = ['v1','v2'])

In [22]:
raw_mail_data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [23]:
#replace null value with a null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [24]:
#printing first 10 rows of the dataframe
mail_data.head(10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [25]:
# checking size of the dataframe
mail_data.shape

(5572, 2)

Label Encoding

In [26]:
# label spam mail as 0 an ham mail as 1.
mail_data.columns = ["Category", "message"]
mail_data.loc[mail_data['Category'] == 'spam', 'Category',] = 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category',] = 1

spam as 0
ham as 1

In [28]:
# separating the data as texts and label
x= mail_data['message']
y= mail_data['Category']

In [29]:
x

Unnamed: 0,message
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."
...,...
5567,This is the 2nd time we have tried 2 contact u...
5568,Will Ì_ b going to esplanade fr home?
5569,"Pity, * was in mood for that. So...any other s..."
5570,The guy did some bitching but I acted like i'd...


In [30]:
y

Unnamed: 0,Category
0,1
1,1
2,0
3,1
4,1
...,...
5567,0
5568,1
5569,1
5570,1


Spliting data into training and testing data

In [31]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=3)

In [32]:
len(x_train)

4457

In [33]:
len(x_test)

1115

Feature Extraction

In [34]:
# transform the text data to feature vectors that can be used as input to the Logistic regression
feature_extraction = TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# convert y_train and y_test values as integers
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [35]:
print(x_test_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7766 stored elements and shape (1115, 7510)>
  Coords	Values
  (0, 1537)	0.667337188824809
  (0, 4294)	0.5159375448718375
  (0, 6007)	0.537093591660729
  (1, 1)	0.21260233518669944
  (1, 43)	0.24547458936715755
  (1, 321)	0.28671640581392144
  (1, 520)	0.1934450786526249
  (1, 602)	0.28671640581392144
  (1, 2899)	0.1385795841356552
  (1, 3300)	0.37297727661877506
  (1, 3365)	0.28671640581392144
  (1, 4045)	0.250549335510249
  (1, 5250)	0.28671640581392144
  (1, 5347)	0.2733682162643466
  (1, 5501)	0.28671640581392144
  (1, 6579)	0.2733682162643466
  (1, 6599)	0.14954692788663673
  (1, 7222)	0.23059492898537964
  (2, 2939)	0.47195476517479323
  (2, 2941)	0.6068486133983123
  (2, 4070)	0.44361668503137164
  (2, 6648)	0.3410121739015846
  (2, 6701)	0.30969080396105314
  (3, 1606)	0.28517759021090444
  (3, 2649)	0.303870736800912
  :	:
  (1111, 2458)	0.42325261089251354
  (1111, 3259)	0.44776220819286267
  (1111, 6093)	0.4671914

Training the model

Loigistic Regression

In [36]:
model= LogisticRegression()

In [37]:
# training th elogistic regression model with the training data
model.fit(x_train_features,y_train)

Evaluating the trained model

In [38]:
# prediction on training data
prediction_on_training_data = model.predict(x_train_features)
model_accuracy_training = accuracy_score(y_train,prediction_on_training_data)

In [39]:
model_accuracy_training

0.9661207089970832

In [40]:
# prediction on test data
prediction_on_test_data = model.predict(x_test_features)
model_accuracy = accuracy_score(y_test,prediction_on_test_data)

In [41]:
model_accuracy

0.9623318385650225

Building a Predictive System

In [42]:
input_mail = ["SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
prediction
if prediction[0]==1:
    print('Ham mail')
else:
    print('Spam mail')

Spam mail


# Train Naive Bayes Classifier

In [44]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(x_train_features,y_train)

In [46]:
nb_predictions = naive_bayes_model.predict(x_test_features)

In [49]:

print("Naive Bayes Model:")
print(confusion_matrix(y_test, nb_predictions))
print(classification_report(y_test, nb_predictions))
print("Accuracy: ", accuracy_score(y_test, nb_predictions))
print("r2_Score: ", r2_score(y_test, nb_predictions))
print("Precision_score: ", precision_score(y_test, nb_predictions))
print("Recall_score: ", recall_score(y_test, nb_predictions))
print("f1_score: ", f1_score(y_test, nb_predictions))

Naive Bayes Model:
[[123  32]
 [  0 960]]
              precision    recall  f1-score   support

           0       1.00      0.79      0.88       155
           1       0.97      1.00      0.98       960

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Accuracy:  0.9713004484304932
r2_Score:  0.7602150537634409
Precision_score:  0.967741935483871
Recall_score:  1.0
f1_score:  0.9836065573770492
