# **Machine Learning Project 2**

# **Task 4: Spam Mail**

In [8]:
import pandas as pd
import numpy as np

# Location of the spam CSV. Kept in a single named constant so the path can be
# changed in one place without touching the loading code below.
DATA_PATH = '/content/drive/MyDrive/spam.csv'

# Load the dataset into `df`. The file is decoded as latin-1
# (presumably because it is not valid UTF-8 -- confirm against the source file).
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Inspect the first few rows. Leaving the frame as the cell's last expression
# lets the notebook render it as a table instead of wrapped plain text.
df.head()

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [9]:
# Rename the two columns used by the rest of the notebook. Only these two are
# touched: the remaining 'Unnamed: N' columns keep their unique names instead of
# being overwritten with duplicated empty-string names, and the cell no longer
# assumes the CSV has exactly five columns. `rename` ignores keys that are not
# present, so re-running this cell after the rename is harmless.
df = df.rename(columns={'v1': 'label', 'v2': 'message'})

# Encode the labels as integers: ham -> 0, spam -> 1.
# The guard makes the cell idempotent: if every label is already 0/1 (the cell
# was run before), mapping again would turn them all into NaN, so it is skipped.
if not df['label'].isin([0, 1]).all():
    encoded_labels = df['label'].map({'ham': 0, 'spam': 1})

    # `map` yields NaN for any value missing from the dict; fail loudly here
    # rather than letting NaN labels reach model training.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unexpected = df.loc[unmapped, 'label'].unique().tolist()
        raise ValueError(f"Unexpected label values (expected 'ham'/'spam'): {unexpected}")

    df['label'] = encoded_labels.astype(int)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
texts = df['message']
targets = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    targets,
    test_size=0.2,
    random_state=42,
)

# TF-IDF vectorizer configured with the built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to transform the held-out messages.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# Construct and train each classifier on the TF-IDF training matrix.
# `fit` returns the estimator itself, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
svc_model = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Predict on the held-out TF-IDF matrix with every trained model, in the same
# order the models were fitted.
nb_pred, lr_pred, svc_pred = (
    fitted.predict(X_test_tfidf) for fitted in (nb_model, lr_model, svc_model)
)



In [12]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy of each model on the held-out labels.
nb_accuracy = accuracy_score(y_test, nb_pred)
lr_accuracy = accuracy_score(y_test, lr_pred)
svc_accuracy = accuracy_score(y_test, svc_pred)

# Headline accuracy, one line per model.
accuracy_rows = [
    ("Naive Bayes Accuracy: ", nb_accuracy),
    ("Logistic Regression Accuracy: ", lr_accuracy),
    ("SVM Accuracy: ", svc_accuracy),
]
for heading, value in accuracy_rows:
    print(heading, value)

# Per-class precision / recall / F1 for each model.
report_rows = [
    ("\nNaive Bayes Classification Report:\n", nb_pred),
    ("\nLogistic Regression Classification Report:\n", lr_pred),
    ("\nSVM Classification Report:\n", svc_pred),
]
for heading, predicted in report_rows:
    print(heading, classification_report(y_test, predicted))

# Confusion matrix for each model (true labels first, predictions second).
matrix_rows = [
    ("\nNaive Bayes Confusion Matrix:\n", nb_pred),
    ("\nLogistic Regression Confusion Matrix:\n", lr_pred),
    ("\nSVM Confusion Matrix:\n", svc_pred),
]
for heading, predicted in matrix_rows:
    print(heading, confusion_matrix(y_test, predicted))


Naive Bayes Accuracy:  0.9668161434977578
Logistic Regression Accuracy:  0.9524663677130045
SVM Accuracy:  0.979372197309417

Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115


Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.97      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115


SVM Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
 

In [13]:
# Smoke test: classify one raw message with the trained Naive Bayes model.
message = 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..'

# The message goes through the same fitted TF-IDF vectorizer used for training;
# `transform` expects an iterable of documents, hence the one-element list.
message_features = tfidf.transform([message])
pred1 = nb_model.predict(message_features)

# Labels were encoded as 0 = ham, 1 = spam; show the readable name, then the
# raw prediction array.
print("ham" if pred1[0] == 0 else "spam")
print(pred1)


ham
[0]
