<a href="https://colab.research.google.com/github/Athulkrishna-S/L-and-T/blob/main/spam_email_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [4]:
# Read the e-mail corpus from the CSV file in the working directory.
df = pd.read_csv('emails.csv')

# Preview the first five rows to confirm the columns were parsed as expected.
print(df.head(n=5))

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


In [4]:
# Count the missing entries in each column (isna is the canonical name of isnull).
print(df.isna().sum())

text    0
spam    0
dtype: int64


In [7]:
# Give the target column a model-agnostic name; the remaining cells read df['label'].
df = df.rename({'spam': 'label'}, axis='columns')

In [None]:
# Re-inspect the first five rows to confirm the column rename took effect.
print(df.head(n=5))

                                                text  label
0  Subject: naturally irresistible your corporate...      1
1  Subject: the stock trading gunslinger  fanny i...      1
2  Subject: unbelievable new homes made easy  im ...      1
3  Subject: 4 color printing special  request add...      1
4  Subject: do not have money , get software cds ...      1


In [8]:
# Hold out 20% of the e-mails for evaluation; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [9]:
# Turn the raw e-mail text into TF-IDF features. English stop words are removed,
# and max_df=0.7 is passed so that very common terms are excluded as well.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Fit on the training split only, then reuse the fitted vectorizer for the test
# split so that no information from the held-out e-mails reaches the features.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
# Inspect the TF-IDF matrix as a labelled table.
#
# The matrix is almost entirely zeros, so it is wrapped in a sparse-backed
# DataFrame instead of being converted with .toarray(): densifying allocates
# one float for every (document, vocabulary term) pair just to print a
# truncated preview, which can exhaust memory on a modest machine.
# NOTE(review): the intermediate `X_train_dense` array is no longer created;
# nothing else in the visible notebook uses it.
feature_names = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame.sparse.from_spmatrix(X_train_tfidf, columns=feature_names)
print(df_tfidf)

            00  000  0000  000000  00000000  0000000000  000000000003619  \
0     0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
1     0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
2     0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
3     0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
4     0.130672  0.0   0.0     0.0       0.0         0.0              0.0   
...        ...  ...   ...     ...       ...         ...              ...   
4577  0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
4578  0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
4579  0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
4580  0.000000  0.0   0.0     0.0       0.0         0.0              0.0   
4581  0.000000  0.0   0.0     0.0       0.0         0.0              0.0   

      000000000003991  000000000003997  000000000005168  ...  zwwyw  zwzm  \
0         

**SVM Linear Kernel**

In [11]:
# Train a support-vector classifier with a linear kernel on the TF-IDF features
# (fit returns the estimator itself, so construction and training can be chained).
svm_linear = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Predict on the held-out e-mails and print per-class precision/recall/F1.
y_pred_linear = svm_linear.predict(X_test_tfidf)
print(
    "SVM Linear Kernel Classification Report:",
    classification_report(y_test, y_pred_linear),
    sep="\n",
)

SVM Linear Kernel Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       856
           1       0.99      0.98      0.98       290

    accuracy                           0.99      1146
   macro avg       0.99      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146



**SVM RBF Kernel**

In [12]:
# Train a support-vector classifier with an RBF kernel on the TF-IDF features.
svm_rbf = SVC(kernel='rbf')
svm_rbf.fit(X_train_tfidf, y_train)

# Predict on the held-out e-mails and print the title and report together.
y_pred_rbf = svm_rbf.predict(X_test_tfidf)
print("SVM RBF Kernel Classification Report:\n" + classification_report(y_test, y_pred_rbf))

SVM RBF Kernel Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       856
           1       0.99      0.97      0.98       290

    accuracy                           0.99      1146
   macro avg       0.99      0.98      0.99      1146
weighted avg       0.99      0.99      0.99      1146



**Naive Bayes**

In [13]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict on the held-out e-mails and print per-class precision/recall/F1.
y_pred_nb = nb_model.predict(X_test_tfidf)
print(f"Naive Bayes Classification Report:\n{classification_report(y_test, y_pred_nb)}")

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       856
           1       1.00      0.58      0.73       290

    accuracy                           0.89      1146
   macro avg       0.94      0.79      0.83      1146
weighted avg       0.91      0.89      0.88      1146



In [14]:
import pickle

# Persist every fitted object needed to score new e-mails later.
#
# The TF-IDF vectorizer is saved alongside the classifiers: the models were
# trained on its output, so without the same fitted vocabulary and IDF weights
# a reloaded model cannot be applied to raw text.
#
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# artifacts that come from a trusted source.
artifacts = {
    'tfidf_vectorizer.pkl': vectorizer,
    'svm_linear_model.pkl': svm_linear,
    'svm_rbf_model.pkl': svm_rbf,
    'naive_bayes_model.pkl': nb_model,
}

# Write each object to its own file in the working directory; the context
# manager closes the handle even if serialization fails.
for filename, fitted_object in artifacts.items():
    with open(filename, 'wb') as file:
        pickle.dump(fitted_object, file)
