In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored there (the dataset CSV is read from this mount) are reachable by path.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
# Fetch the NLTK resources that preprocess_text relies on: the English
# stop-word list and the Punkt tokenizer data used by word_tokenize.
# Recent NLTK releases load the `punkt_tab` resource instead of the pickled
# `punkt` models, so both are requested to keep tokenization working
# regardless of the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Resources shared by every preprocess_text call. The stop-word set and the
# stemmer are built once, on first use, instead of once per document.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS = None
_STEMMER = None


def _text_resources():
    """Return the cached (stop-word set, stemmer) pair, creating it on first use."""
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
        _STEMMER = PorterStemmer()
    return _STOP_WORDS, _STEMMER


def preprocess_text(text):
    """Normalize one document into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, strip digit runs, strip ASCII punctuation,
    tokenize with ``word_tokenize``, drop English stop words, Porter-stem
    each remaining token.

    Parameters
    ----------
    text : object
        The raw document. Anything that is not a ``str`` (NaN, None, numbers,
        lists, arrays, ...) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # A plain isinstance check covers NaN/None as well (neither is a str) and,
    # unlike pd.isna, never yields an array whose truth value is ambiguous
    # when a cell holds a list or array.
    if not isinstance(text, str):
        return ""
    # Convert text to lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Tokenize the text
    tokens = word_tokenize(text)
    stop_words, stemmer = _text_resources()
    # Drop stop words, stem what is left, and join back into one string
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

In [None]:
# Load the labelled email dataset from the mounted Drive folder.
# NOTE(review): the absolute Drive path ties this cell to one account's folder
# layout. encoding='latin1' decodes any byte sequence without raising, so a
# mismatch with the file's real encoding would go unnoticed — confirm it.
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/spam_Emails_data.csv', encoding='latin1')

In [None]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,label,text
0,Spam,viiiiiiagraaaa\nonly for the ones that want to...
1,Ham,got ice thought look az original message ice o...
2,Spam,yo ur wom an ne eds an escapenumber in ch ma n...
3,Spam,start increasing your odds of success & live s...
4,Ham,author jra date escapenumber escapenumber esca...


In [None]:
# Clean every email body with preprocess_text, overwriting the raw 'text' column.
# NOTE(review): the column is replaced in place, so re-running this cell feeds
# already-processed text through preprocess_text again; reload `data` before
# re-running.
data['text'] = data['text'].apply(preprocess_text)

**Split the dataset into features (text) and labels**


In [None]:
# X holds the preprocessed email text, y the matching class label column.
X = data['text']
y = data['label']

**Convert text data into numerical feature vectors using CountVectorizer**

In [None]:
# Learn the bag-of-words vocabulary and turn each document into a row of token counts.
# NOTE(review): the vocabulary is fitted on every row, including the rows that
# the later train_test_split call assigns to the test set.
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

**Split data into train and test sets**

In [None]:

# Hold out 20% of the rows for evaluation; random_state=42 fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

**Initialize SVM classifier**


In [None]:
# Support vector classifier with a linear kernel (all other settings left at defaults).
# NOTE(review): SVC fit time grows faster than linearly with the number of
# samples; LinearSVC is the usual faster choice for a linear kernel on large
# sparse data, though its results may differ slightly — confirm before swapping.
svm_model = SVC(kernel='linear')

**Train SVM classifier**


In [None]:
# Fit the classifier on the training split only.
svm_model.fit(X_train, y_train)

 **Predict on the test set**


In [None]:
# Predict a label for every held-out test row.
y_pred = svm_model.predict(X_test)

**Compute and print performance metrics**

Compute the confusion matrix

In [None]:
# Count test predictions per (true label, predicted label) pair:
# rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)

Compute other performance metrics

In [None]:
# Summary scores on the test split. average='weighted' computes each metric
# per class and averages them weighted by the number of true rows in each class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

Print the results

In [None]:
# Report the confusion matrix first, then the summary scores, then the
# per-class breakdown from classification_report.
print("Confusion Matrix:", conf_matrix, sep="\n")
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

Confusion Matrix:
[[20148   319]
 [  365 17939]]

Accuracy: 0.982357947950788
Precision: 0.9823583297529467
Recall: 0.982357947950788
F1 Score: 0.9823567514837233

Classification Report:
              precision    recall  f1-score   support

         Ham       0.98      0.98      0.98     20467
        Spam       0.98      0.98      0.98     18304

    accuracy                           0.98     38771
   macro avg       0.98      0.98      0.98     38771
weighted avg       0.98      0.98      0.98     38771



**Evaluate model performance**

In [None]:
# Evaluate model performance.
# This recomputes the same accuracy_score(y_test, y_pred) value that the
# metrics cell earlier in the notebook already stored in `accuracy`.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.982357947950788


In [None]:
def predict_new_text(new_texts, model):
    """Classify raw texts and return lowercase 'spam' / 'ham' labels.

    Parameters
    ----------
    new_texts : iterable of str
        Raw, unprocessed messages.
    model : fitted classifier
        Any object with a ``predict`` method that accepts the output of the
        notebook-level ``vectorizer.transform``.

    Returns
    -------
    list of str
        One label per input text. String predictions (e.g. 'Spam', 'Ham')
        are lowercased; numeric predictions keep the 1 -> 'spam',
        anything else -> 'ham' mapping.

    Notes
    -----
    Relies on the notebook-level ``vectorizer`` and ``preprocess_text``.
    The labels are also printed before being returned.
    """
    # Preprocess new texts
    new_texts_preprocessed = [preprocess_text(text) for text in new_texts]
    # Convert new texts to the same vectorized format as the training data
    new_texts_vectorized = vectorizer.transform(new_texts_preprocessed)
    # Predict with the model that was passed in, not a notebook global
    predictions = model.predict(new_texts_vectorized)
    # The classifier is trained on string labels, so comparing against 1
    # would label every text 'ham'; normalise string labels directly and keep
    # the numeric mapping only for models that emit 0/1.
    predicted_labels = [
        prediction.strip().lower() if isinstance(prediction, str)
        else ('spam' if prediction == 1 else 'ham')
        for prediction in predictions
    ]
    # Debug: Print final labeled predictions
    print("Predicted labels:", predicted_labels)
    return predicted_labels

In [None]:
def predict_new_texts(new_texts, svm_model, vectorizer):
    """Run raw texts through preprocessing, vectorization and prediction.

    Each text is cleaned with ``preprocess_text``, the cleaned list is passed
    to ``vectorizer.transform``, and the result of ``svm_model.predict`` on
    those features is returned unchanged.
    """
    cleaned_texts = list(map(preprocess_text, new_texts))
    features = vectorizer.transform(cleaned_texts)
    return svm_model.predict(features)

# Test the prediction function on two hand-written, business-style messages
# and print each message next to the label the model returns for it.
new_emails = ["Please review the attached comprehensive analytics report covering all Q1 metrics and provide your feedback by EOD.",
              "Welcome aboard, Jane! Looking forward to your insights on our current projects and seeing you at the kickoff meeting next Monday."
]
predictions = predict_new_texts(new_emails,svm_model, vectorizer)
# predictions comes back in the same order as new_emails, so zip pairs them up.
for email, prediction in zip(new_emails, predictions):
    print(f'Email: {email}\nPrediction: {prediction}\n')

Email: Please review the attached comprehensive analytics report covering all Q1 metrics and provide your feedback by EOD.
Prediction: Ham

Email: Welcome aboard, Jane! Looking forward to your insights on our current projects and seeing you at the kickoff meeting next Monday.
Prediction: Ham

