In [None]:
# Mount Google Drive so files stored there are reachable from this Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Fetch the NLTK resources used by the preprocessing step:
#   - "punkt" / "punkt_tab": tokenizer models behind word_tokenize. Newer NLTK
#     releases look for "punkt_tab" instead of "punkt", so both are requested
#     to keep the notebook runnable regardless of the installed version.
#   - "stopwords": the English stop-word list.
# nltk.download() is a no-op for packages that are already up to date.
for nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(nltk_resource)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Call-invariant helpers, built once instead of on every preprocess_text() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_RE = re.compile(r'\d+')
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the cached (stop-word set, stemmer) pair, building it on first use."""
    if not _TEXT_TOOLS:
        # Built lazily so the NLTK corpus is only touched when text is actually
        # processed (i.e. after the download cell has run).
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['stemmer'] = PorterStemmer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['stemmer']


def preprocess_text(text):
    """Normalise one raw e-mail body into a space-separated string of stems.

    Steps, in order: lowercase, strip digit runs, strip ASCII punctuation,
    tokenize with NLTK, drop English stop words, Porter-stem each token.

    Parameters
    ----------
    text : object
        The raw text. Anything that is not a ``str`` (``None``, ``NaN``,
        numbers, lists, ...) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # NaN/None are not str instances, so this single check covers missing
    # values too, and unlike ``pd.isna(text) or ...`` it cannot raise on
    # array-like input.
    if not isinstance(text, str):
        return ""

    stop_words, stemmer = _get_text_tools()

    text = text.lower()
    text = _DIGITS_RE.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    # Punctuation is removed before stop-word filtering, so the filter sees
    # tokens without apostrophes.
    tokens = word_tokenize(text)
    return ' '.join(
        stemmer.stem(token) for token in tokens if token not in stop_words
    )

In [None]:
# Load the e-mail dataset from the mounted Drive folder.
# latin1 maps every byte to a character, so decoding cannot fail on stray bytes.
data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/spam_Emails_data.csv',
    encoding='latin1',
)

In [None]:
# Preview the first five rows to confirm the columns loaded as expected.
data.head(n=5)

Unnamed: 0,label,text
0,Spam,viiiiiiagraaaa\nonly for the ones that want to...
1,Ham,got ice thought look az original message ice o...
2,Spam,yo ur wom an ne eds an escapenumber in ch ma n...
3,Spam,start increasing your odds of success & live s...
4,Ham,author jra date escapenumber escapenumber esca...


In [None]:
# Clean every e-mail body in place.
# NOTE(review): this overwrites the raw text, so re-running the cell would
# preprocess already-preprocessed text; reload `data` before re-running.
data['text'] = data['text'].map(preprocess_text)

**Split the dataset into features (text) and labels**

In [None]:
# Features are the e-mail texts; targets are the labels from the 'label' column.
X, y = data['text'], data['label']

**Convert text data into numerical feature vectors using CountVectorizer**

In [None]:
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

**Split data into train and test sets**

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized,
    y,
    test_size=0.2,
    random_state=42,
)

**Initialize the Random Forest classifier**

In [None]:
# 100-tree random forest with a fixed seed for reproducibility.
# n_jobs=-1 builds (and later evaluates) the trees on all available cores;
# with random_state fixed, the fitted model is the same as the single-core one.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

**Train Random Forest classifier**

In [None]:
# Fit the forest on the training portion of the count matrix.
rf_model.fit(X=X_train, y=y_train)

**Predict on the test set**

In [None]:
# Predicted labels for the held-out rows, used by all metrics below.
y_pred = rf_model.predict(X=X_test)

 **Compute and print performance metrics**

Compute the confusion matrix

In [None]:
# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

Compute other performance metrics

In [None]:
# Overall accuracy plus class-support-weighted precision, recall and F1.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    weighted_metric(y_test, y_pred, average='weighted')
    for weighted_metric in (precision_score, recall_score, f1_score)
)

Print the results

In [None]:
# Report the confusion matrix first, then the scalar metrics one per line.
print("Confusion Matrix:", conf_matrix, sep="\n")
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Confusion Matrix:
[[20373    94]
 [  639 17665]]

Accuracy: 0.9810941167367362
Precision: 0.9814471993503695
Recall: 0.9810941167367362
F1 Score: 0.9810754635943761


**Evaluate model performance**

In [None]:
# Recompute overall accuracy on the held-out split and report it on its own.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9810941167367362


In [None]:
def _is_spam_prediction(prediction):
    """Return True when a single model output denotes the spam class.

    Handles string labels (case-insensitive 'spam') as well as a numeric
    encoding where 1 means spam.
    """
    if isinstance(prediction, str):
        return prediction.strip().lower() == 'spam'
    return bool(prediction == 1)


def predict_new_text(new_texts, model, fitted_vectorizer=None):
    """Classify raw e-mail texts and return lowercase 'spam' / 'ham' labels.

    Parameters
    ----------
    new_texts : iterable of str
        Raw, unprocessed e-mail bodies.
    model : object
        Fitted classifier exposing ``predict``. This argument is the model
        actually used for prediction.
    fitted_vectorizer : object, optional
        Fitted vectorizer exposing ``transform``. When omitted, the
        notebook-level ``vectorizer`` is used.

    Returns
    -------
    list of str
        One label per input text: ``'spam'`` or ``'ham'``.
    """
    if fitted_vectorizer is None:
        fitted_vectorizer = vectorizer  # notebook-level vectorizer

    # Apply the same cleaning and encoding that the training data went through.
    cleaned_texts = [preprocess_text(text) for text in new_texts]
    features = fitted_vectorizer.transform(cleaned_texts)

    # Use the model that was passed in, not a global.
    predictions = model.predict(features)

    # Comparing every prediction to 1 labels all string predictions as 'ham',
    # so map string and numeric outputs explicitly.
    predicted_labels = [
        'spam' if _is_spam_prediction(prediction) else 'ham'
        for prediction in predictions
    ]
    print("Predicted labels:", predicted_labels)
    return predicted_labels

In [None]:
def predict_new_texts(new_texts, rf_model, vectorizer):
    """Preprocess, vectorize and classify raw texts.

    Each text is cleaned with ``preprocess_text``, encoded with the given
    fitted ``vectorizer`` and passed to ``rf_model.predict``. The model's
    predictions are returned unchanged.
    """
    cleaned_texts = list(map(preprocess_text, new_texts))
    features = vectorizer.transform(cleaned_texts)
    return rf_model.predict(features)

# Smoke-test the prediction helper on two hand-written e-mails.
new_emails = [
    "Please join the team meeting on Monday at 10 AM in the conference room. We'll discuss our progress and next steps.",
    "Book your dream vacation now and save up to 70% on travel costs. Limited time offer!",
]
predictions = predict_new_texts(new_emails, rf_model, vectorizer)

# Show each e-mail next to the label the model assigned to it.
for email, prediction in zip(new_emails, predictions):
    print(f'Email: {email}\nPrediction: {prediction}\n')

Email: Please join the team meeting on Monday at 10 AM in the conference room. We'll discuss our progress and next steps.
Prediction: Ham

Email: Book your dream vacation now and save up to 70% on travel costs. Limited time offer!
Prediction: Spam



In [None]:
import joblib

In [None]:
# Persist the fitted forest to the working directory with joblib.
joblib.dump(value=rf_model, filename='rndf_model.joblib')

['rndf_model.joblib']

In [None]:
import pickle

In [None]:
# Pickle the fitted vectorizer and the model side by side: both are needed to
# classify raw text later (the vectorizer to encode it, the model to predict).
for artifact_path, artifact in (
    ('vectorizer.pkl', vectorizer),
    ('rndf_model.pkl', rf_model),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)