In [8]:
import numpy as np
import pandas as pd

print("Numpy version:", np.__version__)
print("Pandas version:", pd.__version__)


Numpy version: 2.1.1
Pandas version: 2.2.2


In [9]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re


In [13]:
# Load dataset (replace with your dataset path if required)
df = pd.read_csv(r"C:\Users\GF63\anaconda3\envs\textclassificationproject-env\data\spam_assassin.csv")

# View the first few rows
df.head()


Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [17]:
# Define preprocessing function
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing to the dataset
df['processed_text'] = df['text'].apply(preprocess_text)
X = df['processed_text']
y = df['target']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Vectorize text data
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train the model
model = MultinomialNB()
model.fit(X_train_vect, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test_vect)
print(classification_report(y_test, y_pred))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GF63\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

           0       0.88      1.00      0.94      1172
           1       1.00      0.72      0.84       567

    accuracy                           0.91      1739
   macro avg       0.94      0.86      0.89      1739
weighted avg       0.92      0.91      0.90      1739



In [20]:
# Define the function to preprocess and predict
def predict_email_category(email_text, model, vectorizer):
    # Preprocess the text (cleaning, tokenization, etc.)
    processed_text = preprocess_text(email_text)
    
    # Transform the text to the same feature space as the training data
    text_features = vectorizer.transform([processed_text])
    
    # Predict the category
    prediction = model.predict(text_features)
    
    # Map the prediction to the original labels
    if prediction == 0:
        return "Non-Spam"
    else:
        return "Spam"


# Predict the category of the email
prediction = predict_email_category(email_text, model, vectorizer)
print(f"The email is classified as: {prediction}")


The email is classified as: Non-Spam


In [21]:
# Define the email content
email_text = """
Dear Danish Akhtar ., 

Join us for an insightful webinar in collaboration with Datamatics, where we will delve into the latest trends in RPA.

Explore advancements in automation, hyper-automation & enhanced capabilities through AI/ML & Natural Language Processing.

Gain valuable insights about career opportunities & job market trends in this rapidly evolving field.

Don't miss-out on this fantastic opportunity to stay ahead in the world of Robotic Process Automation!
"""

# Predict the category of the email
prediction = predict_email_category(email_text, model, vectorizer)
print(f"The email is classified as: {prediction}")


The email is classified as: Non-Spam
