<a href="https://colab.research.google.com/github/AmirMasoudes/SpamFiltering-/blob/main/spamfiltering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re
import requests
import tarfile
import io
import os

# Load the Enron email dataset
def load_enron_dataset():
    """Download the ``enron1.tar.gz`` archive and return its emails as a DataFrame.

    The archive is fetched over HTTP, unpacked into the current working
    directory, and every regular file under ``enron1/spam`` and
    ``enron1/ham`` is read as Latin-1 text.

    Returns:
        pandas.DataFrame: one row per email, with columns ``message`` (the
        raw file contents) and ``label`` (1 for spam, 0 for ham). Rows are
        ordered spam first, then ham, each sorted by file name.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
        tarfile.TarError: if the payload is not a readable gzip'd tar archive.
        FileNotFoundError: if the archive does not contain the expected
            ``enron1/spam`` or ``enron1/ham`` folders.
    """
    # NOTE(review): confirm this URL still serves the archive; the status
    # check below turns a missing file into a clear HTTP error.
    url = 'https://www.cs.cmu.edu/~enron/enron1.tar.gz'
    response = requests.get(url, timeout=60)
    # Fail here on 4xx/5xx instead of handing an HTML error page to tarfile.
    response.raise_for_status()

    with tarfile.open(fileobj=io.BytesIO(response.content), mode="r:gz") as tar:
        # The archive comes from the network: use the safe extraction filter
        # where the running Python provides it (blocks path traversal, links
        # pointing outside the destination, device files, ...).
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(filter='data')
        else:
            tar.extractall()

    emails = []
    labels = []

    def read_emails_from_folder(folder, label):
        # os.listdir() order is arbitrary; sorting keeps the row order stable
        # so that seeded sampling downstream is reproducible.
        for email_file in sorted(os.listdir(folder)):
            path = os.path.join(folder, email_file)
            if not os.path.isfile(path):
                continue
            with open(path, 'r', encoding='latin-1') as file:
                emails.append(file.read())
                labels.append(label)

    read_emails_from_folder('enron1/spam', 1)
    read_emails_from_folder('enron1/ham', 0)

    df = pd.DataFrame({'message': emails, 'label': labels})
    return df

df = load_enron_dataset()

# Use a smaller sample for faster runs. The size is capped at the number of
# available rows because sampling without replacement raises ValueError when
# more rows are requested than exist.
df = df.sample(n=min(2000, len(df)), random_state=42)

# Data preprocessing: fetch the NLTK stop-word corpus and keep the English
# list as a set for fast membership tests
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text, ignore_words=None):
    """Normalize a message before it is vectorized.

    Every non-word character (anything other than a letter, digit or
    underscore) is replaced by a space, the text is lower-cased, and the
    words found in the stop-word collection are dropped.

    Args:
        text: The raw message string.
        ignore_words: Optional collection of lower-case words to remove.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        str: The remaining words joined by single spaces (an empty string
        when nothing is left).
    """
    if ignore_words is None:
        ignore_words = stop_words
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    # split() without arguments also collapses the runs of spaces that the
    # substitution above leaves behind.
    kept = [word for word in text.split() if word not in ignore_words]
    return ' '.join(kept)

df['message'] = df['message'].apply(preprocess_text)

# Split the data into training and test sets. Stratifying on the label keeps
# the spam/ham ratio the same in both splits, so the evaluation is not skewed
# by an unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Convert text to TF-IDF features. The vocabulary is fitted on the training
# split only and then reused, unchanged, for the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Create and train the Naive Bayes model (fit() returns the fitted estimator)
model = MultinomialNB().fit(X_train, y_train)

# Evaluate the model on the test split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

# Predict labels for new messages, using the same preprocessing and the
# already-fitted vectorizer as for the training data
new_messages = [
    "Free entry in 2 a wkly comp to win FA Cup fina...",
    "Hey, how are you doing?",
]
new_messages_preprocessed = list(map(preprocess_text, new_messages))
new_messages_vectorized = vectorizer.transform(new_messages_preprocessed)
predictions = model.predict(new_messages_vectorized)
print(predictions)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Accuracy: 0.979372197309417
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

[1 0]
