<a href="https://colab.research.google.com/github/AmirMasoudes/Practice/blob/main/SpamFiltering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re
import requests
import tarfile
import io
import os

# بارگیری دیتاست نظرات IMDB
def load_imdb_dataset():
    url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    response = requests.get(url)
    tar = tarfile.open(fileobj=io.BytesIO(response.content), mode="r:gz")
    tar.extractall()

    def read_reviews(path, sample_size=500):
        reviews = []
        sentiments = []
        for label in ['pos', 'neg']:
            sentiment = 1 if label == 'pos' else 0
            labeled_path = os.path.join(path, label)
            for review_file in os.listdir(labeled_path)[:sample_size]:
                with open(os.path.join(labeled_path, review_file), 'r', encoding='utf-8') as file:
                    reviews.append(file.read())
                    sentiments.append(sentiment)
        return reviews, sentiments

    train_reviews, train_sentiments = read_reviews('aclImdb/train', sample_size=500)
    test_reviews, test_sentiments = read_reviews('aclImdb/test', sample_size=500)

    train_df = pd.DataFrame({'review': train_reviews, 'sentiment': train_sentiments})
    test_df = pd.DataFrame({'review': test_reviews, 'sentiment': test_sentiments})

    return train_df, test_df

train_df, test_df = load_imdb_dataset()

# پیش‌پردازش داده‌ها
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

train_df['review'] = train_df['review'].apply(preprocess_text)
test_df['review'] = test_df['review'].apply(preprocess_text)

# تبدیل متن به ویژگی‌ها
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['review'])
X_test = vectorizer.transform(test_df['review'])
y_train = train_df['sentiment']
y_test = test_df['sentiment']

# ایجاد و آموزش مدل Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)

# ارزیابی مدل
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

# پیش‌بینی نظرات جدید
new_reviews = ["This movie was fantastic!", "I did not like this film at all."]
new_reviews_preprocessed = [preprocess_text(review) for review in new_reviews]
new_reviews_vectorized = vectorizer.transform(new_reviews_preprocessed)
predictions = model.predict(new_reviews_vectorized)
print(predictions)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.795
              precision    recall  f1-score   support

           0       0.75      0.88      0.81       500
           1       0.86      0.71      0.77       500

    accuracy                           0.80      1000
   macro avg       0.80      0.79      0.79      1000
weighted avg       0.80      0.80      0.79      1000

[1 0]
