In [3]:
import nltk
import pandas as pd
from nltk.corpus import movie_reviews
import random


In [4]:
# Fetch the movie_reviews corpus (NLTK reports "already up-to-date" when it is present).
nltk.download('movie_reviews')

# One (token list, category) pair per review file, iterating category by category.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Seed the stdlib RNG before shuffling: the row order produced here feeds the
# train/test split further down, so an unseeded shuffle makes the split and
# every reported metric differ from one run to the next.
random.seed(42)
random.shuffle(documents)

# Re-join the tokens into one string per review and keep its label alongside.
reviews_df = pd.DataFrame({
    'text': [' '.join(words) for words, label in documents],
    'label': [label for words, label in documents]
})

reviews_df.head()

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\erfan\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Unnamed: 0,text,label
0,""" the 44 caliber killer has struck again . "" s...",neg
1,""" you leave little notes on my pillow . i told...",pos
2,"note : ordinarily , moviereviews . org will no...",pos
3,"the blues brothers was a wonderful film , a hi...",neg
4,""" payback , "" brian helgeland ' s inauspicious...",neg


In [5]:
import string
from nltk.corpus import stopwords

# Download the stopwords corpus
nltk.download('stopwords')

# Set of English stopwords; clean_text tests each token for membership in it
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a review string before vectorization.

    Lowercases ``text``, deletes every character listed in
    ``string.punctuation``, drops the whitespace-separated tokens that are
    members of ``stop_words``, and returns the remaining tokens joined by
    single spaces.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_remover)
    kept_tokens = (token for token in normalized.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Clean every review and store the result next to the raw text
reviews_df['clean_text'] = reviews_df['text'].map(clean_text)

# Preview the first 5 rows: raw text, cleaned text and label
reviews_df.head()[['text', 'clean_text', 'label']]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erfan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,clean_text,label
0,""" the 44 caliber killer has struck again . "" s...",44 caliber killer struck starring john leguiza...,neg
1,""" you leave little notes on my pillow . i told...",leave little notes pillow told million times s...,pos
2,"note : ordinarily , moviereviews . org will no...",note ordinarily moviereviews org give away cri...,pos
3,"the blues brothers was a wonderful film , a hi...",blues brothers wonderful film hilarious comedy...,neg
4,""" payback , "" brian helgeland ' s inauspicious...",payback brian helgeland inauspicious directing...,neg


In [6]:
# NOTE(review): redundant cell — nltk is already imported at the top of the
# notebook and the stopwords corpus was already downloaded before clean_text
# was defined; this only re-checks that the corpus is up to date.
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erfan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF features
# NOTE(review): the vectorizer is fitted on every row of reviews_df, before the
# train/test split, so its vocabulary and IDF weights also reflect the rows
# that later end up in the test set.
vectorizer = TfidfVectorizer(max_features=5000)  # at most 5000 features
X = vectorizer.fit_transform(reviews_df['clean_text']).toarray()

# Labels
y = reviews_df['label']

# Show the shapes of the data
print("شکل X:", X.shape)
print("شکل y:", y.shape)


شکل X: (2000, 5000)
شکل y: (2000,)


In [8]:
# Check the label distribution (number of reviews per class)
reviews_df['label'].value_counts()


label
neg    1000
pos    1000
Name: count, dtype: int64

In [9]:
import numpy as np

# Text-length feature: number of whitespace-separated tokens in each cleaned review
text_lengths = reviews_df['clean_text'].map(lambda cleaned: len(cleaned.split()))

# Append the length as one extra column to the right of the TF-IDF matrix
X_with_length = np.concatenate([X, text_lengths.to_numpy()[:, np.newaxis]], axis=1)

print("شکل داده‌ها بعد از اضافه کردن طول متن:", X_with_length.shape)


شکل داده‌ها بعد از اضافه کردن طول متن: (2000, 5001)


In [10]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets (20% held out, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X_with_length, y, test_size=0.2, random_state=42
)

# Report the shape of each part
print("شکل X_train:", X_train.shape)
print("شکل X_test:", X_test.shape)
print("شکل y_train:", y_train.shape)
print("شکل y_test:", y_test.shape)


شکل X_train: (1600, 5001)
شکل X_test: (400, 5001)
شکل y_train: (1600,)
شکل y_test: (400,)


In [11]:
from sklearn.linear_model import LogisticRegression

# Build the model (the solver may run up to 1000 iterations)
model = LogisticRegression(max_iter=1000)

# Train the model on the training data
model.fit(X_train, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the test data
y_pred = model.predict(X_test)

# Compute the accuracy
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

# Show the confusion matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Accuracy: 0.845

Confusion Matrix:
 [[170  34]
 [ 28 168]]

Classification Report:
               precision    recall  f1-score   support

         neg       0.86      0.83      0.85       204
         pos       0.83      0.86      0.84       196

    accuracy                           0.84       400
   macro avg       0.85      0.85      0.84       400
weighted avg       0.85      0.84      0.85       400



In [13]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to try
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l2']
}

# 5-fold cross-validated search over the grid, scored by accuracy
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

print("بهترین پارامترها:", grid.best_params_)
print("بهترین دقت در کراس-ولیدیشن:", grid.best_score_)

# Take the estimator found with the best parameters and predict on the test set
# (no further training happens in these two lines)
best_model = grid.best_estimator_
best_pred = best_model.predict(X_test)

print("\nدقت نهایی روی تست:", accuracy_score(y_test, best_pred))



بهترین پارامترها: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
بهترین دقت در کراس-ولیدیشن: 0.8412499999999999

دقت نهایی روی تست: 0.855


In [16]:
import joblib

# Save the model
# NOTE(review): best_model was fitted on TF-IDF columns plus one extra length
# column, so anything that loads it must append that same length feature to
# the vectorizer output before calling predict.
joblib.dump(best_model, "sentiment_model.pkl")

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("مدل و وکتورایزر ذخیره شدند.")


مدل و وکتورایزر ذخیره شدند.


In [18]:
# Rebuild the TF-IDF matrix with the same settings used for the first vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(reviews_df['clean_text']).toarray()

# Length feature = number of tokens in the cleaned text, the same definition
# as the `text_lengths` feature the model was trained on. `apply(len)` counted
# characters instead, which put this column on a different scale from the
# training feature.
reviews_df['text_length'] = reviews_df['clean_text'].apply(
    lambda cleaned: len(cleaned.split())
)

# TF-IDF columns followed by the single length column.
X = np.hstack((X_tfidf, reviews_df[['text_length']].to_numpy()))


In [19]:
# Reload the persisted artifacts so this cell does not rely on variables left
# over from a cell that is no longer in the notebook.
# joblib.load unpickles the file: only load artifacts this notebook wrote itself.
loaded_model = joblib.load("sentiment_model.pkl")
loaded_vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Example review to classify — replace with any raw text.
sample_text = "This movie was wonderful, with a brilliant cast and a touching story."

# Preprocess the new sentence exactly like the training data
sample_clean = clean_text(sample_text)

# TF-IDF
sample_vec = loaded_vectorizer.transform([sample_clean]).toarray()

# Text length as a token count: the model was trained with
# len(clean_text.split()) as its last column, so a character count
# (len(sample_clean)) would feed it a value on the wrong scale.
sample_length = len(sample_clean.split())

# Combine TF-IDF + text length (length is the last column, as in training)
sample_features = np.hstack((sample_vec, [[sample_length]]))

# Predict
prediction = loaded_model.predict(sample_features)
print("نظر مدل:", prediction[0])


نظر مدل: pos


In [20]:
import pandas as pd
import numpy as np

# Save the full dataset
reviews_df.to_csv("full_reviews.csv", index=False)

# Save the training data.
# The labels are attached as a plain array: y_train still carries the row
# index of the original frame, while train_df has a fresh 0..n-1 index.
# Assigning the Series directly would align on index and pair rows with the
# wrong labels (or NaN where the index value is absent).
train_df = pd.DataFrame(X_train, columns=[f"feature_{i}" for i in range(X_train.shape[1])])
train_df['label'] = np.asarray(y_train)
train_df.to_csv("train_data.csv", index=False)

# Save the test data (labels attached positionally for the same reason)
test_df = pd.DataFrame(X_test, columns=[f"feature_{i}" for i in range(X_test.shape[1])])
test_df['label'] = np.asarray(y_test)
test_df.to_csv("test_data.csv", index=False)

print("✅ فایل‌ها ساخته شدند: full_reviews.csv, train_data.csv, test_data.csv")


✅ فایل‌ها ساخته شدند: full_reviews.csv, train_data.csv, test_data.csv
