In [22]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Load the review dataset from the working directory; the frame displayed below
# has a `lable` column (rating) and a `text` column (the review itself).
df = pd.read_csv('word.csv')
# Bare last expression so the notebook renders a truncated preview of the frame.
df

Unnamed: 0,lable,text
0,4,خوب
1,4,متأسفانه جهت بانوان باشگاه و استخر نبود
2,3,اتاق‌ها کهنه و وسایل قدیمی سرو صدای زیاد خیابان
3,3,بالشت و خوشخواب تخت افتضاح بود جوری که صبح کمر...
4,4,برای سمپوزیوم رفته بودم که به طور کلی صبحانه م...
...,...,...
15208,5,بسیار عالی هتلی در حد هتل پنج ستاره بنده برای ...
15209,5,بسیار از هتل راضی بودم و به نظرم واقعا عالی بود
15210,5,همه چیز عالی بود تمیزی، امکانات، پرسنل، نزدیکی...
15211,4,مدت زمان چک این و چک اوت زیاد است.


In [5]:
# Turn each review into TF-IDF weights over 1- to 3-grams, keeping at most
# 10,000 features.
# NOTE(review): the vectorizer is fitted on every row before the train/val/test
# split further down, so its vocabulary and IDF statistics also reflect the
# held-out rows.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])
# Densify the matrix so it fits a regular DataFrame with one column per term.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [6]:
# Keep only the label column (position 0 of the loaded frame) and place the
# TF-IDF features next to it, leaving the raw text column behind.
# Selecting the label by position, instead of concatenating everything and then
# dropping `df.columns[1]` by name, has two benefits:
#   * the cell is idempotent: re-running it rebuilds the same frame rather than
#     appending the features a second time and dropping a feature column;
#   * a TF-IDF feature that happens to share the text column's name is not
#     dropped along with it.
# Assumes the loaded frame is exactly [label, text], as the preview above shows.
df = pd.concat([df.iloc[:, [0]], tfidf_df], axis=1)

In [7]:
# The label sits in the first column; every remaining column is a feature.
label_column = df.columns[0]
Y = df.iloc[:, 0]
X = df.drop(columns=label_column)

In [8]:
# Hold out 20% of the rows, then cut that hold-out in half:
# roughly 80% train, 10% validation, 10% test. Fixed seeds keep it repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced.
holdout_split = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_temp, y_train, y_temp = holdout_split
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [7]:
# Learn the per-feature mean and variance from the training rows only, then
# apply that same transformation to all three splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [8]:
# One-hot encode the targets for the softmax / categorical cross-entropy model.
# Labels are shifted down by one to become zero-based class indices.
# NOTE(review): this assumes the raw labels are exactly 1..num_classes — confirm.
num_classes = np.unique(Y).size
y_train = to_categorical(y_train - 1, num_classes=num_classes)
y_val = to_categorical(y_val - 1, num_classes=num_classes)
y_test = to_categorical(y_test - 1, num_classes=num_classes)

# MLP

In [9]:
def create_model(input_dim, num_classes):
    """Build and compile a fully connected softmax classifier.

    The network stacks ReLU layers of 1024, 512, 256 and 128 units, applies
    50% dropout after each of the first three, and ends in a softmax layer.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Sequential model compiled with categorical cross-entropy loss, the
        Adam optimizer and an accuracy metric.
    """
    layers = [
        Dense(1024, input_dim=input_dim, activation='relu'),
        Dropout(0.5),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [10]:
# Stop a fold once val_loss has not improved for 5 epochs and roll the network
# back to its best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
kf = KFold(n_splits=5)

# Validation accuracy of every fold. Without recording these, the loop trains
# five networks but yields no cross-validation estimate at all.
fold_accuracies = []
for fold_number, (train_index, val_index) in enumerate(kf.split(X_train), start=1):
    X_train_kf, X_val_kf = X_train[train_index], X_train[val_index]
    y_train_kf, y_val_kf = y_train[train_index], y_train[val_index]

    # A fresh network per fold so no weights carry over between folds.
    model = create_model(X_train.shape[1], num_classes)
    model.fit(X_train_kf, y_train_kf, epochs=50, batch_size=64, validation_data=(X_val_kf, y_val_kf), callbacks=[early_stopping])

    # Score the fold on its own held-out part (silently, the fit log is long enough).
    fold_loss, fold_accuracy = model.evaluate(X_val_kf, y_val_kf, verbose=0)
    fold_accuracies.append(fold_accuracy)
    print(f'Fold {fold_number} Validation Accuracy: {fold_accuracy * 100:.2f}%')

print(f'Cross-Validation Accuracy: {np.mean(fold_accuracies) * 100:.2f}% (+/- {np.std(fold_accuracies) * 100:.2f}%)')
# NOTE: `model` is left bound to the network of the last fold, which was fitted
# on 4/5 of the training split; cells that use `model` afterwards evaluate that one.

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


In [11]:
# Score the current `model` on the held-out validation split;
# evaluate() returns the loss followed by the compiled accuracy metric.
val_metrics = model.evaluate(X_val, y_val)
loss, accuracy = val_metrics
print(f'Validation Accuracy: {accuracy * 100:.2f}%')

Validation Accuracy: 59.24%


In [12]:
# Score the current `model` on the held-out test split;
# evaluate() returns the loss followed by the compiled accuracy metric.
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 61.70%


# Naïve Bayes

In [9]:
# Rebuild the raw 80/10/10 split from X and Y. The MLP section rebinds these
# names to standardised arrays and one-hot labels; the same seeds reproduce the
# same partition.
holdout_split = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_temp, y_train, y_temp = holdout_split
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [10]:
# Rescale each feature using the minimum and maximum seen in the training rows
# only, then apply that same mapping to validation and test.
# (Presumably chosen because MultinomialNB needs non-negative inputs — confirm.)
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [11]:
# Convert the label Series to flat 1-D NumPy arrays for scikit-learn.
# np.asarray is used instead of `.values` so the cell stays re-runnable: on a
# second run the names already hold ndarrays, which have no `.values`
# attribute, whereas np.asarray accepts both a Series and an ndarray.
y_train = np.asarray(y_train).ravel()
y_val = np.asarray(y_val).ravel()
y_test = np.asarray(y_test).ravel()

In [16]:
# 5-fold cross-validated accuracy of multinomial Naïve Bayes on the training split.
# NOTE(review): X_train was scaled before this call, so each CV fold sees
# scaling statistics computed from the whole training split.
nb_model = MultinomialNB()
scores = cross_val_score(estimator=nb_model, X=X_train, y=y_train, cv=5)
print(f'Cross-Validation Accuracy: {np.mean(scores) * 100:.2f}%')

Cross-Validation Accuracy: 61.74%


In [17]:
# Fit on the full training split (fit() returns the estimator itself), then
# measure accuracy on the validation split.
y_val_pred = nb_model.fit(X_train, y_train).predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

Validation Accuracy: 59.63%


In [18]:
# Accuracy of the fitted Naïve Bayes model on the untouched test split.
y_test_pred = nb_model.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

Test Accuracy: 62.42%


In [19]:
# Per-class precision / recall / F1 on the test split.
# The class labels are taken from the data rather than assumed to be 1..K, and
# are passed explicitly via `labels=` so each display name stays attached to its
# class even when a class is absent from y_test / y_test_pred.
class_labels = np.unique(Y)
print(classification_report(y_test, y_test_pred, labels=class_labels, target_names=[str(label) for label in class_labels]))

              precision    recall  f1-score   support

           1       0.25      0.04      0.07        25
           2       0.55      0.12      0.20        48
           3       0.42      0.38      0.40       177
           4       0.61      0.66      0.63       652
           5       0.70      0.72      0.71       620

    accuracy                           0.62      1522
   macro avg       0.50      0.39      0.40      1522
weighted avg       0.62      0.62      0.61      1522



# Logistic Regression

In [12]:
# 5-fold cross-validated accuracy of logistic regression on the training split.
# NOTE(review): X_train / y_train are whatever the most recently executed
# preprocessing cells produced; this section does not rebuild them — confirm
# the intended scaling before relying on these numbers.
logreg_model = LogisticRegression(max_iter=1000)
scores = cross_val_score(estimator=logreg_model, X=X_train, y=y_train, cv=5)
print(f'Cross-Validation Accuracy: {np.mean(scores) * 100:.2f}%')

Cross-Validation Accuracy: 61.61%


In [13]:
# Fit on the full training split (fit() returns the estimator itself), then
# measure accuracy on the validation split.
y_val_pred = logreg_model.fit(X_train, y_train).predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

Validation Accuracy: 61.34%


In [14]:
# Accuracy of the fitted logistic-regression model on the untouched test split.
y_test_pred = logreg_model.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

Test Accuracy: 60.84%


In [15]:
# Per-class precision / recall / F1 on the test split.
# The class labels are taken from the data rather than assumed to be 1..K, and
# are passed explicitly via `labels=` so each display name stays attached to its
# class even when a class is absent from y_test / y_test_pred.
class_labels = np.unique(Y)
print(classification_report(y_test, y_test_pred, labels=class_labels, target_names=[str(label) for label in class_labels]))

              precision    recall  f1-score   support

           1       0.83      0.20      0.32        25
           2       0.38      0.12      0.19        48
           3       0.46      0.29      0.36       177
           4       0.58      0.69      0.63       652
           5       0.67      0.67      0.67       620

    accuracy                           0.61      1522
   macro avg       0.59      0.40      0.43      1522
weighted avg       0.60      0.61      0.60      1522



# CART

In [16]:
# 5-fold cross-validated accuracy of a decision tree on the training split;
# the fixed random_state keeps the tree construction repeatable.
cart_model = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(estimator=cart_model, X=X_train, y=y_train, cv=5)
print(f'Cross-Validation Accuracy: {np.mean(scores) * 100:.2f}%')

Cross-Validation Accuracy: 56.16%


In [17]:
# Fit on the full training split (fit() returns the estimator itself), then
# measure accuracy on the validation split.
y_val_pred = cart_model.fit(X_train, y_train).predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

Validation Accuracy: 55.03%


In [18]:
# Accuracy of the fitted decision tree on the untouched test split.
y_test_pred = cart_model.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

Test Accuracy: 54.01%


In [19]:
# Per-class precision / recall / F1 on the test split.
# The class labels are taken from the data rather than assumed to be 1..K, and
# are passed explicitly via `labels=` so each display name stays attached to its
# class even when a class is absent from y_test / y_test_pred.
class_labels = np.unique(Y)
print(classification_report(y_test, y_test_pred, labels=class_labels, target_names=[str(label) for label in class_labels]))

              precision    recall  f1-score   support

           1       0.33      0.32      0.33        25
           2       0.21      0.12      0.16        48
           3       0.38      0.31      0.34       177
           4       0.54      0.57      0.55       652
           5       0.60      0.61      0.61       620

    accuracy                           0.54      1522
   macro avg       0.41      0.39      0.40      1522
weighted avg       0.53      0.54      0.54      1522

