In [None]:
import pandas as pd
import numpy as np

# Folder (relative path) that holds the input CSV files.
path_to_data_folder = "../Data/"

# Load the train and test sets from that folder, in this order.
train, test = (
    pd.read_csv(f"{path_to_data_folder}{file_name}")
    for file_name in ("train.csv", "test.csv")
)

In [None]:
# Per-column overview of the training frame: structure and non-null counts,
# summary statistics, and the first three values.
for col in train.columns:
    print(f"-- {col} -- ")
    # Series.info() writes its report to stdout itself and returns None, so it
    # must not be wrapped in print() -- doing so emits a stray "None" line.
    train[col].info()
    print()  # blank separator line after the info report
    print(train[col].describe(), end="\n\n")
    print(train[col].head(3))
    print("--------------------", end="\n\n")

-- age -- 
<class 'pandas.core.series.Series'>
RangeIndex: 8000 entries, 0 to 7999
Series name: age
Non-Null Count  Dtype  
--------------  -----  
7231 non-null   float64
dtypes: float64(1)
memory usage: 62.6 KB
None

count    7231.000000
mean       43.588024
std        14.872058
min        18.000000
25%        31.000000
50%        43.000000
75%        56.000000
max        69.000000
Name: age, dtype: float64

0    54.0
1    34.0
2    39.0
Name: age, dtype: float64
--------------------

-- sleep_cycle -- 
<class 'pandas.core.series.Series'>
RangeIndex: 8000 entries, 0 to 7999
Series name: sleep_cycle
Non-Null Count  Dtype 
--------------  ----- 
7441 non-null   object
dtypes: object(1)
memory usage: 62.6+ KB
None

count           7441
unique             3
top       Early Bird
freq            2506
Name: sleep_cycle, dtype: object

0    Irregular
1    Night Owl
2    Irregular
Name: sleep_cycle, dtype: object
--------------------

-- exercise_habits -- 
<class 'pandas.core.series.Series'>

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Work on copies so the raw `train` / `test` frames stay untouched.
train_cleaned = train.copy()
test_cleaned = test.copy()

# 1. Drop duplicated rows (train only).
train_cleaned = train_cleaned.drop_duplicates()

# 2. Missing values in `age`.
# Impute with the median of the training data (the median is less sensitive to
# outliers than the mean). The same train-derived value is reused for test.
# The result is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# operates on an intermediate object, raises a FutureWarning, and no longer
# updates the frame once pandas 3.0 copy-on-write semantics apply.
median_age = train_cleaned["age"].median()
train_cleaned["age"] = train_cleaned["age"].fillna(median_age)
test_cleaned["age"] = test_cleaned["age"].fillna(median_age)

# 3. Outliers in `age` (Z-score, computed on train only; rows are removed from
# train only, the test frame keeps all of its rows).
z_scores = np.abs(
    (train_cleaned["age"] - train_cleaned["age"].mean()) / train_cleaned["age"].std()
)
# Keep rows with |z| < 3. `.copy()` makes the filtered result an independent
# frame, so the column assignments below do not write into a filtered slice.
train_cleaned = train_cleaned[z_scores < 3].copy()

# 4. Label-encode the non-numeric columns.
categorical_cols = [
    "sleep_cycle",
    "exercise_habits",
    "climate_zone",
    "historical_cuisine_exposure",
]

# One encoder per column, fitted on train and test together so both frames
# share the same label -> integer mapping.
# NOTE(review): `sleep_cycle` has missing values in train (see the info output
# above) and they are passed to LabelEncoder as-is -- confirm that encoding
# them as their own class is intended.
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    combined_data = pd.concat([train_cleaned[col], test_cleaned[col]], axis=0)
    le.fit(combined_data)
    train_cleaned[col] = le.transform(train_cleaned[col])
    test_cleaned[col] = le.transform(test_cleaned[col])
    encoders[col] = le  # kept in case the mapping is needed later

# 5. Standardize `age`: the scaler is fitted on train and then applied to test.
scaler = StandardScaler()
train_cleaned["age"] = scaler.fit_transform(train_cleaned[["age"]])
test_cleaned["age"] = scaler.transform(test_cleaned[["age"]])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_cleaned['age'].fillna(median_age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_cleaned['age'].fillna(median_age, inplace=True)


In [None]:
# Quick look at the first rows of both preprocessed frames (train, then test).
print(train_cleaned.head(), test_cleaned.head(), sep="\n")

        age  sleep_cycle  exercise_habits  climate_zone  \
0  0.711350            1                1             1   
1 -0.668748            2                2             0   
2 -0.323723            1                0             1   
3  1.746424            0                3             1   
4  0.987370            0                2             2   

   historical_cuisine_exposure  preferred_taste  
0                            0                3  
1                            0                3  
2                            1                0  
3                            0                1  
4                            0                1  
        age  sleep_cycle  exercise_habits  climate_zone  \
0  0.021301            2                1             3   
1  0.021301            1                1             0   
2 -1.427802            1                0             3   
3 -0.392728            2                2             0   
4  0.711350            2                0         

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target (y) from the feature columns (X).
y = train_cleaned["preferred_taste"]
X = train_cleaned.drop("preferred_taste", axis=1)

# Hold out 20% of the rows for validation (80% for training).
# Stratifying on y keeps the class distribution the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def _fit_and_score(model, label):
    """Fit ``model`` on the training split and report its validation accuracy.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.
        label: Human-readable model name used in the printed line.

    Returns:
        A ``(predictions, accuracy)`` tuple: the predictions for ``X_val`` and
        the accuracy of those predictions against ``y_val``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    accuracy = accuracy_score(y_val, predictions)
    print(f"{label} Accuracy: {accuracy:.4f}")
    return predictions, accuracy


# 1. Logistic Regression
logreg = LogisticRegression(max_iter=1000)
y_pred_log, acc_log = _fit_and_score(logreg, "Logistic Regression")

# 2. Decision Tree
dtree = DecisionTreeClassifier(random_state=42)
y_pred_tree, acc_tree = _fit_and_score(dtree, "Decision Tree")

# 3. Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_rf, acc_rf = _fit_and_score(rf, "Random Forest")

Logistic Regression Accuracy: 0.5755
Decision Tree Accuracy: 0.8545
Random Forest Accuracy: 0.8536


In [None]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Random Forest predictions on the training and validation splits.
y_train_pred, y_val_pred = (rf.predict(features) for features in (X_train, X_val))

# Macro-averaged F1 on each split.
f1_train, f1_val = (
    f1_score(actual, predicted, average="macro")
    for actual, predicted in ((y_train, y_train_pred), (y_val, y_val_pred))
)

print(f"F1-score (Train, macro): {f1_train:.4f}")
print(f"F1-score (Validation, macro): {f1_val:.4f}")

# Full per-class report on the validation split.
print(
    "\nClassification Report (Validation):",
    classification_report(y_val, y_val_pred),
    sep="\n",
)

# Confusion matrix on the validation split.
print(
    "Confusion Matrix (Validation):",
    confusion_matrix(y_val, y_val_pred),
    sep="\n",
)

F1-score (Train, macro): 0.9711
F1-score (Validation, macro): 0.8345

Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       185
           1       0.83      0.81      0.82       400
           2       0.77      0.77      0.77        31
           3       0.88      0.89      0.89       477

    accuracy                           0.85      1093
   macro avg       0.83      0.84      0.83      1093
weighted avg       0.85      0.85      0.85      1093

Confusion Matrix (Validation):
[[161  24   0   0]
 [ 27 322   0  51]
 [  0   0  24   7]
 [  1  43   7 426]]


In [None]:
# Predict labels for the preprocessed test frame with the Random Forest.
test_predictions = rf.predict(test_cleaned)

# Wrap the predictions in a one-column output frame.
submission = pd.Series(test_predictions, name="preferred_taste").to_frame()

# Show the first rows as a sanity check.
print(submission.head())

   preferred_taste
0                2
1                1
2                3
3                1
4                3
