In [None]:
import pandas as pd
import numpy as np


# Folder that holds the CSV inputs, relative to the notebook's working directory.
path_to_data_folder = "../Data/"

# Read the training and test tables into DataFrames.
train, test = (
    pd.read_csv(f"{path_to_data_folder}{csv_name}")
    for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Print a short profile of every column of the training frame:
# dtype / non-null summary, descriptive statistics, and the first three values.
for col in train.columns:
    print(f"-- {col} -- ")
    # Series.info() writes its summary to stdout itself and returns None;
    # wrapping it in print() would emit a stray "None" line after the summary.
    train[col].info()
    print()  # blank line separating the summary from the statistics
    print(train[col].describe(), end="\n\n")
    print(train[col].head(3))
    print("--------------------", end="\n\n")

-- customer_id -- 
<class 'pandas.core.series.Series'>
RangeIndex: 4000 entries, 0 to 3999
Series name: customer_id
Non-Null Count  Dtype 
--------------  ----- 
4000 non-null   object
dtypes: object(1)
memory usage: 31.4+ KB
None

count                                     4000
unique                                    4000
top       d3b99f7e-d0ac-4199-8813-91665f16b3fb
freq                                         1
Name: customer_id, dtype: object

0    d3b99f7e-d0ac-4199-8813-91665f16b3fb
1    4fd24bc2-10ac-4a18-96cd-1f062ba5024c
2    0a77605c-e86e-4b72-91a1-7a49969c1ac1
Name: customer_id, dtype: object
--------------------

-- age -- 
<class 'pandas.core.series.Series'>
RangeIndex: 4000 entries, 0 to 3999
Series name: age
Non-Null Count  Dtype
--------------  -----
4000 non-null   int64
dtypes: int64(1)
memory usage: 31.4 KB
None

count    4000.000000
mean       43.686250
std        15.482029
min        18.000000
25%        30.000000
50%        44.000000
75%        57.000000
max    

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# 1. Drop the customer_id column.
# errors="ignore" keeps this cell idempotent: on a re-run the column is already
# gone and a plain drop() would raise KeyError.
train = train.drop(columns=["customer_id"], errors="ignore")
test = test.drop(columns=["customer_id"], errors="ignore")

# 2. Check for missing values (if any).
print("Missing values in train:\n", train.isnull().sum())
print("Missing values in test:\n", test.isnull().sum())

# The rest of this cell assumes there are no missing values; otherwise they
# would have to be handled (e.g. imputed) before scaling / encoding.

# 3. Separate the numerical and categorical feature columns.
numerical_cols = [
    "age",
    "watch_hours",
    "last_login_days",
    "monthly_fee",
    "number_of_profiles",
    "avg_watch_time_per_day",
]
categorical_cols = [
    "gender",
    "subscription_type",
    "region",
    "device",
    "payment_method",
    "favorite_genre",
]

# 4. Encoding and scaling.
# For simplicity, categorical columns are one-hot encoded and numerical columns
# are standardised with StandardScaler.

# Target column:
target = "churned"

# Define the preprocessor. handle_unknown="ignore" makes the encoder tolerate
# categories at transform time that were not present when it was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

# Split the training frame into features and target.
X_train = train.drop(columns=[target])
y_train = train[target]

# Fit the preprocessor on the training features, then apply the same fitted
# transform to the test features.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(test)

print("Preprocessing done.")
print(f"Shape of processed train data: {X_train_processed.shape}")
print(f"Shape of processed test data: {X_test_processed.shape}")

Missing values in train:
 age                       0
gender                    0
subscription_type         0
watch_hours               0
last_login_days           0
region                    0
device                    0
monthly_fee               0
payment_method            0
number_of_profiles        0
avg_watch_time_per_day    0
favorite_genre            0
churned                   0
dtype: int64
Missing values in test:
 age                       0
gender                    0
subscription_type         0
watch_hours               0
last_login_days           0
region                    0
device                    0
monthly_fee               0
payment_method            0
number_of_profiles        0
avg_watch_time_per_day    0
favorite_genre            0
dtype: int64
Preprocessing done.
Shape of processed train data: (4000, 35)
Shape of processed test data: (1000, 35)


In [None]:
from sklearn.model_selection import train_test_split

# Split the data into 80% training and 20% validation, stratified on the target.
# The split is done on the *raw* features so that the preprocessor can be fitted
# on the training part only: fitting the scaler / encoder on all rows first and
# splitting afterwards lets validation-row statistics leak into training.
X_train_raw, X_val_raw, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Re-fit the preprocessor on the training rows, then apply that same fitted
# transform to the validation rows and to the test set so that all three share
# one feature space.
X_train_split = preprocessor.fit_transform(X_train_raw)
X_val_split = preprocessor.transform(X_val_raw)
X_test_processed = preprocessor.transform(test)

print(f"Training set shape: {X_train_split.shape}")
print(f"Validation set shape: {X_val_split.shape}")

Training set shape: (3200, 35)
Validation set shape: (800, 35)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build the logistic-regression classifier and fit it on the training split
# (fit() returns the estimator itself, so construction and training chain).
model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_split, y_train_split
)

# Predict labels for the validation split.
y_val_pred = model.predict(X_val_split)

# Evaluate: overall accuracy first, then the per-class report and confusion matrix.
accuracy = accuracy_score(y_val_split, y_val_pred)
print(f"Accuracy on validation set: {accuracy:.4f}\n")

for heading, metric in (
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_val_split, y_val_pred))

Accuracy on validation set: 0.8850

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.88       398
           1       0.87      0.90      0.89       402

    accuracy                           0.89       800
   macro avg       0.89      0.88      0.88       800
weighted avg       0.89      0.89      0.88       800

Confusion Matrix:
[[346  52]
 [ 40 362]]


In [None]:
from sklearn.metrics import (
    f1_score,
    confusion_matrix,
    classification_report,
    accuracy_score,
)

# Predict on the training split as well, to compare against validation.
y_train_pred = model.predict(X_train_split)

# Macro-averaged F1 for each split (training first, then validation).
f1_train, f1_val = (
    f1_score(y_true, y_pred, average="macro")
    for y_true, y_pred in (
        (y_train_split, y_train_pred),
        (y_val_split, y_val_pred),
    )
)

print(f"F1 Score (macro) on Training set: {f1_train:.4f}")
print(f"F1 Score (macro) on Validation set: {f1_val:.4f}")

# Optionally, a more detailed per-class report for each split.
for banner, y_true, y_pred in (
    ("\n--- Training Set Classification Report ---", y_train_split, y_train_pred),
    ("\n--- Validation Set Classification Report ---", y_val_split, y_val_pred),
):
    print(banner)
    print(classification_report(y_true, y_pred))

F1 Score (macro) on Training set: 0.8975
F1 Score (macro) on Validation set: 0.8850

--- Training Set Classification Report ---
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      1590
           1       0.90      0.90      0.90      1610

    accuracy                           0.90      3200
   macro avg       0.90      0.90      0.90      3200
weighted avg       0.90      0.90      0.90      3200


--- Validation Set Classification Report ---
              precision    recall  f1-score   support

           0       0.90      0.87      0.88       398
           1       0.87      0.90      0.89       402

    accuracy                           0.89       800
   macro avg       0.89      0.88      0.88       800
weighted avg       0.89      0.89      0.88       800



In [None]:
# Predict on the test set (which contains features only, no target column).
y_test_pred = model.predict(X_test_processed)

# Build the output DataFrame: a single "churned" column of predicted labels.
submission = pd.Series(y_test_pred, name="churned").to_frame()

# Show the first five rows as the cell's output.
submission.head(5)

Unnamed: 0,churned
0,0
1,1
2,1
3,1
4,1
