In [1]:
# Imports
import os
from pathlib import Path

import joblib  # modeli kaydetmek için
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Project folders, expressed relative to the notebook's working directory.
PROJECT_ROOT = Path("../..")
DATA_DIR = PROJECT_ROOT / "data" / "raw"
MODEL_DIR = PROJECT_ROOT / "models"

# Create the model output folder up front so saving later cannot fail on a
# missing directory; an already existing folder is left as it is.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

DATA_DIR, MODEL_DIR


(WindowsPath('../../data/raw'), WindowsPath('../../models'))

In [2]:
# Load dataset
data_path = DATA_DIR / "online_shoppers_intention.csv"
df = pd.read_csv(data_path)

# Cast the target column to int and split it off from the feature columns.
df["Revenue"] = df["Revenue"].astype(int)
y = df["Revenue"]
X = df.drop(columns=["Revenue"])

# --- Feature engineering (same features as built in the earlier steps) ---

# Months flagged as high season by the Is_HighSeason feature.
high_season_months = ["Nov", "Dec", "Mar", "May"]

# Added to the denominator of PageValue_per_Product so that rows with
# ProductRelated == 0 do not divide by zero. For those rows the ratio becomes
# PageValues * 1000.
RATIO_EPS = 1e-3


def add_engineered_features(
    features: pd.DataFrame,
    season_months=tuple(high_season_months),
    eps: float = RATIO_EPS,
) -> pd.DataFrame:
    """Return a copy of ``features`` with the four engineered columns appended.

    The pipeline built later in this notebook is fitted on the output of this
    function, so any frame passed to that pipeline (including data at
    inference time) must go through this function first.

    Parameters
    ----------
    features : pd.DataFrame
        Frame containing at least ``Administrative_Duration``,
        ``Informational_Duration``, ``ProductRelated_Duration``,
        ``PageValues``, ``ProductRelated``, ``Month`` and ``VisitorType``.
    season_months : sequence of str
        ``Month`` values that set ``Is_HighSeason`` to 1.
    eps : float
        Constant added to ``ProductRelated`` before dividing.

    Returns
    -------
    pd.DataFrame
        New frame; the input is not modified. Added columns, in order:
        ``Total_Duration``, ``PageValue_per_Product``, ``Is_HighSeason``,
        ``Is_ReturningVisitor``.
    """
    out = features.copy()

    # Total time spent across the three page categories.
    out["Total_Duration"] = (
        out["Administrative_Duration"]
        + out["Informational_Duration"]
        + out["ProductRelated_Duration"]
    )

    # Page value normalised by the number of product-related pages.
    out["PageValue_per_Product"] = out["PageValues"] / (out["ProductRelated"] + eps)

    # 0/1 indicator flags.
    out["Is_HighSeason"] = out["Month"].isin(list(season_months)).astype(int)
    out["Is_ReturningVisitor"] = (out["VisitorType"] == "Returning_Visitor").astype(int)

    return out


X_fe = add_engineered_features(X)

print("Original shape:", X.shape)
print("FE shape:", X_fe.shape)
X_fe.head()


Original shape: (12330, 17)
FE shape: (12330, 21)


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,...,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Total_Duration,PageValue_per_Product,Is_HighSeason,Is_ReturningVisitor
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,...,1,1,1,1,Returning_Visitor,False,0.0,0.0,0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,...,2,2,1,2,Returning_Visitor,False,64.0,0.0,0,1
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,...,4,1,9,3,Returning_Visitor,False,0.0,0.0,0,1
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,...,3,2,2,4,Returning_Visitor,False,2.666667,0.0,0,1
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,...,3,3,1,4,Returning_Visitor,True,627.5,0.0,0,1


In [3]:
# Train / test split (used for evaluation only).
# 20% of the rows are held out; stratifying on y keeps the class ratio of the
# target the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_fe, y, **split_options)

X_train.shape, X_test.shape


((9864, 21), (2466, 21))

In [4]:
# Numeric / categorical column groups.
# "number" matches every integer and float width. Selecting only
# "int64"/"float64" would leave a column of any other width (for example an
# int32 produced by .astype(int) on some platforms) in neither group, and the
# ColumnTransformer below drops unlisted columns (remainder="drop") without
# any warning. Everything that is not numeric is treated as categorical, so
# the two lists always cover every column of X_fe.
numeric_features_fe = X_fe.select_dtypes(include="number").columns.tolist()
categorical_features_fe = X_fe.select_dtypes(exclude="number").columns.tolist()

# Guard against columns being lost or duplicated between the two groups.
assert sorted(numeric_features_fe + categorical_features_fe) == sorted(X_fe.columns), (
    "numeric and categorical feature lists must partition the columns of X_fe"
)

print("Numeric:", numeric_features_fe)
print("Categorical:", categorical_features_fe)

# NOTE(review): OperatingSystems, Browser, Region and TrafficType are stored
# as integers and therefore end up in the numeric group; they look like
# integer-coded categories -- confirm that scaling them as numbers is intended.

# Preprocessing: standardise the numeric columns, one-hot encode the rest.
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

# handle_unknown="ignore": a category not seen during fit is encoded as all
# zeros instead of raising at prediction time.
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features_fe),
        ("cat", categorical_transformer, categorical_features_fe),
    ]
)


Numeric: ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'Total_Duration', 'PageValue_per_Product', 'Is_HighSeason', 'Is_ReturningVisitor']
Categorical: ['Month', 'VisitorType', 'Weekend']


In [5]:
# Final model pipeline: preprocessing followed by the optimized RandomForest.

# Hyperparameters of the classifier, kept together in one place.
rf_params = {
    "n_estimators": 150,
    "min_samples_split": 5,
    "min_samples_leaf": 8,
    "max_features": "sqrt",
    "max_depth": None,
    "class_weight": "balanced",
    "random_state": 42,
}

final_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", RandomForestClassifier(**rf_params)),
    ]
)

final_model


0,1,2
,steps,"[('preprocess', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,150
,criterion,'gini'
,max_depth,
,min_samples_split,5
,min_samples_leaf,8
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [6]:
# Fit on the training split and score the held-out test split
# (last performance check before the final fit).

final_model.fit(X_train, y_train)

# Probability of the positive class (column 1) and the hard class labels.
y_proba = final_model.predict_proba(X_test)[:, 1]
y_pred = final_model.predict(X_test)

auc = roc_auc_score(y_test, y_proba)
acc = accuracy_score(y_test, y_pred)

print(
    f"Final Pipeline - Test Accuracy: {acc:.4f}",
    f"Final Pipeline - Test ROC-AUC: {auc:.4f}",
    sep="\n",
)
print("\nClassification report:\n", classification_report(y_test, y_pred))


Final Pipeline - Test Accuracy: 0.8751
Final Pipeline - Test ROC-AUC: 0.9288

Classification report:
               precision    recall  f1-score   support

           0       0.96      0.89      0.92      2084
           1       0.57      0.77      0.66       382

    accuracy                           0.88      2466
   macro avg       0.76      0.83      0.79      2466
weighted avg       0.90      0.88      0.88      2466



In [7]:
# Final training on the full dataset (train + test together).
# Goal: use every available row for the model that goes to production.
#
# clone() returns a new, unfitted pipeline with the same parameters as
# final_model. Building a second Pipeline around the same `preprocessor`
# object would share that object between both pipelines, and because a
# Pipeline fits its steps in place, fitting on the full data would also
# re-fit the preprocessing of the already evaluated final_model. Cloning also
# keeps the hyperparameters defined in a single place.
final_model_full = clone(final_model)

final_model_full.fit(X_fe, y)

print("Final model trained on full dataset:", X_fe.shape)


Final model trained on full dataset: (12330, 21)


In [8]:
# Persist the pipeline trained on the full dataset.

model_path = MODEL_DIR.joinpath("final_rf_pipeline.pkl")
joblib.dump(final_model_full, model_path)

model_path


WindowsPath('../../models/final_rf_pipeline.pkl')

In [9]:
# Reload the saved model and run a sanity check.
# joblib.load unpickles the file, so it should only be used on files from a
# trusted source; here the file was written by the previous cell.

loaded_model = joblib.load(model_path)

# Round-trip check: the reloaded pipeline must reproduce the probabilities of
# the in-memory pipeline it was saved from.
np.testing.assert_allclose(
    loaded_model.predict_proba(X_test),
    final_model_full.predict_proba(X_test),
    err_msg="reloaded model does not match the model that was saved",
)

# Take one example row from the test split. These rows were part of the
# full-data fit, so this is a smoke test and not a performance estimate.
sample_row = X_test.iloc[[0]]
true_label = y_test.iloc[0]

pred_proba = loaded_model.predict_proba(sample_row)[0, 1]
pred_label = loaded_model.predict(sample_row)[0]

print("True label:", true_label)
print("Predicted label:", pred_label)
print("Predicted proba (Revenue=1):", pred_proba)


True label: 0
Predicted label: 0
Predicted proba (Revenue=1): 0.01767674095790588


# Final Pipeline – Son Model ve Kaydetme Adımı

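Bu notebook'ta, önceki aşamalarda optimize edilen RandomForest modeli,
preprocessing adımları ile birlikte tek bir **pipeline** içinde birleştirilmiştir.
Feature engineering adımları ise pipeline'ın dışında, veri pipeline'a verilmeden
önce DataFrame üzerinde uygulanmaktadır; bu nedenle kaydedilen model, tahmin
sırasında aynı türetilmiş kolonların önceden hesaplanmış olmasını bekler.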

---

##  Final Pipeline Yapısı

- **Feature Engineering** (pipeline'dan önce, DataFrame üzerinde uygulanır)
  - Total_Duration
  - PageValue_per_Product
  - Is_HighSeason
  - Is_ReturningVisitor

- **Preprocessing**
  - Numeric değişkenler için: StandardScaler
  - Kategorik değişkenler için: OneHotEncoder(handle_unknown="ignore")

- **Model**
  - RandomForestClassifier (class_weight="balanced")  
  - Optimize edilmiş hiperparametreler:
    - n_estimators = 150  
    - min_samples_split = 5  
    - min_samples_leaf = 8  
    - max_features = "sqrt"  
    - max_depth = None  

---

##  Performans Özeti (Train/Test Split Üzerinde)

Final pipeline, train/test ayrımı üzerinden tekrar eğitilip test edildiğinde:

- Accuracy: ~0.88  
- ROC-AUC: ~0.93  
- Revenue=1 sınıfı için F1-score: ~0.66  
- Revenue=1 sınıfı için Recall: ~0.77  

Bu sonuçlar, önceki baseline ve ara modellerden daha iyi olup
final model olarak seçilmesini desteklemektedir.

---

##  Modelin Kaydedilmesi

- Final model, tüm veri (train + test) üzerinde yeniden eğitilmiştir.
- Eğitilen pipeline `models/final_rf_pipeline.pkl` dosyasına `joblib.dump` ile kaydedilmiştir.
- Örnek bir satır üzerinde yapılan yükleme ve tahmin testi, modelin
  diske doğru şekilde yazıldığını ve tekrar yüklenebildiğini göstermiştir.

Bu dosya, hem API entegrasyonu hem de ileride yapılacak değerlendirmeler için
**üretim ortamına alınabilecek son model** olarak kullanılacaktır.
