In [1]:
import pandas as pd
import numpy as np
from gmpy2 import numer

In [3]:
purchases_df = pd.read_parquet("../data/purchases_df.parquet").astype({
        "OperatingSystems": "category",
        "Browser": "category",
        "Region": "category",
        "TrafficType": "category",
        "Weekend": "int8",
    })
purchases_df.sample(5)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
6631,7,46.3,0,0.0,2,9.0,0.0,0.025,0.0,0.0,Sep,2,4,1,2,New_Visitor,0,False
7869,4,101.7,0,0.0,40,708.554762,0.009302,0.016434,0.0,0.0,Aug,2,2,3,13,Returning_Visitor,0,False
3210,0,0.0,0,0.0,4,536.5,0.0,0.05,0.0,0.0,May,2,2,9,2,Returning_Visitor,0,False
8880,0,0.0,0,0.0,7,42.0,0.057143,0.085714,0.0,0.0,Dec,2,4,6,1,Returning_Visitor,1,False
4588,0,0.0,0,0.0,10,701.0,0.0,0.03,0.0,0.0,May,1,1,4,3,Returning_Visitor,1,False


In [5]:
X = purchases_df.drop(columns="Revenue")
y = purchases_df["Revenue"]

### Bazowy model drzew decyzyjnych

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [7]:
categorical_features = purchases_df.select_dtypes(["category", "object"]).columns
categorical_features

Index(['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType',
       'VisitorType'],
      dtype='object')

In [8]:
numerical_features = purchases_df.select_dtypes([int, float]).columns
numerical_features

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay'],
      dtype='object')

In [9]:
categorical_transformer = Pipeline(steps=[
        ("one_hot_encoding", OneHotEncoder(handle_unknown="ignore")),
])

In [10]:
preprocessor = ColumnTransformer(transformers=[
        ("categorical", categorical_transformer, categorical_features),
        ("numerical", "passthrough", numerical_features),
])

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
from sklearn.tree import DecisionTreeClassifier

In [13]:
dt_param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(3, 50, 4),
    "min_samples_split": [2, 4, 8, 16],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": [None, "sqrt", "log2"],
    "class_weight": [None, "balanced"],
}

In [15]:
dt_classifier = Pipeline(steps=[
        ("preprocessing", preprocessor),
        ("decision_tree", GridSearchCV(DecisionTreeClassifier(random_state=253),
                                       param_grid=dt_param_grid,
                                       n_jobs=-1,
                                       scoring="f1",
                                       verbose=1,
                                       cv=5,
                                       return_train_score=True))
])
dt_classifier.fit(X, y)

Fitting 5 folds for each of 2304 candidates, totalling 11520 fits


In [25]:
dt_classifier.named_steps["decision_tree"].best_score_

0.655682165362291

Udało nam się osiągnąć pewien wynik F1 i postaramy się go przebić korzystając z metod ensemble, które poznaliśmy wcześniej.

### Próba z Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
rf_param_grid = {
    "n_estimators": [2, 5, 10, 25],
    "criterion": ["gini", "entropy"],
    "max_depth": range(3, 9, 2),
    "min_samples_split": [2, 4, 8, 16],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": [None, "sqrt", "log2"],
    "class_weight": [None, "balanced"],
}

In [28]:
rf_classifier = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("Random_forest", GridSearchCV(RandomForestClassifier(random_state=253),
                                   param_grid=rf_param_grid,
                                   n_jobs=-1,
                                   scoring="f1",
                                   cv=5,
                                   return_train_score=True))
])

In [29]:
rf_classifier.fit(X, y)

  _data = np.array(data, dtype=dtype, copy=copy,


In [30]:
print("Jedno drzewo: ", dt_classifier.named_steps["decision_tree"].best_score_, "\nRandom Forest: ", rf_classifier.named_steps["Random_forest"].best_score_)

Jedno drzewo:  0.655682165362291 
Random Forest:  0.6615774510287404


## Porównanie z Extra Trees

In [31]:
from sklearn.ensemble import ExtraTreesClassifier

In [32]:
et_classifier = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("Extra_tree", GridSearchCV(ExtraTreesClassifier(random_state=253),
                                param_grid=rf_param_grid,
                                scoring="f1",
                                n_jobs=-1,
                                cv=5,
                                return_train_score=True
                                ))
])
et_classifier.fit(X, y)

In [33]:
et_classifier.named_steps["Extra_tree"].best_score_

0.6365355447413861

In [34]:
print("Jedno drzewo: ", dt_classifier.named_steps["decision_tree"].best_score_, "\nRandom Forest: ", rf_classifier.named_steps["Random_forest"].best_score_, "\nExtra Trees: ", et_classifier.named_steps["Extra_tree"].best_score_)

Jedno drzewo:  0.655682165362291 
Random Forest:  0.6615774510287404 
Extra Trees:  0.6365355447413861
