In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Notebook-wide settings: reproducibility knobs first, then file locations.
seed = 42          # used as random_state for the train/test split
test_split = 0.2   # fraction of rows held out as the test set

tools_dir = "tools"               # directory that receives the pickled artifacts
data_path = "data/chosen.jsonl"   # JSON-lines input file

In [None]:
# Create the artifact directory up front; an already existing directory is fine.
os.makedirs(name=tools_dir, exist_ok=True)

# Wczytywanie danych

Analiza danych została przeprowadzona w poprzednim etapie. Teraz wczytujemy przetworzone dane, zapisane na końcu poprzedniego etapu. Wybrany został podzbiór atrybutów, które według nas i przeprowadzonej analizy niosą najwięcej informacji o zmiennej celu.

In [None]:
# The input is a JSON-lines file: one JSON record per line.
df = pd.read_json(path_or_buf=data_path, lines=True)

In [None]:
# Display the loaded frame as the cell output.
df

In [None]:
# Separate the integer-cast `bought` target from the remaining input columns.
target_column = "bought"
y = df[target_column].astype(int)
X = df.drop(columns=[target_column])

- X - wejścia
- y - zmienna celu

# Przygotowanie danych

#### Sanity check

In [None]:
# Histogram of the target values; the trailing semicolon hides the Axes repr.
y.hist();

In [None]:
# Ratio of rows labelled 1 to rows labelled 0 in the target.
n_positive = np.count_nonzero(y == 1)
n_negative = np.count_nonzero(y == 0)
n_positive / n_negative

Klasy są niezbalansowane (w proporcji około 1:3)

In [None]:
# Total number of missing cells across all input columns.
X.isna().to_numpy().sum()

Nie ma już żadnych braków

#### Podział na zbiór uczący i testowy

In [None]:
# Hold out `test_split` of the rows as the test set. The target classes are
# imbalanced, so the split is stratified on `y`: both splits then keep the same
# class ratio instead of depending on how the random draw happens to fall.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_split,
    random_state=seed,
    stratify=y,
)

#### Kodowanie

In [None]:
# Partition the feature columns by dtype: numeric ones versus everything else.
categorical_columns = X_train.select_dtypes(exclude="number").columns
numerical_columns = X_train.select_dtypes(include="number").columns

In [None]:
# Fit the one-hot encoder on the training split only; categories that were not
# seen during fitting are ignored at transform time (handle_unknown='ignore').
encoder = (
    OneHotEncoder(handle_unknown='ignore')
    .fit(X_train[categorical_columns])
)

In [None]:
def _one_hot_frame(fitted_encoder, frame, columns):
    """Return the one-hot encoding of ``frame[columns]`` as a dense DataFrame.

    Parameters
    ----------
    fitted_encoder : an already fitted one-hot encoder
    frame : DataFrame holding the categorical columns to encode
    columns : labels of the categorical columns the encoder was fitted on

    Returns
    -------
    DataFrame with one column per encoded category, sharing ``frame``'s index
    so it can be concatenated column-wise with the numerical features.
    """
    # `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer its replacement and fall back only on older releases.
    if hasattr(fitted_encoder, "get_feature_names_out"):
        feature_names = fitted_encoder.get_feature_names_out(columns)
    else:
        feature_names = fitted_encoder.get_feature_names(input_features=columns)
    dense = fitted_encoder.transform(frame[columns]).toarray()
    return pd.DataFrame(dense, columns=feature_names, index=frame.index)


encoded_categorical_train = _one_hot_frame(encoder, X_train, categorical_columns)
encoded_categorical_test = _one_hot_frame(encoder, X_test, categorical_columns)

In [None]:
# Rebuild each split as: numerical columns followed by the one-hot columns.
X_train = pd.concat(
    objs=[X_train[numerical_columns], encoded_categorical_train],
    axis="columns",
)
X_test = pd.concat(
    objs=[X_test[numerical_columns], encoded_categorical_test],
    axis="columns",
)

In [None]:
# Display the training features after one-hot encoding.
X_train

Po zakodowaniu mamy 36 kolumn

In [None]:
# Persist the fitted encoder as a pickle inside tools_dir.
encoder_path = tools_dir + '/encoder.pkl'
with open(encoder_path, 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

Zapiszemy enkoder, żeby użyć go później na serwerze

#### Normalizacja

In [None]:
# Fit the scaler on the training split only.
scaler = (
    StandardScaler()
    .fit(X_train)
)

In [None]:
# Apply the scaler (fitted on the training split) to both splits.
# Fresh frames are built instead of assigning through `.loc[:, :]`: the scaled
# values are floats, and writing them into existing columns is dtype-incompatible
# for any integer-typed column (deprecated behaviour in recent pandas versions).
# Index and column labels are carried over unchanged.
# NOTE: the names are rebound, so re-running this cell without re-running the
# cells above would scale already scaled data.
X_train = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [None]:
# Display the training features after scaling.
X_train

In [None]:
# Persist the fitted scaler as a pickle inside tools_dir.
scaler_path = tools_dir + '/scaler.pkl'
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

Zapisujemy skaler (`StandardScaler`), żeby użyć go później na serwerze

In [None]:
# Display the prepared training inputs.
X_train

In [None]:
# Display the training target.
y_train

In [None]:
# Display the prepared test inputs.
X_test

In [None]:
# Display the test target.
y_test

# Uczenie

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline configured with the "most_frequent" strategy.
clfD = (
    DummyClassifier(strategy="most_frequent")
    .fit(X_train, y_train)
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Untuned logistic regression with library-default hyperparameters.
clfA = (
    LogisticRegression()
    .fit(X_train, y_train)
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Untuned random forest, limited to depth 10. The forest is randomized, so
# `random_state=seed` is passed to make the fitted model (and every score
# reported for it) reproducible across kernel restarts.
clfB = RandomForestClassifier(max_depth=10, random_state=seed).fit(X_train, y_train)

Tworzymy bazowe klasyfikatory przed strojeniem hiperparametrów

Następnie robimy strojenie

In [None]:
from sklearn.metrics import make_scorer

# Wrap balanced accuracy as a scorer object usable by the search classes.
scorer = make_scorer(score_func=balanced_accuracy_score)

In [None]:
# Randomized search over logistic-regression hyperparameters.
params = {
    "penalty": ["none", "l2"],
    "tol": [1e-4, 1e-5, 1e-3],
    "C": [1, 2, 3],
    "fit_intercept": [True, False],
    "class_weight": ["balanced", None],
    "max_iter": np.arange(50, 150)
}
model = LogisticRegression()
# `scoring=scorer` makes the search optimise balanced accuracy (the metric used
# in the rest of the notebook) instead of the estimator's default score, which
# is misleading on imbalanced classes. `random_state=seed` fixes which
# candidates are sampled, so the search is reproducible.
grid = RandomizedSearchCV(
    model,
    params,
    scoring=scorer,
    random_state=seed,
    n_jobs=-1,
)
grid.fit(X_train, y_train)
# With `scoring` set, this reports the same metric on the held-out split.
print(grid.score(X_test, y_test))
grid.best_params_


Strojenie regresji daje prawie niezmienione parametry domyślne

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over random-forest hyperparameters, scored with balanced
# accuracy. NOTE: this is a long-running cell (every parameter combination
# trains 500-tree forests once per CV fold).
params = {
    'max_depth': [3,5,8,12,15,30],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
}
# `random_state=seed` makes every candidate forest deterministic, so the
# selected parameters and the printed score do not change between runs.
model = RandomForestClassifier(class_weight="balanced", n_estimators=500, random_state=seed)
grid = GridSearchCV(model, params, scoring=scorer, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.score(X_test, y_test))
grid.best_params_


Wyjście GridSearchCV:
(Po długim czasie)

0.6839481308144144

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 'log2'}
 
Komórka GridSearch została zamieniona na Markdown, aby nie można jej było przez przypadek uruchomić

Strojenie Random Forest

In [None]:
# Logistic regression refitted with fixed, hand-entered hyperparameters.
clfA_tuned = LogisticRegression(
    C=2,
    penalty='l2',
    tol=1e-5,
    max_iter=78,
    fit_intercept=True,
    class_weight=None,
).fit(X_train, y_train)

In [None]:
# Random forest refitted with fixed, hand-entered hyperparameters.
# `random_state=seed` pins the randomized tree construction so that the model
# pickled at the end of the notebook is the same on every run.
clfB_tuned = RandomForestClassifier(
    n_estimators=500,
    max_depth=8,
    max_features='log2',
    criterion='entropy',
    bootstrap=True,
    class_weight="balanced",
    random_state=seed,
    n_jobs=-1,
).fit(X_train, y_train)

Tworzymy modele z nastrojonymi hiperparametrami

# Porównanie

Najpierw porównujemy wyniki modeli nienastrojonych:

In [None]:
# Untuned logistic regression (clfA): balanced accuracy and precision-recall
# curve on the training split, then the default `score`, balanced accuracy and
# precision-recall curve on the held-out split.
train_predictions = clfA.predict(X_train)
print("Zbiór treningowy\nBalanced Accuracy Score: ")
print(balanced_accuracy_score(y_train, train_predictions))

disp = plot_precision_recall_curve(clfA, X_train, y_train)

test_score = clfA.score(X_test, y_test)
print("Zbiór walidacyjny\nScore: ")
print(test_score)

test_predictions = clfA.predict(X_test)
print("Balanced Accuracy Score: ")
print(balanced_accuracy_score(y_test, test_predictions))

disp = plot_precision_recall_curve(clfA, X_test, y_test)

In [None]:
# Untuned random forest (clfB): balanced accuracy and precision-recall curve on
# the training split, then the default `score`, balanced accuracy and
# precision-recall curve on the held-out split.
train_predictions = clfB.predict(X_train)
print("Zbiór treningowy\nBalanced Accuracy Score: ")
print(balanced_accuracy_score(y_train, train_predictions))

disp = plot_precision_recall_curve(clfB, X_train, y_train)

test_score = clfB.score(X_test, y_test)
print("Zbiór walidacyjny\nScore: ")
print(test_score)

test_predictions = clfB.predict(X_test)
print("Balanced Accuracy Score: ")
print(balanced_accuracy_score(y_test, test_predictions))

disp = plot_precision_recall_curve(clfB, X_test, y_test)

Następnie wyniki modeli nastrojonych:

In [None]:
# Tuned logistic regression (clfA_tuned): balanced accuracy and precision-recall
# curve on the training split, then the default `score`, balanced accuracy and
# precision-recall curve on the held-out split.
train_predictions = clfA_tuned.predict(X_train)
print("Zbiór treningowy\nBalanced Accuracy Score: ")
print(balanced_accuracy_score(y_train, train_predictions))

disp = plot_precision_recall_curve(clfA_tuned, X_train, y_train)

test_score = clfA_tuned.score(X_test, y_test)
print("Zbiór walidacyjny\nScore: ")
print(test_score)

test_predictions = clfA_tuned.predict(X_test)
print("Balanced Accuracy Score: ")
print(balanced_accuracy_score(y_test, test_predictions))

disp = plot_precision_recall_curve(clfA_tuned, X_test, y_test)

In [None]:
# Tuned random forest (clfB_tuned): balanced accuracy and precision-recall curve
# on the training split, then the default `score`, balanced accuracy and
# precision-recall curve on the held-out split.
train_predictions = clfB_tuned.predict(X_train)
print("Zbiór treningowy\nBalanced Accuracy Score: ")
print(balanced_accuracy_score(y_train, train_predictions))

disp = plot_precision_recall_curve(clfB_tuned, X_train, y_train)

test_score = clfB_tuned.score(X_test, y_test)
print("Zbiór walidacyjny\nScore: ")
print(test_score)

test_predictions = clfB_tuned.predict(X_test)
print("Balanced Accuracy Score: ")
print(balanced_accuracy_score(y_test, test_predictions))

disp = plot_precision_recall_curve(clfB_tuned, X_test, y_test)

Jak można zauważyć, strojenie Random Forest znacznie poprawia wyniki

Na koniec zapisujemy modele do plików zewnętrznych

In [None]:
# Persist both tuned classifiers as pickles inside tools_dir.
for model_suffix, tuned_model in (('/A.pkl', clfA_tuned), ('/B.pkl', clfB_tuned)):
    with open(tools_dir + model_suffix, 'wb') as model_file:
        pickle.dump(tuned_model, model_file)