In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Locations: the preprocessed dataset (JSON Lines) and the folder for saved artifacts.
data_path, tools_dir = "data/chosen.jsonl", "tools"

# Hold-out fraction for the test set and the seed used for the split.
test_split, seed = 0.2, 42

In [None]:
# Create the artifact directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(tools_dir, exist_ok=True)

# Wczytywanie danych

Analiza danych została przeprowadzona w poprzednim etapie. Teraz wczytujemy przetworzone dane zapisane na końcu tamtego etapu. Wybrano podzbiór atrybutów, które – według nas i przeprowadzonej analizy – mają największy potencjał informacyjny.

In [None]:
# lines=True: the file is parsed as one JSON object per line.
df = pd.read_json(data_path, lines=True)

In [None]:
# Display the loaded frame for a quick visual check.
df

In [None]:
# Features: every column except the "bought" label.
X = df.drop(columns="bought")
# Target: the "bought" column cast to integers.
y = df["bought"].astype(int)

- X - wejścia
- y - zmienna celu

# Przygotowanie danych

#### Sanity check

In [None]:
# Bar chart of the class counts: going through value_counts() gives exactly one
# bar per distinct label instead of spreading a two-valued target over default
# histogram bins. The trailing semicolon suppresses the text repr, so nothing
# has to be assigned to `_` (which would shadow IPython's last-output variable).
ax = y.value_counts().sort_index().plot.bar(rot=0)
ax.set(title="Target class distribution", xlabel="bought", ylabel="number of rows");

In [None]:
# Ratio of positive (y == 1) to negative (y == 0) examples.
n_bought = np.count_nonzero(y == 1)
n_not_bought = np.count_nonzero(y == 0)
n_bought / n_not_bought

Klasy są niezbalansowane: na jeden przykład klasy pozytywnej (`bought = 1`) przypadają około trzy przykłady klasy negatywnej (proporcja około 1:3).

In [None]:
# Total number of missing cells across all feature columns.
X.isna().to_numpy().sum()

W danych wejściowych nie ma już żadnych brakujących wartości.

#### Podział na zbiór uczący i testowy

In [None]:
# Stratify on y so that the train and test sets keep the same class proportions;
# with imbalanced classes a purely random split can shift the minority share.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_split,
    random_state=seed,
    stratify=y,
)

#### Kodowanie

In [None]:
# Split the feature names by dtype: numeric columns on one side,
# every non-numeric column on the other (treated as categorical below).
numerical_columns = X_train.select_dtypes(include="number").columns
categorical_columns = X_train.select_dtypes(exclude="number").columns

In [None]:
# Fit on the training split only; handle_unknown='ignore' makes categories that were
# not seen during fitting encode as all zeros instead of raising at transform time.
encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train[categorical_columns])

In [None]:
def _one_hot_frame(frame):
    """Return the one-hot encoding of ``frame``'s categorical columns as a DataFrame.

    The result carries ``frame``'s index, so it can be concatenated column-wise
    with the numerical columns of the same split.
    """
    # get_feature_names() is deprecated since scikit-learn 1.0 in favour of
    # get_feature_names_out(); fall back to the old name only where the new
    # one does not exist yet.
    feature_names = getattr(encoder, "get_feature_names_out", None) or encoder.get_feature_names
    return pd.DataFrame(
        encoder.transform(frame[categorical_columns]).toarray(),
        columns=feature_names(input_features=categorical_columns),
        index=frame.index,
    )


encoded_categorical_train = _one_hot_frame(X_train)
encoded_categorical_test = _one_hot_frame(X_test)

In [None]:
# Rebuild each split as [numerical columns | one-hot columns]. Both pieces carry
# the split's own index, so the column-wise concat pairs the rows one-to-one.
X_train = pd.concat((X_train[numerical_columns], encoded_categorical_train), axis="columns")
X_test = pd.concat((X_test[numerical_columns], encoded_categorical_test), axis="columns")

In [None]:
# Show the training features after one-hot encoding.
X_train

Po zakodowaniu mamy 36 kolumn

In [None]:
# Persist the fitted encoder as a pickle inside the tools directory.
encoder_path = tools_dir + '/encoder.pkl'
with open(encoder_path, mode='wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

Zapisujemy enkoder, żeby użyć go później na serwerze.

#### Normalizacja

In [None]:
# Fit the scaler on the training split only, so that no statistics of the
# test split enter the preprocessing.
scaler = StandardScaler().fit(X_train)

In [None]:
# Rebuild both frames from the scaled arrays instead of assigning through
# .loc[:, :]: writing floats into the existing columns in place depends on
# pandas implicitly upcasting any integer column, which newer pandas versions
# deprecate. Index and column labels are carried over unchanged.
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [None]:
# Show the training features after standardisation.
X_train

In [None]:
# Persist the fitted scaler as a pickle inside the tools directory.
scaler_path = tools_dir + '/scaler.pkl'
with open(scaler_path, mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

Zapisujemy skaler (obiekt `StandardScaler`), żeby użyć go później na serwerze.

In [None]:
# Final training features that go into model fitting.
X_train

In [None]:
# Training labels, indexed like X_train.
y_train

In [None]:
# Final test features (encoded and scaled with the train-fitted transformers).
X_test

In [None]:
# Test labels, indexed like X_test.
y_test

# Uczenie

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default hyperparameters, fitted on the training split.
clfA = LogisticRegression().fit(X_train, y_train)

In [None]:
from sklearn.dummy import DummyClassifier

# Majority-class baseline. It gets its own name so that it does not overwrite
# clfA (the logistic regression fitted in the previous cell), which the
# comparison section evaluates next to clfA_tuned.
clf_baseline = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

# Reference score every real model has to beat: a predictor that always outputs
# the majority class reaches a balanced accuracy of 0.5 on a two-class target.
balanced_accuracy_score(y_test, clf_baseline.predict(X_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest capped at depth 10. random_state fixes the forest's internal
# randomness, so re-running the notebook yields the same model.
clfB = RandomForestClassifier(max_depth=10, random_state=seed).fit(X_train, y_train)

In [None]:
# Randomised hyperparameter search for logistic regression.
params = {
    "penalty": ["none", "l2"],
    "tol": [1e-4, 1e-5, 1e-3],
    "C": [1, 2, 3],
    "fit_intercept": [True, False],
    "class_weight": ["balanced", None],
    "max_iter": np.arange(50, 150)
}
model = LogisticRegression()
# scoring: candidates are ranked by balanced accuracy, the same metric the
# comparison section reports, rather than the default plain accuracy.
# random_state: fixes which parameter combinations are sampled, so the search
# (and therefore best_params_) is reproducible.
grid = RandomizedSearchCV(model, params, scoring="balanced_accuracy", random_state=seed)
grid.fit(X_train, y_train)
# Score of the refitted best estimator on the held-out test split.
print(grid.score(X_test, y_test))
grid.best_params_


In [None]:
# Randomised hyperparameter search for the random forest.
params = {
    'max_depth': np.arange(1, 30),
    'max_features': np.arange(4, 20),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}
# Seed the estimator as well as the search: the forest is stochastic on its
# own, so seeding only the parameter sampling would not make the CV scores repeatable.
model = RandomForestClassifier(n_jobs=-1, random_state=seed)
# scoring: candidates are ranked by balanced accuracy, the same metric the
# comparison section reports, rather than the default plain accuracy.
grid = RandomizedSearchCV(model, params, scoring="balanced_accuracy", random_state=seed)
grid.fit(X_train, y_train)
# Score of the refitted best estimator on the held-out test split.
print(grid.score(X_test, y_test))
grid.best_params_


In [None]:
# Logistic regression refitted with a fixed, hard-coded hyperparameter set.
logreg_settings = dict(
    C=3,
    class_weight=None,
    fit_intercept=True,
    max_iter=105,
    penalty='l2',
    tol=0.0001,
)
clfA_tuned = LogisticRegression(**logreg_settings).fit(X_train, y_train)

In [None]:
# Random forest refitted with a fixed, hard-coded hyperparameter set.
# random_state makes the fitted forest (which is pickled later) reproducible
# across notebook runs.
clfB_tuned = RandomForestClassifier(
    min_samples_split=5,
    min_samples_leaf=10,
    max_features=7,
    max_depth=14,
    criterion='entropy',
    bootstrap=False,
    class_weight="balanced",
    n_jobs=-1,
    random_state=seed,
).fit(X_train, y_train)

# Porównanie

In [None]:
# Plain NumPy copy of the test labels for positional comparison with predictions.
testArray = y_test.to_numpy(copy=True)

In [None]:
# Evaluation of clfA: balanced accuracy and precision-recall curve on the
# training split, then accuracy, balanced accuracy and precision-recall curve
# on the test split.
acc_score = balanced_accuracy_score(y_train, clfA.predict(X_train))
print("Balanced Accuracy Score (train): ")
print(acc_score)

disp = plot_precision_recall_curve(clfA, X_train, y_train)
disp.ax_.set_title("clfA - train")

pred = clfA.predict(X_test)
# Vectorised count of predictions that match the true test labels.
correct = int(np.count_nonzero(pred == testArray))
# correct / total is plain accuracy (not a correct-to-incorrect ratio).
print("Accuracy (correct / all predictions): ")
print(correct/pred.size)

acc_score = balanced_accuracy_score(y_test, pred)
print("Balanced Accuracy Score (test): ")
print(acc_score)

disp = plot_precision_recall_curve(clfA, X_test, y_test)
disp.ax_.set_title("clfA - test");

In [None]:
# Evaluation of clfB: balanced accuracy and precision-recall curve on the
# training split, then accuracy, balanced accuracy and precision-recall curve
# on the test split.
acc_score = balanced_accuracy_score(y_train, clfB.predict(X_train))
print("Balanced Accuracy Score (train): ")
print(acc_score)

disp = plot_precision_recall_curve(clfB, X_train, y_train)
disp.ax_.set_title("clfB - train")

pred = clfB.predict(X_test)
# Vectorised count of predictions that match the true test labels.
correct = int(np.count_nonzero(pred == testArray))
# correct / total is plain accuracy (not a correct-to-incorrect ratio).
print("Accuracy (correct / all predictions): ")
print(correct/pred.size)

acc_score = balanced_accuracy_score(y_test, pred)
print("Balanced Accuracy Score (test): ")
print(acc_score)

disp = plot_precision_recall_curve(clfB, X_test, y_test)
disp.ax_.set_title("clfB - test");

In [None]:
# Evaluation of clfA_tuned: balanced accuracy and precision-recall curve on the
# training split, then accuracy, balanced accuracy and precision-recall curve
# on the test split.
acc_score = balanced_accuracy_score(y_train, clfA_tuned.predict(X_train))
print("Balanced Accuracy Score (train): ")
print(acc_score)

disp = plot_precision_recall_curve(clfA_tuned, X_train, y_train)
disp.ax_.set_title("clfA_tuned - train")

pred = clfA_tuned.predict(X_test)
# Vectorised count of predictions that match the true test labels.
correct = int(np.count_nonzero(pred == testArray))
# correct / total is plain accuracy (not a correct-to-incorrect ratio).
print("Accuracy (correct / all predictions): ")
print(correct/pred.size)

acc_score = balanced_accuracy_score(y_test, pred)
print("Balanced Accuracy Score (test): ")
print(acc_score)

disp = plot_precision_recall_curve(clfA_tuned, X_test, y_test)
disp.ax_.set_title("clfA_tuned - test");

In [None]:
# Evaluation of clfB_tuned: balanced accuracy and precision-recall curve on the
# training split, then accuracy, balanced accuracy and precision-recall curve
# on the test split.
acc_score = balanced_accuracy_score(y_train, clfB_tuned.predict(X_train))
print("Balanced Accuracy Score (train): ")
print(acc_score)

disp = plot_precision_recall_curve(clfB_tuned, X_train, y_train)
disp.ax_.set_title("clfB_tuned - train")

pred = clfB_tuned.predict(X_test)
# Vectorised count of predictions that match the true test labels.
correct = int(np.count_nonzero(pred == testArray))
# correct / total is plain accuracy (not a correct-to-incorrect ratio).
print("Accuracy (correct / all predictions): ")
print(correct/pred.size)

acc_score = balanced_accuracy_score(y_test, pred)
print("Balanced Accuracy Score (test): ")
print(acc_score)

disp = plot_precision_recall_curve(clfB_tuned, X_test, y_test)
disp.ax_.set_title("clfB_tuned - test");

In [None]:
# Persist both tuned classifiers as pickles inside the tools directory.
for file_name, classifier in (('/clfA.pkl', clfA_tuned), ('/clfB.pkl', clfB_tuned)):
    with open(tools_dir + file_name, 'wb') as f:
        pickle.dump(classifier, f)