<h1>**PZ.U17 - Marketing bankowy.**</h1>
<ul>
    <li>Sebastian Smoliński - nr indeksu </li>
    <li>Oleksandr Drobinin - nr indeksu</li>
    <li>Poniższe analizy opierają się na pliku bank.csv dostępnym <a href="https://archive.ics.uci.edu/ml/datasets/Bank+Marketing">tutaj</a>.</li>
</ul>

In [None]:
import os
# Show what is available in the input directory (path is relative to the notebook's working directory).
print(os.listdir("../input/"))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from plotly import tools
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# The line below is needed when running inside a Jupyter Notebook
init_notebook_mode(connected=True)
# Alternative: load the data file from the notebook's own folder, or from another place by giving the full path
#df = pd.read_csv("bank.csv")

MAIN_PATH = '../input/bank-marketing-dataset/'#bank-marketing/ #bank-marketing-dataset/
df = pd.read_csv(MAIN_PATH +'bank.csv') #bank-additional-full.csv #bank.csv
# Independent copy of the raw data; `df` is filtered and re-encoded by the exploratory cells below,
# while `term_deposits` is what the modelling section works on.
term_deposits = df.copy()
# Display the first rows
df.head()


<h3> Podstawowe założenia </h3>
<a id="overall_analysis"></a>
***
<ul>
<li type="square">Na potrzeby dalszych operacji przy budowaniu modeli do klasyfikacji usunięto kolumnę "duration", gdyż zbyt mocno oddziaływała na wynik działania algorytmów. </li><br>
</ul>




In [None]:
# Summary statistics (count, mean, std, quartiles) of the data frame
df.describe()

Nie ma żadnych pustych wartości

In [None]:
# Column dtypes and non-null counts; used to confirm there are no missing values
df.info()

In [None]:
# Distribution of the data
import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so pick whichever spelling this installation provides.
_whitegrid_style = ('seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available
                    else 'seaborn-v0_8-whitegrid')
plt.style.use(_whitegrid_style)

# One histogram per numeric column
df.hist(bins=20, figsize=(14,10), color='#E14906')
plt.show()

In [None]:
# Class balance of the target column
df['deposit'].value_counts()

In [None]:
# Three panels: two box plots on the top row, one violin plot spanning the bottom row.
fig = plt.figure(figsize=(20,20))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(212)

# Balance split by credit-default flag and by deposit decision
g = sns.boxplot(x="default", y="balance", hue="deposit",
                data=df, palette="muted", ax=ax1)
g.set_title("Stan konta w zalężności od długości lokaty")

# Balance split by job and by deposit decision
g1 = sns.boxplot(x="job", y="balance", hue="deposit",
                 data=df, palette="RdBu", ax=ax2)
# Rotate the labels seaborn already assigned instead of overwriting them with
# df["job"].unique(): replacing the text can silently mislabel the boxes if the
# category order used by seaborn ever differs from the order of appearance.
g1.tick_params(axis="x", labelrotation=90)
g1.set_title("Rodzaj pracy w zależności od długości trwania lokaty")

# Target the bottom axes explicitly rather than relying on it being the "current" axes.
g2 = sns.violinplot(data=df, x="education", y="balance", hue="deposit",
                    palette="RdBu_r", ax=ax3)
g2.set_title("Stan konta w zależności od wykształcenia")

plt.show()

In [None]:
# Quick look at the first rows again
df.head()

<h3> Analiza struktury zatrudnienia </h3>
<ul> 
    <li><b>Wiek: </b>  Tak jak można przewidywać, najniższy średni wiek jest wśród studentów, a najwyższy wśród emerytów.</li>
    <li><b> Stan konta: </b> Kadra zarządzająca i emeryci to grupy najbardziej majętne. </li>
    </ul>

In [None]:
# Remove rows whose job is unknown
df = df.drop(df.loc[df["job"] == "unknown"].index)

# Merge administrative staff into the management category.
# (A single .loc assignment replaces the former loop over a one-element list.)
df.loc[df["job"] == "admin.", "job"] = "management"

In [None]:
# Age distribution, per occupation, of the clients who subscribed to a deposit.

suscribed_df = df.loc[df["deposit"] == "yes"]

occupations = df["job"].unique().tolist()

# Build the age arrays from `occupations` itself so every box is guaranteed to be
# labelled with the job it actually contains. The previous hand-written list only
# matched the labels if the jobs happened to appear in the file in that same order.
ages = [suscribed_df.loc[suscribed_df["job"] == job, "age"].values
        for job in occupations]

colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)',
          'rgba(44, 160, 101, 0.5)', 'rgba(255, 65, 54, 0.5)',
          'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)',
          'rgba(229, 126, 56, 0.5)', 'rgba(229, 56, 56, 0.5)',
          'rgba(174, 229, 56, 0.5)', 'rgba(229, 56, 56, 0.5)']

# One box trace per occupation. zip() stops at the shortest sequence, so at most
# len(colors) occupations are drawn.
traces = [
    go.Box(
        y=yd,
        name=xd,
        boxpoints='all',
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=cls,
        marker=dict(
            size=2,
        ),
        line=dict(width=1),
    )
    for xd, yd, cls in zip(occupations, ages, colors)
]

layout = go.Layout(
    title='Rozkład wiekowy w poszczególnych zawodach',
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='rgb(224,255,246)',
    plot_bgcolor='rgb(251,251,251)',
    showlegend=False
)

fig = go.Figure(data=traces, layout=layout)
iplot(fig)

<h3> Stan cywilny </h3>

In [None]:
# Number of clients per marital status
df['marital'].value_counts()

In [None]:
# Distinct marital-status labels, in order of first appearance
df['marital'].unique()

In [None]:
# The same counts as a plain Python list
df['marital'].value_counts().tolist()

In [None]:
# Account balance by marital status, one density histogram per status.

def _density_hist(values, label, colour):
    """Build a density-normalised histogram trace with the given name and bar colour."""
    return go.Histogram(
        x=values,
        histnorm='density',
        name=label,
        marker=dict(color=colour),
    )


# Balances of each marital group as plain arrays
single = df.loc[df['marital'] == 'single', 'balance'].values
married = df.loc[df['marital'] == 'married', 'balance'].values
divorced = df.loc[df['marital'] == 'divorced', 'balance'].values

single_dist = _density_hist(single, 'single', '#6E6E6E')
married_dist = _density_hist(married, 'married', '#2E9AFE')
divorced_dist = _density_hist(divorced, 'divorced', '#FA5858')

# Stack the three histograms vertically, one per subplot row
fig = tools.make_subplots(rows=3, print_grid=False)
for row_no, trace in enumerate((single_dist, married_dist, divorced_dist), start=1):
    fig.append_trace(trace, row_no, 1)

fig['layout'].update(showlegend=False, title="Zarobki w zależności od stanu cywilnego",
                    height=1000, width=800)

iplot(fig, filename='custom-sized-subplot-with-subplot-titles')

In [None]:

# Discard rows with an unknown education level, then list the levels that remain.
unknown_education_rows = df.index[df["education"] == "unknown"]
df = df.drop(unknown_education_rows)
df['education'].unique()

In [None]:
# First rows after the filtering above
df.head()

In [None]:
# Correlation matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
fig = plt.figure(figsize=(20,20))
# Turn the yes/no target into integers so it takes part in the correlation matrix.
df['deposit'] = LabelEncoder().fit_transform(df['deposit'])

# Separate the columns into numeric and text (object) parts
numeric_df = df.select_dtypes(exclude="object")
categorical_df = df.select_dtypes(include="object")

# Only numeric columns can be correlated. The former `categorical_df.corr()` call
# was never used and raises on pandas >= 2.0 (strings cannot be converted to float),
# so it has been removed.
corr_numeric = numeric_df.corr()

sns.heatmap(corr_numeric, cbar=True, cmap="RdBu_r")
plt.title("Macierz korelacji", fontsize=16)
plt.show()



<h2> <b>Model do klasyfikacji:</b> </h2>

In [None]:
# Move the target column to the front of the frame: pop() removes it in place and
# returns it, insert() puts it back at position 0.
dep = term_deposits.pop('deposit')
term_deposits.insert(0, 'deposit', dep)
term_deposits.head()
# Inspect (and later exclude) columns that distort the classifiers by dominating the data set
term_deposits["housing"].value_counts() / len(term_deposits)

In [None]:
# Share of each "loan" value in the full data set (reference for the stratified split below)
term_deposits["loan"].value_counts()/len(term_deposits)

## Próbkowanie warstwowe: 

W tym miejscu użyliśmy funkcji z pozycji podanej w literaturze, aby podzielić dane na zbiory w odpowiednich proporcjach, tzn. jeśli w pierwotnym zbiorze "loan" miało 87% "no" i 13% "yes", to i w naszych zbiorach treningowym i testowym chcemy mieć takie same proporcje, aby wyniki jak najlepiej odzwierciedlały zbiór pierwotny. Dodatkowo należy zaimplementować walidację krzyżową.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# Split into a training and a test set, stratified on "loan".
# random_state=42 is simply a commonly used seed for the internal RNG.
stratified = StratifiedShuffleSplit(n_splits=20, test_size=0.4, random_state=42)

# NOTE: every iteration overwrites the previous one, so only the last of the 20
# splits is kept. n_splits is left at 20 so the selected rows stay the same as before.
for train_set, test_set in stratified.split(term_deposits, term_deposits["loan"]):
    # split() yields positional indices, hence .iloc (not label-based .loc).
    stratified_train = term_deposits.iloc[train_set]
    stratified_test = term_deposits.iloc[test_set]

# Class proportions within each subset. The previous code divided by len(df),
# the size of a different, filtered frame, so the shares did not sum to 1.
print(stratified_train["loan"].value_counts(normalize=True))
stratified_test["loan"].value_counts(normalize=True)

In [None]:
# Separate labels and features
train_data = stratified_train # NOTE: plain assignment is an alias to the same DataFrame, not a copy
test_data = stratified_test
# Only the last expression of a cell is displayed; the two .shape lines produce no output.
train_data.shape
test_data.shape
train_data['deposit'].value_counts()

In [None]:
# Zasadniczo poniższy fragment służy tylko do przekonwertowania wartości opisowych na wartości numeryczne. Teoretycznie dałoby rade to zrobić przy przerobieniu df = df[columns].apply(LabelEncoder().fit_transform), ale wyniki nie były ciekawe.
# Pobrane z: Hands on Machine Learning with Scikit Learn and Tensorflow; Aurelien Geron.

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as a numeric array.

    The input to this transformer should be a matrix of integers or strings,
    denoting the values taken on by categorical (discrete) features.
    The features can be encoded using a one-hot aka one-of-K scheme
    (``encoding='onehot'``, the default) or converted to ordinal integers
    (``encoding='ordinal'``).

    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.

    Parameters
    ----------
    encoding : str, 'onehot', 'onehot-dense' or 'ordinal'
        The type of encoding to use (default is 'onehot'):

        - 'onehot': encode the features using a one-hot aka one-of-K scheme
          (or also called 'dummy' encoding). This creates a binary column for
          each category and returns a sparse matrix.
        - 'onehot-dense': the same as 'onehot' but returns a dense array
          instead of a sparse matrix.
        - 'ordinal': encode the features as ordinal integers. This results in
          a single column of integers (0 to n_categories - 1) per feature.
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories are sorted before encoding the data
          (used categories can be found in the ``categories_`` attribute).
    dtype : number type, default np.float64
        Desired dtype of output.
    handle_unknown : 'error' (default) or 'ignore'
        Whether to raise an error or ignore if an unknown categorical feature
        is present during transform (default is to raise). When this parameter
        is set to 'ignore' and an unknown category is encountered during
        transform, the resulting one-hot encoded columns for this feature
        will be all zeros.
        Ignoring unknown categories is not supported for
        ``encoding='ordinal'``.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting. When
        categories were specified manually, this holds the sorted categories
        (in order corresponding with output of `transform`).

    Examples
    --------
    >>> enc = CategoricalEncoder(handle_unknown='ignore')
    >>> _ = enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
    >>> enc.transform([[0, 1, 1], [1, 0, 4]]).toarray()
    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.],
           [ 0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]])
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.
        y : ignored
            Present only for scikit-learn pipeline compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``encoding`` or ``handle_unknown`` has an unsupported value, if
            ``handle_unknown='ignore'`` is combined with ``encoding='ordinal'``,
            or if explicit ``categories`` were given, ``handle_unknown='error'``
            and X contains a value outside of them.
        """

        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            # Report the offending `encoding` value (previously this
            # interpolated `handle_unknown` by mistake).
            raise ValueError(template % self.encoding)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        # The `np.object` alias was removed in NumPy 1.24; the builtin `object`
        # is the exact equivalent.
        X = check_array(X, dtype=object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        # One LabelEncoder per input column.
        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.isin(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                # User-supplied categories are installed directly, sorted.
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Transform X using the encoding chosen at construction time.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input: an integer-coded array for ``'ordinal'``, a
            sparse CSR matrix for ``'onehot'`` and a dense array for
            ``'onehot-dense'``.

        Raises
        ------
        ValueError
            If X contains a category not seen during ``fit`` and
            ``handle_unknown='error'``.
        """
        # Builtin object/int/bool replace the np.object/np.int/np.bool aliases
        # that were removed in NumPy 1.24.
        X = check_array(X, accept_sparse='csc', dtype=object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            valid_mask = np.isin(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        # Column offset of each feature's block in the one-hot output.
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Selects the relevant columns ahead of the conversion steps
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the named columns of its input.

    Parameters
    ----------
    attribute_names : the key(s) used to index X in ``transform``; the
        pipelines in this notebook pass a list of column names.
    """
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the step can be chained."""
        return self
    def transform(self, X):
        """Return ``X[self.attribute_names]``."""
        return X[self.attribute_names]

In [None]:
# Column dtypes of the training subset (needed to decide which columns go to which pipeline)
train_data.info()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric model inputs. "duration" is deliberately absent: the notebook's stated
# assumption is that it is removed before building classifiers because it
# dominates the result. It used to be listed here regardless.
NUMERIC_FEATURES = ["age", "balance", "day", "campaign", "pdays", "previous"]
CATEGORICAL_FEATURES = ["job", "education", "marital", "default", "housing",
                        "loan", "contact", "month", "poutcome"]

# Pipelines that feed the data to the models
# Numeric branch: pick the columns, then standardise them.
numerical_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(NUMERIC_FEATURES)),
    ("std_scaler", StandardScaler()),
])

# Categorical branch: pick the columns, then one-hot encode them into a dense array.
categorical_pipeline = Pipeline([
    ("select_cat", DataFrameSelector(CATEGORICAL_FEATURES)),
    ("cat_encoder", CategoricalEncoder(encoding='onehot-dense'))
])

from sklearn.pipeline import FeatureUnion
# Concatenate the outputs of both branches column-wise.
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("numerical_pipeline", numerical_pipeline),
        ("categorical_pipeline", categorical_pipeline),
    ])

In [None]:
# Fit the preprocessing on the training subset and transform it in one step
X_train = preprocess_pipeline.fit_transform(train_data)
X_train

In [None]:
# Target column of each subset (still the raw values at this point)
y_train = train_data['deposit']
y_test = test_data['deposit']
y_train.shape

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target as integers. The encoder is fitted on the training labels only
# and then reused for the test labels, so both share one mapping. Re-fitting on
# y_test (as before) could assign different codes if the label sets differed.
encode = LabelEncoder()
y_train = encode.fit_transform(y_train)
y_test = encode.transform(y_test)
# Boolean mask of the positive class
y_train_yes = (y_train == 1)
y_train
y_train_yes

In [None]:
# One preprocessed training row, used further down to inspect a single decision score
some_instance = X_train[1200]

In [None]:
#Inicjalizacja klasyfikatorów
import time


from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.gaussian_process.kernels import RBF
import xgboost


# Display name -> unfitted estimator; consumed by batch_classify below
dict_classifiers = {
    "Logistic Regression": LogisticRegression(solver='liblinear'),
    "XGBoost": xgboost.XGBClassifier(),
}

In [None]:
# Number of classifiers registered in dict_classifiers
no_classifiers = len(dict_classifiers.keys())

def batch_classify(X_train, Y_train, verbose = True, classifiers = None):
    """Fit every classifier on the training data and tabulate score and fit time.

    Parameters
    ----------
    X_train : array-like
        Feature matrix passed to each classifier's ``fit`` and ``score``.
    Y_train : array-like
        Target values passed to each classifier's ``fit`` and ``score``.
    verbose : bool, default True
        When true, print one line per classifier with its training time.
    classifiers : dict or None, default None
        Mapping of display name -> estimator exposing ``fit`` and ``score``.
        Defaults to the notebook-level ``dict_classifiers``.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with the columns 'Klasyfikator',
        'Skuteczność' (score on the *training* data, not a held-out set)
        and 'Czas trenowania' (wall-clock fit time in seconds).
    """
    if classifiers is None:
        classifiers = dict_classifiers

    # Collect plain records and build the frame once. The former version
    # pre-allocated zero rows and then wrote at index 100+, which left the
    # placeholder rows in the result next to the real ones.
    records = []
    for key, classifier in classifiers.items():
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed time.
        t_start = time.perf_counter()
        classifier.fit(X_train, Y_train)
        t_diff = time.perf_counter() - t_start
        train_score = classifier.score(X_train, Y_train)
        records.append({
            'Klasyfikator': key,
            'Skuteczność': train_score,
            'Czas trenowania': t_diff,
        })
        if verbose:
            print("Wytrenowany {c} w {f:.2f} s".format(c=key, f=t_diff))
    return pd.DataFrame(records, columns=['Klasyfikator', 'Skuteczność', 'Czas trenowania'])

In [None]:
# Train all registered classifiers and list them from best to worst score
df_results = batch_classify(X_train, y_train)
print(df_results.sort_values(by='Skuteczność', ascending=False))

### Problem zbytniego dopasowania:
Poprzez wykorzystanie wielokrotnej walidacji krzyżowej dążymy do tego, aby algorytm podążał według wzoru i nie brał pod uwagę "szumów"; ma aproksymować wyniki w sposób, który sprawdza się również na danych niewidzianych podczas trenowania.

In [None]:
# Cross-validation
from sklearn.model_selection import cross_val_score

# Fresh, unfitted estimators: logistic regression and XGBoost
log_reg = LogisticRegression(solver='liblinear')
xgb = xgboost.XGBClassifier()

# 20-fold cross-validated scores on the training data
log_scores = cross_val_score(log_reg, X_train, y_train, cv=20)
xgb_scores = cross_val_score(xgb, X_train, y_train, cv=20)

# Mean score over the folds
log_reg_mean = log_scores.mean()
xgb_mean = xgb_scores.mean()

# Data frame with the results
d = {
    'Klasyfikatory': ['Regresja Logistyczna', 'XGB'],
    'Średnie wyniki': [log_reg_mean, xgb_mean],
}
result_df = pd.DataFrame(data=d)

In [None]:
# Results of our classifiers, best mean cross-validation score first
result_df = result_df.sort_values(by=['Średnie wyniki'], ascending=False)
result_df

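**Positive/Negative:** Typ decyzji (label) ["No", "Yes"]<br>
**True/False:** Dobrze lub źle zakwalifikowane przez model<br><br>

**True Negatives (Lewo - Góra Kwadrat):** decyzje "No" poprawnie rozpoznane jako "No".

**False Positives (Prawo - Góra Kwadrat):** decyzje "No" błędnie zaklasyfikowane jako "Yes".

**False Negatives (Lewo - Dół Kwadrat):** decyzje "Yes" błędnie zaklasyfikowane jako "No".

**True Positives (Prawo - Dół Kwadrat):** decyzje "Yes" poprawnie rozpoznane jako "Yes".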

In [None]:
# Cross-validation for XGBoost: out-of-fold predictions for every training row (20 folds)
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(xgb, X_train, y_train, cv=20)

In [None]:
from sklearn.metrics import accuracy_score
# Fit on the whole training set for later use; the accuracy printed below is
# computed from the cross-validated predictions, not from this fitted model.
xgb.fit(X_train, y_train)
print ("Dokładność XGB wynosi %2.2f" % accuracy_score(y_train, y_train_pred))

In [None]:
# Confusion matrix for the XGBoost cross-validated predictions
from sklearn.metrics import confusion_matrix
# NOTE(review): class counts recorded by the original author (4697 "no", 4232 "yes") — verify against the current split
conf_matrix = confusion_matrix(y_train, y_train_pred)
f, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(conf_matrix, annot=True, fmt="d", linewidths=.5, ax=ax)
plt.title("Macierz dopasowań XGB",fontsize=20)
plt.subplots_adjust(left=0.15, right=0.99, bottom=0.15, top=0.99)
# Centre the y ticks on the heatmap cells, blank the x tick labels and name the rows
ax.set_yticks(np.arange(conf_matrix.shape[0]) + 0.5, minor=False)
ax.set_xticklabels("")
ax.set_yticklabels(['Odrzucone', 'Zaakceptowane'], fontsize=16, rotation=360)
plt.show()

In [None]:
# Precision and recall of the XGBoost cross-validated predictions
from sklearn.metrics import precision_score, recall_score
print('Precision Score: ', precision_score(y_train, y_train_pred))
print('Recall Score: ', recall_score(y_train, y_train_pred))

In [None]:
from sklearn.metrics import f1_score

# F1 score of the XGBoost cross-validated predictions
f1_score(y_train, y_train_pred)

In [None]:
# Cross-validation for logistic regression.
# NOTE: y_train_pred is overwritten here; from this cell on it holds the LR predictions.
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(log_reg, X_train, y_train, cv=20)
from sklearn.metrics import accuracy_score
# Fit on the whole training set (decision_function is called on it later); the
# accuracy printed below comes from the cross-validated predictions.
log_reg.fit(X_train, y_train)
print ("Dokładność LR wynosi %2.2f" % accuracy_score(y_train, y_train_pred))

In [None]:
# Confusion matrix for the logistic-regression cross-validated predictions
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_train, y_train_pred)
f, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(conf_matrix, annot=True, fmt="d", linewidths=.5, ax=ax)
plt.title("Macierz dopasowań LR",fontsize=20)
plt.subplots_adjust(left=0.15, right=0.99, bottom=0.15, top=0.99)
# Centre the y ticks on the heatmap cells, blank the x tick labels and name the rows
ax.set_yticks(np.arange(conf_matrix.shape[0]) + 0.5, minor=False)
ax.set_xticklabels("")
ax.set_yticklabels(['Odrzucone', 'Zaakceptowane'], fontsize=16, rotation=360)
plt.show()

In [None]:
# Precision and recall of the logistic-regression cross-validated predictions
from sklearn.metrics import precision_score, recall_score
print('Precision Score: ', precision_score(y_train, y_train_pred))
print('Recall Score: ', recall_score(y_train, y_train_pred))

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score
# A bare expression is only displayed when it is the last line of a cell, so the
# F1 value used to be computed and thrown away. Print it with the other metrics.
print('F1 Score: ', f1_score(y_train, y_train_pred))
print('Precision Score: ', precision_score(y_train, y_train_pred))
print('Recall Score: ', recall_score(y_train, y_train_pred))

In [None]:
from sklearn.metrics import f1_score

# F1 score of the logistic-regression cross-validated predictions
f1_score(y_train, y_train_pred)

# Zwrot i precyzja
<a id="precision_recall"></a>
**Zwrot:** Jaką część wszystkich decyzji "yes" wykrył nasz model <br><br>
**Precyzja:** Jaka część decyzji wskazanych przez model jako "yes" rzeczywiście jest "yes"

Zbyt duży nacisk na precyzję powoduje, że model może przewidzieć, że dana decyzja to "no", kiedy tak naprawdę jest to "yes" (spada wtedy zwrot).

In [None]:
# Raw decision score of the fitted logistic regression for the single sample picked earlier
y_scores = log_reg.decision_function([some_instance])
y_scores

In [None]:
# Changing the decision threshold: the sample is labelled positive when its score exceeds it
threshold = 0
y_some_digit_pred = (y_scores > threshold)

Co ma największy wpływ?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
plt.style.use('seaborn-white')
import random as rd

# Conversion: replace every text column by its integer category codes.
columns_to_encode = ['job', 'marital', 'education', 'contact', 'poutcome',
                     'month', 'default', 'loan', 'housing']
for column_name in columns_to_encode:
    term_deposits[column_name] = term_deposits[column_name].astype('category').cat.codes

# Building the subsets and tests: features are everything except the target
target_name = 'deposit'
X = term_deposits.drop('deposit', axis=1)

def getParam():
    """Draw one random XGBoost hyper-parameter set.

    Values are drawn from the module-level ``rd`` generator in a fixed order,
    so seeding ``rd`` beforehand makes the result reproducible.

    Returns
    -------
    dict
        Keys 'eta', 'max_depth', 'min_child_weight', 'gamma',
        'n_estimators', 'subsample' and 'colsample_bytree'.
    """
    # Step size per boosting iteration (kept small so the optimum is not overshot)
    eta = rd.random() / 3 + 0.001
    # Maximum depth of a decision tree
    max_depth = rd.randint(7, 17)
    # Minimum number of observations in each leaf
    min_child_weight = rd.randint(1, 10)
    # Loss reduction required to create another node
    gamma = rd.randint(0, 6)
    # Number of trees to build
    n_estimators = rd.randint(70, 150)
    # Fraction of observations used per tree (guards against over-fitting)
    subsample = 1 - rd.random() / 4
    # Fraction of features used per tree
    colsample_bytree = 1 - rd.random() / 4

    return {
        'eta': eta,
        'max_depth': max_depth,
        'min_child_weight': min_child_weight,
        'gamma': gamma,
        'n_estimators': n_estimators,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
    }
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Integer-encode the target: current xgboost releases reject string class labels.
label = LabelEncoder().fit_transform(term_deposits[target_name])

# "duration" is excluded in line with the notebook's stated assumption that it
# dominates the classifiers and must not be used as a model input.
X = term_deposits.drop(columns=[target_name, 'duration'], errors='ignore')

X_train, X_test, y_train, y_test = train_test_split(X,label,test_size=0.2, random_state=42, stratify=label)

# Decision tree (configured but not trained in this cell). Built through the
# directly imported class; the name `tree` is kept for any later cells.
tree = DecisionTreeClassifier(
    class_weight='balanced',
    min_weight_fraction_leaf = 0.01
)

maxAccuracy = 0.0
maxParam = {}
bestModel = None
# Fixed seed so the random search (and everything derived from it) is reproducible.
rd.seed(42)

# NOTE(review): hyper-parameters are selected on the test split, so the reported
# best accuracy is optimistic; a separate validation split would be cleaner.
# The `with` block guarantees the log file is closed even if a fit fails.
with open('csv_to_submit.csv', "wt") as resultFile:
    for i in range( 100):
        param = getParam()
        model = xgboost.XGBClassifier( **param )
        model.fit( X_train, y_train )
        yPred = model.predict( X_test )
        print( f"iteration: { i }\t{ param }" )
        accuracy = accuracy_score( y_test, yPred )
        print( "Accuracy: %.4f%%" % ( accuracy * 100.0 ) )
        resultFile.write( f"For parameters = { param }\tAccuracy = { accuracy }\n" )
        if bestModel is None or maxAccuracy < accuracy:
            maxAccuracy = accuracy
            maxParam = param
            bestModel = model
    resultFile.write( f"""\n\nBest accuracy = { maxAccuracy } for parameters:\n
    eta = { maxParam[ 'eta' ] }
    max_depth = { maxParam[ 'max_depth' ] }
    min_child_weight = { maxParam[ 'min_child_weight' ] }
    gamma = { maxParam[ 'gamma' ] }
    n_estimators = { maxParam[ 'n_estimators' ] }
    subsample = { maxParam[ 'subsample' ] }
    colsample_bytree = { maxParam[ 'colsample_bytree' ] }""" )

# Report the importances of the best model found, not of whichever model the
# loop happened to train last.
model = bestModel
importances = model.feature_importances_
# Take the names from the very frame the model was trained on so that names and
# importances cannot drift apart.
feature_names = X_train.columns
indices = np.argsort(importances)[::-1]


print("Ranking cech:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

def feature_importance_graph(indices, importances, feature_names):
    """Draw a horizontal bar chart of feature importances.

    Parameters
    ----------
    indices : array of int
        Feature positions in the order in which the bars are drawn.
    importances : array
        Importance value of every feature, indexed by feature position.
    feature_names : index/array of str
        Feature names, indexed by feature position.
    """
    bar_positions = range(len(indices))
    plt.figure(figsize=(12,6))
    axes = plt.gca()
    axes.set_title("Waga parametrów na klasyfikator", fontsize=18)
    axes.barh(bar_positions, importances[indices], color='#31B173',  align="center")
    axes.set_yticks(bar_positions)
    axes.set_yticklabels(feature_names[indices], rotation='horizontal', fontsize=14)
    axes.set_ylim([-1, len(indices)])

feature_importance_graph(indices, importances, feature_names)
plt.show()