In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the German credit CSV and discard its 'Unnamed: 0' column
# (presumably a row index written out by an earlier export — TODO confirm).
data = (
    pd.read_csv('./data/german_credit_data.csv')
    .drop(columns='Unnamed: 0')
)

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [4]:
# Add a constant 'date' column (the same string on every row). The frame is
# modified in place; the column is later named via `to_date='date'` when
# calling preprocessing().
data['date'] = '2022/08/31'

In [5]:
from preprocessing import preprocessing

In [6]:
# Run the project-local preprocessing helper on `data` and print the schema of
# what it returns.
# NOTE(review): the keyword semantics live in preprocessing.py — presumably
# NA='mean_mode' selects mean/mode imputation, to_date names a column to parse
# as a date and dummies a column to one-hot encode; confirm against that module.
preprocessed_preview = preprocessing(
    data,
    NA='mean_mode',
    to_date='date',
    dummies='Purpose',
)
preprocessed_preview.info()

In [7]:
# Split `data` column-wise by dtype.
# data_num: only the numeric columns.
data_num = data.select_dtypes(include=[np.number])
# data_cat: whatever is left once the numeric columns are removed.
data_cat = data.drop(columns=data_num.columns)

In [8]:
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence

from sklearn.ensemble import RandomForestClassifier

In [9]:
# Second dataset: the raw German credit file. Its 'GoodCustomer' column is used
# as the classification target further down.
path = './data/german_raw.csv'
df = pd.read_csv(path)
# Preview the first rows.
df.head()

In [10]:
# Fixed seed so the fitted forest — and every partial-dependence curve derived
# from it — is identical across Restart & Run All.
clf = RandomForestClassifier(random_state=42)

# Feature matrix: every column except the target, run through the project-local
# preprocessing helper; any column that still contains NaN afterwards is dropped.
# NOTE(review): normalization=True / norm_type='norm' presumably rescales the
# numeric columns, so X would not be in the raw units of df — confirm in
# preprocessing.py.
X = preprocessing(
    df.drop('GoodCustomer', axis=1),
    NA='mean_mode',
    normalization=True,
    norm_type='norm',
    dummies='all',
).dropna(axis=1)

# Target vector: the 'GoodCustomer' label column.
y = df['GoodCustomer']

In [11]:
# Train the classifier on the preprocessed features X and the GoodCustomer labels y.
clf.fit(X, y)

In [12]:
# Averaged partial dependence computed on a subsample of X (at most 100 rows).
# - random_state pins the subsample so the displayed numbers are reproducible.
# - min(...) keeps the cell working if X ever has fewer than 100 rows.
# - 14 and 16 are positional column indices into X (see X.columns for names).
# NOTE(review): passing both indices in a single list presumably requests the
# joint (two-way) dependence, making the indexed result a 2-D grid — confirm
# this is intended rather than two separate one-way curves.
partial_dependence(
    clf,
    X=X.sample(n=min(100, len(X)), random_state=42),
    features=[14, 16],
    kind='average',
)['average'][0]

In [13]:
# Create the 20x20 figure explicitly and hand its axes to scikit-learn.
# Without `ax`, from_estimator opens a figure of its own, so a preceding
# plt.figure(figsize=...) would only leave an empty figure behind and the
# requested size would never reach the actual plot.
fig, ax = plt.subplots(figsize=(20, 20))
features = [14, 16]  # each number is the positional index of one feature (column of X)
PartialDependenceDisplay.from_estimator(clf, X, features, ax=ax)

In [14]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.ensemble import RandomForestRegressor

In [15]:
# Column of X whose one-way partial dependence is computed below.
# (Swap in 'LoanAmount' to inspect that feature instead.)
feature = 'Age'

# Averaged partial dependence of the fitted classifier on `feature`, evaluated
# over the full preprocessed matrix X.
pd_results = partial_dependence(clf, X, features=feature, kind="average")

In [16]:
# List the column names of the preprocessed feature matrix.
X.columns

In [17]:
# Show the selected feature's column from df (the frame as read from the CSV).
df[feature]

In [18]:
# scikit-learn renamed the grid entry of the partial_dependence result from
# 'values' to 'grid_values' (1.3) and later removed the old key, so use
# whichever one this installation provides.
grid_key = 'grid_values' if 'grid_values' in pd_results else 'values'

# One row per grid point: the feature value and the averaged model response.
# [0] picks the first entry of each result array (one feature was requested).
pdp_df = pd.DataFrame({
    'grid_values': pd_results[grid_key][0],
    'average': pd_results['average'][0],
})
# Record-oriented copy (list of dicts), the form passed to the Dash components as `data`.
pdp_dict = pdp_df.to_dict(orient='records')

# x-axis limits taken from the PDP grid only.
# (Alternative considered: widen them to also cover df[feature].min()/max().)
x_start = pdp_df['grid_values'].min()
x_end = pdp_df['grid_values'].max()


In [19]:
# Inspect the last grid points of the partial-dependence table.
pdp_df.tail()

In [20]:
import os
# Move into ./dash_interface/ — presumably so that the local my_dash_components
# package imported next can be found (TODO confirm).
# Guarded so that re-running this cell is a no-op: an unconditional chdir would
# fail (or descend into a nested folder) once we are already inside it.
if os.path.basename(os.getcwd()) != 'dash_interface':
    os.chdir('./dash_interface/')
from my_dash_components import ScatterplotComponent, LinearplotComponent, HistogramplotComponent, LinearHistComponent
from dash import dcc, html, Input, Output
import pandas as pd
from jupyter_dash import JupyterDash
# Counter behind getHost(): last octet of the most recently issued address.
hostId = 0
def getHost():
    """Return the next loopback address, advancing the module-level counter.

    Every call increments the global ``hostId`` by one and returns the string
    ``'127.0.0.<hostId>'``, so successive calls give 127.0.0.1, 127.0.0.2, ...
    """
    global hostId
    hostId = hostId + 1
    return f'127.0.0.{hostId}'

In [23]:
app = JupyterDash(__name__)

# Both charts receive the same x-range, derived from the PDP grid.
# NOTE(review): the histogram reads `feature` from df while the range comes
# from a grid computed on the preprocessed X; if preprocessing rescales that
# column the two may not share a scale — verify.
shared_x_range = {'xStart': x_start, 'xEnd': x_end}

# Line chart of the averaged partial dependence over the grid values.
pdp_line = LinearplotComponent(
    id='linear',
    data=pdp_dict,
    x_axis='grid_values',
    y_axis='average',
    **shared_x_range,
)

# Histogram of the selected feature, fed with every row of df as records.
feature_hist = HistogramplotComponent(
    id='hist',
    data=df.to_dict(orient='records'),
    value=feature,
    **shared_x_range,
)

# Empty container that the callback fills with the value reported by 'linear'.
click_output = html.Div(id='output')

app.layout = html.Div([pdp_line, feature_hist, click_output])

# Most recent value reported by the 'linear' component; module-level,
# presumably so other cells can read it after interacting with the app.
clickedValue = ""
@app.callback(
    Output(component_id="output", component_property="children"),
    Input(component_id="linear", component_property="value"),
)
def showDataPoint(value):
    """Echo the 'linear' component's ``value`` into the 'output' div.

    The value is also stored in the global ``clickedValue`` before being
    returned as the new children of the output element.
    """
    global clickedValue
    clickedValue = value
    return clickedValue

# Launch the Dash app when this code runs as the main module.
if __name__ == '__main__':
    app.run(mode="jupyterlab")
    # Alternative launch calls kept for reference:
    #app.run_server(mode='inline')
    #app.run_server(mode="inline", host=getHost())

    # TODO: check the grid values and check the histogram's `value`
