In [316]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [3]:
files = ['august.csv', 'september.csv', 'october.csv'] 
PATH ='../data/private/csv/fam/'

df = [pd.read_csv(os.path.join(PATH, file)) 
      for file in files]
df = pd.concat(df, sort=False)

query = '(month == 8 and day >= 27) or (month == 9) or (month == 10 and day <= 7)'
df = df.query(query)
df.head()

Unnamed: 0,time,sysclass,lat,lon,dir,vel,size,ttyyyxx1,ttyyyxx2,ttyyyxx3,...,maxz,meanvil,ttvil,meanprec,maxprec,year,month,day,hour,minute
12871,0.0,0.0,-2.75,-58.57,-999.0,0.0,13.0,0.0,0.0,0.0,...,22.28,0.18,2.31,0.97,1.18,2014,8,27,2,24
12872,0.2,1.0,-2.73,-58.57,0.0,11.1,30.0,0.0,0.0,0.0,...,28.17,0.3,9.1,1.8,4.16,2014,8,27,2,24
12873,0.0,0.0,-2.73,-58.57,-999.0,0.0,21.0,0.0,0.0,0.0,...,23.01,0.18,3.76,1.02,1.32,2014,8,27,3,0
12874,0.2,1.0,-2.75,-58.57,180.0,11.1,13.0,0.0,0.0,0.0,...,22.28,0.18,2.28,0.97,1.09,2014,8,27,3,0
12875,0.0,0.0,-1.71,-59.63,-999.0,0.0,12.0,0.0,0.0,0.0,...,21.46,0.16,1.96,0.92,0.97,2014,8,27,3,48


In [292]:
# columns = [
#     'ttyyyxx3', 'riverfrac', 'convfrac', 'strafrac', 
#     'meanz', 'maxz', 'meanvil', 'ttvil', 'meanprec', 'maxprec',
# ]
columns = [
    'ttyyyxx3', 'riverfrac', 'convfrac', 'strafrac', 
    'maxz', 'ttvil', 'maxprec',
]
threshod = 1
samples = .10
np.random.seed(42)

dfy = df[df['ttyyyxx3'] >= threshod]
dfn = df[df['ttyyyxx3'] < threshod]
samples *=  100
samples = int((dfy[['ttyyyxx3']].count()*samples)/100)

dfy = dfy.sample(samples)
dfn = dfn.sample(samples)

df_ = pd.concat([dfy, dfn], sort=False).sample(samples*2)
X_train, X_test, y_train, y_test = train_test_split(df_[columns[1:]], \
                                                    df_[[columns[0]]], \
                                                    test_size=.2)
dtr = DecisionTreeRegressor(max_leaf_nodes=4).fit(X_train, y_train)
y_pred = dtr.predict(X_test)

In [293]:
fi = dtr.feature_importances_*100
attrs = {}
for key, value in zip(columns[1:], fi):
    attrs[key] = value

labels = list(attrs.keys())
values = list(attrs.values())
colors = ['#DC1F2A', '#001427', '#708D81', '#F4D58D', '#02CC6D', 
         '#02C4EF', '#FAFF05', '#CD05FF', '#FF8205']

fig = go.Figure()
fig.add_trace(
    go.Pie(labels=labels, values=values)
)

fig.update_traces(hoverinfo='label+percent', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#282828', width=2)))
fig.update_layout(
    template='plotly_dark',
    title="Uso dos atributos pela regressão da árvore de decisão",
    font=dict(size=20)
)
fig.show()

In [343]:
fig = go.Figure()

yesy = dfy['ttvil'].values
yesx = dfy['strafrac'].values

noy = dfn['ttvil'].values
nox = dfn['strafrac'].values

fig.add_trace(
    go.Scatter(x=nox, y=noy, mode='markers', 
               opacity=0.5, marker=dict(color='#79FF05', line=dict(color='#253F54', width=1)),
               name='Subconjunto sem descargas atmosféricas')
)

fig.add_trace(
    go.Scatter(x=yesx, y=yesy, mode='markers', opacity=0.5,
               marker=dict(color='#DC1F2A', line=dict(color='#253F54', width=1)),
               name='Subconjunto com descargas atmosféricas')
)
fig.update_yaxes(
    title_text='Total do índice VIL na família (kg m<sup>-2</sup>)'
)
fig.update_xaxes(
    title_text='Fração da família sobre o rio (%)'
)

fig.update_layout(
    template='plotly_dark',
    title='Disperção dos índices VIL e fração fração estratiforme dos subconjuntos com e sem raios'
)
fig.show()

In [337]:
# columns = [
#     'ttyyyxx3', 'riverfrac', 'convfrac', 'strafrac', 
#     'meanz', 'maxz', 'meanvil', 'ttvil', 'meanprec', 'maxprec',
# ]
columns = [
    'ttyyyxx3', 'maxz', 'ttvil'
]
threshod = 1
samples = 1
np.random.seed(42)

dfy = df[df['ttyyyxx3'] >= threshod]
dfn = df[df['ttyyyxx3'] < threshod]
dfy['label'] = 1
dfn['label'] = 0

samples *=  100
samples = int((dfy[['ttyyyxx3']].count()*samples)/100)

dfy = dfy.sample(samples)
dfn = dfn.sample(samples)

df_ = pd.concat([dfy, dfn], sort=False).sample(samples*2)
X_train, X_test, y_train, y_test = train_test_split(df_[columns[1:]], \
                                                    df_[['label']], \
                                                    test_size=.2)
dtc = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = dtc.predict(X_test)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [338]:
fi = dtc.feature_importances_*100
attrs = {}
for key, value in zip(columns[1:], fi):
    attrs[key] = value

labels = list(attrs.keys())
values = list(attrs.values())
colors = ['#DC1F2A', '#001427', '#708D81', '#F4D58D', '#02CC6D', 
         '#02C4EF', '#FAFF05', '#CD05FF', '#FF8205']

fig = go.Figure()
fig.add_trace(
    go.Pie(labels=labels, values=values)
)

fig.update_traces(hoverinfo='label+percent', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#282828', width=2)))
fig.update_layout(
    template='plotly_dark',
    title="Uso dos atributos pela regressão da árvore de decisão",
    font=dict(size=20)
)
fig.show()

In [342]:
fig = go.Figure()

yesy = dfy['ttvil'].values
yesx = dfy['strafrac'].values

noy = dfn['ttvil'].values
nox = dfn['strafrac'].values

fig.add_trace(
    go.Scatter(x=nox, y=noy, mode='markers', 
               opacity=0.5, marker=dict(color='#79FF05', line=dict(color='#253F54', width=1)),
               name='Subconjunto sem descargas atmosféricas')
)

fig.add_trace(
    go.Scatter(x=yesx, y=yesy, mode='markers', opacity=0.5,
               marker=dict(color='#DC1F2A', line=dict(color='#253F54', width=1)),
               name='Subconjunto com descargas atmosféricas')
)
fig.update_yaxes(
    title_text='Total do índice VIL na família (kg m<sup>-2</sup>)'
)
fig.update_xaxes(
    title_text='Fração da família sobre o rio (%)'
)
x = np.arange(100)+1


fig.update_layout(
    template='plotly_dark',
    title='Disperção dos índices VIL e fração fração estratiforme dos subconjuntos com e sem raios'
)
fig.show()