In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import urllib.request
import matplotlib.pyplot as plt
from scipy.stats import zscore

from sklearn.preprocessing import StandardScaler

In [2]:
#list_metricas = ['average_clustering', 'coefficient_distribution_degree', 'density', 'average_degree', 'average_short_path']#, 'number_of_nodes']
list_metricas = ['average_clustering', 'coefficient_distribution_degree', 'density', 'average_degree']

list_variables_a = [
    'Patrimonio_Liquido',
    'Cotas_Emitidas',
    'Valor_Patrimonial_Cotas',
    'Percentual_Rentabilidade_Efetiva_Mes',
    'Percentual_Rentabilidade_Patrimonial_Mes',
    'Percentual_Dividend_Yield_Mes',
    'Percentual_Amortizacao_Cotas_Mes',
]


list_variables_b = [
    'Valor_Ativo',
    'Percentual_Despesas_Taxa_Administracao',
    'Percentual_Despesas_Agente_Custodiante',
    'Total_Necessidades_Liquidez',
    'Disponibilidades',
    'Titulos_Publicos',
    'Titulos_Privados',
    'Fundos_Renda_Fixa',
    'Total_Investido',
    'Direitos_Bens_Imoveis',
    'Terrenos',
    'Imoveis_Renda_Acabados',
    'Imoveis_Renda_Construcao',
    'Imoveis_Venda_Acabados',
    'Imoveis_Venda_Construcao',
    'Outros_Direitos_Reais',
    'Acoes',
    'Debentures',
    'Bonus_Subscricao',
    'Certificados_Deposito_Valores_Mobiliarios',
    'Cedulas_Debentures',
    'Fundo_Acoes',
    'FIP',
    'FII',
    'FDIC',
    'Outras_Cotas_FI',
    'Notas_Promissorias',
    'Acoes_Sociedades_Atividades_FII',
    'Cotas_Sociedades_Atividades_FII',
    'CEPAC',
    'CRI',
    'Letras_Hipotecarias',
    'LCI',
    'LIG',
    'Outros_Valores_Mobliarios',
    'Valores_Receber',
    'Contas_Receber_Aluguel',
    'Contas_Receber_Venda_Imoveis',
    'Outros_Valores_Receber',
    'Rendimentos_Distribuir',
    'Taxa_Administracao_Pagar',
    'Taxa_Performance_Pagar',
    'Obrigacoes_Aquisicao_Imoveis',
    'Adiantamento_Venda_Imoveis',
    'Adiantamento_Alugueis',
    'Obrigacoes_Securitizacao_Recebiveis',
    'Instrumentos_Financeiros_Derivativos',
    'Provisoes_Contigencias',
    'Outros_Valores_Pagar',
    'Total_Passivo',
 ]

list_variables = list_variables_a + list_variables_b

list_cat = [
    'Mandato',
    'Segmento_Atuacao',
    #'Publico_Alvo',
    #'Tipo_Gestao',
]

ds_fii = xr.open_dataset('dataset/cvm/dataset_cvm_metrica_mensal.nc').fillna(0)
df_dados_fii = pd.read_csv('dataset/cvm/dataset_cvm_metrica_mensal_dados.csv', index_col='CNPJ_Fundo')

df_fii = ds_fii.to_array().stack(var=['variable', 'metrica']).to_pandas()

df_fii['TIR'] = df_dados_fii['TIR'].fillna(-1)
df_fii_cat =  df_dados_fii[list_cat]


cnpj_meses = xr.open_dataset('dataset/cvm/dataset_cvm_metrica_mensal.nc').fillna(0).Acoes.sel(metrica='number_of_nodes').to_pandas()

In [3]:
def dataset_filter(df, lista_variaveis, lista_metricas, lista_cnpj):
    cols = [ (v, m) for v in lista_variaveis for m in lista_metricas ] + [ ('TIR', '') ]
    df_out = df[df.index.isin(lista_cnpj)][ cols ]
    return df_out.drop(columns=[ ('TIR', '') ]), df_out[ [ ('TIR', '') ] ]

In [4]:
def select_cols(numbers):
    #list_cat = []
    grupos_fiis = df_dados_fii[list_cat].dropna().agg('-'.join, axis=1)

    set_cols = set()

    for grupo in np.unique(grupos_fiis.values):
        cnpj_grupo = grupos_fiis[grupos_fiis==grupo].index.values


        df_dados = df_fii[df_fii.index.isin(cnpj_grupo)]
        corr = df_dados.corr()
        cols = corr['TIR'][corr['TIR'].abs() >= 0.8].abs().sort_values(ascending=False).index.values[1:(numbers+1)]

        for t in cols:
            set_cols.add(t)

    set_cols.discard(('TIR', ''))
    return list(set_cols)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

preprocessing = OneHotEncoder().fit_transform(df_fii_cat[df_fii_cat.index.isin(df_fii.index.values)]).toarray()

score = {}
score_cat = {}

last_num_cols = 0
cols_corr = select_cols(5)

for k in range(1):
    target = df_fii[('TIR','')]
    std_target = target.fillna(-1.).values.flatten()

    #models = [ RandomForestRegressor(n_jobs=-1, oob_score=True, max_leaf_nodes=2, max_depth=5) ]
    models = [ RandomForestRegressor(n_jobs=-1) ]
    #models = [ LinearRegression(), ARDRegression(), RidgeCV(), SVR(C=10.0) ]
    #models = [ MLPRegressor(max_iter=5000) ]
    clf = []
    
    for j in range(len(cols_corr)):
        features = df_fii[ cols_corr ].values[:, :j+1]
        features_cat = np.hstack((features, preprocessing))
        print(f'---{j+1}: {cols_corr[j]}---')
        
        for i, model in enumerate(models):
            #clf.append(make_pipeline(StandardScaler(), model))
            clf.append(make_pipeline( model))
            score_train = []
            score_test = []
            score[clf[i][0]] = score.get(clf[i][0], {})

            for _ in range(10):
                X_train, X_test, y_train, y_test = train_test_split(features, std_target, test_size=1/3)

                clf[i].fit(X_train, y_train)

                score_train_i = mean_absolute_error(y_train, clf[i].predict(X_train))
                score_test_i = mean_absolute_error(y_test, clf[i].predict(X_test))

                score_train.append(score_train_i)
                score_test.append(score_test_i)
            
            print(f'{np.mean(score_train):.3f}, {np.mean(score_test):.3f}')

            #score[clf[i].steps[1][0]][features.shape[1]] = score[clf[i].steps[1][0]].get(features.shape[1], {'train': score_train, 'test': score_test})
            score[clf[i][0]][j] = score[clf[i][0]].get(j, {'train': score_train, 'test': score_test})



            score_train = []
            score_test = []
            score_cat[clf[i][0]] = score_cat.get(clf[i][0], {})

            for _ in range(10):
                X_train, X_test, y_train, y_test = train_test_split(features_cat, std_target, test_size=0.33)

                clf[i].fit(X_train, y_train)

                score_train_i = mean_absolute_error(y_train, clf[i].predict(X_train))
                score_test_i = mean_absolute_error(y_test, clf[i].predict(X_test))

                score_train.append(score_train_i)
                score_test.append(score_test_i)
            
            print(f'{np.mean(score_train):.3f}, {np.mean(score_test):.3f}')

            #score_cat[clf[i].steps[1][0]][features_cat.shape[1]] = score_cat[clf[i].steps[1][0]].get(features_cat.shape[1], {'train': score_train, 'test': score_test})
            score_cat[clf[i][0]][j] = score_cat[clf[i][0]].get(j, {'train': score_train, 'test': score_test})
    
    last_num_cols = len(cols_corr)

---1: ('Outros_Valores_Pagar', 'number_of_edges')---
3838.847, 4166.151
1287.915, 5243.553
---2: ('Fundos_Renda_Fixa', 'average_degree')---
2304.279, 4727.704
1453.521, 7498.078
---3: ('Quantidade_Cotas_Emitidas', 'density')---
1558.264, 3812.406
1854.148, 3923.974
---4: ('Taxa_Administracao_Pagar', 'density')---
1405.221, 4171.369
1529.184, 5601.093
---5: ('Numero_Cotistas_Pessoa_Fisica', 'number_of_edges')---
1678.902, 3830.096
1908.883, 5687.930
---6: ('Numero_Cotistas_Pessoa_Juridica_Nao_Financeira', 'average_short_path')---
1242.072, 5128.920
2353.427, 2822.762
---7: ('Numero_Cotistas_FII', 'density')---
2192.771, 3107.878
1892.473, 4126.456
---8: ('Percentual_Despesas_Taxa_Administracao', 'average_clustering')---
2323.171, 3555.588
1908.452, 4911.644
---9: ('Fundos_Renda_Fixa', 'average_clustering')---
1721.886, 4178.301
2117.962, 4581.365
---10: ('Percentual_Despesas_Taxa_Administracao', 'number_of_edges')---
2048.013, 3656.620
1791.665, 4718.728
---11: ('Titulos_Publicos', 'ave

In [None]:
n = len(score)

fig, ax = plt.subplots(n, 2, figsize=(15, n*5))

for i in range(n):
    m = list(score.keys())[i]
    x_features = []
    y_train_mean = []
    y_train_std = []
    y_test_mean = []
    y_test_std = []

    for k in score[m]:
        x_features.append(k)
        y_train_mean.append(np.mean(score[m][k]['train']))
        y_train_std.append(np.std(score[m][k]['train']))
        y_test_mean.append(np.mean(score[m][k]['test']))
        y_test_std.append(np.std(score[m][k]['test']))

    x_features = np.asarray(x_features)
    y_train_mean = np.asarray(y_train_mean)
    y_train_std = np.asarray(y_train_std)
    y_test_mean = np.asarray(y_test_mean)
    y_test_std = np.asarray(y_test_std)

    if n == 1:
        ax[0].plot(x_features, y_test_mean, label='Teste', color='tab:blue')
        ax[0].fill_between(x_features, y_test_mean-y_test_std, y_test_mean+y_test_std, alpha=0.2, color='tab:blue')

        ax[0].plot(x_features, y_train_mean, label='Treino', color='tab:orange')
        ax[0].fill_between(x_features, y_train_mean-y_train_std, y_train_mean+y_train_std, alpha=0.2, color='tab:orange')

        ax[0].set_xlabel('Quantidade de variáveis')
        ax[0].set_ylabel('R^2')
        #ax[0].set_yscale('log')
        ax[0].set_ylim((-1, 1))
        ax[0].set_title('Sem Categorias')
        ax[0].legend(loc='best')
    
    else:
        ax[i][0].plot(x_features, y_test_mean, label='Teste', color='tab:blue')
        ax[i][0].fill_between(x_features, y_test_mean-y_test_std, y_test_mean+y_test_std, alpha=0.2, color='tab:blue')

        ax[i][0].plot(x_features, y_train_mean, label='Treino', color='tab:orange')
        ax[i][0].fill_between(x_features, y_train_mean-y_train_std, y_train_mean+y_train_std, alpha=0.2, color='tab:orange')

        ax[i][0].set_xlabel('Quantidade de variáveis')
        ax[i][0].set_ylabel('R^2')
        #ax[i][0].set_yscale('log')
        ax[i][0].set_ylim((-1, 1))
        ax[i][0].set_title('Sem Categorias')
        ax[i][0].legend(loc='best')


for i in range(n):
    m = list(score_cat.keys())[i]
    x_features = []
    y_train_mean = []
    y_train_std = []
    y_test_mean = []
    y_test_std = []

    for k in score_cat[m]:
        x_features.append(k)
        y_train_mean.append(np.mean(score_cat[m][k]['train']))
        y_train_std.append(np.std(score_cat[m][k]['train']))
        y_test_mean.append(np.mean(score_cat[m][k]['test']))
        y_test_std.append(np.std(score_cat[m][k]['test']))

    x_features = np.asarray(x_features)
    y_train_mean = np.asarray(y_train_mean)
    y_train_std = np.asarray(y_train_std)
    y_test_mean = np.asarray(y_test_mean)
    y_test_std = np.asarray(y_test_std)

    if n == 1:
        ax[1].plot(x_features, y_test_mean, label='Teste', color='tab:blue')
        ax[1].fill_between(x_features, y_test_mean-y_test_std, y_test_mean+y_test_std, alpha=0.2, color='tab:blue')

        ax[1].plot(x_features, y_train_mean, label='Treino', color='tab:orange')
        ax[1].fill_between(x_features, y_train_mean-y_train_std, y_train_mean+y_train_std, alpha=0.2, color='tab:orange')

        ax[1].set_xlabel('Quantidade de variáveis')
        ax[1].set_ylabel('R^2')
        #ax[1].set_yscale('log')
        ax[1].set_ylim((-1, 1))
        ax[1].set_title(f'Com Categorias')
        ax[1].legend(loc='best')

    else:
        ax[i][1].plot(x_features, y_test_mean, label='Teste', color='tab:blue')
        ax[i][1].fill_between(x_features, y_test_mean-y_test_std, y_test_mean+y_test_std, alpha=0.2, color='tab:blue')

        ax[i][1].plot(x_features, y_train_mean, label='Treino', color='tab:orange')
        ax[i][1].fill_between(x_features, y_train_mean-y_train_std, y_train_mean+y_train_std, alpha=0.2, color='tab:orange')

        ax[i][1].set_xlabel('Quantidade de variáveis')
        ax[i][1].set_ylabel('R^2')
        #ax[i][1].set_yscale('log')
        ax[i][1].set_ylim((-1, 1))
        ax[i][1].set_title(f'Com Categorias')
        ax[i][1].legend(loc='best')


plt.tight_layout()
plt.show()