In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Read the public training set from its relative path under data/raw.
public_train_df = pd.read_csv(filepath_or_buffer="../data/raw/public-train.csv")
# Show the first five rows as a quick check of the parsed columns.
public_train_df.head(n=5)

Unnamed: 0,n,p,f,x,y,z,a1,a2,a3,a4,...,g1,g2,l1,l2,l3,l4,e1,e2,cent_price_cor,cent_trans_cor
0,709,0.7,0.2,16,12,7,3.8,0.24,2.3,0.28,...,0.186,0.5,0.147,1.5,0.089,1.6,1.5,2.6,-0.169,0.375
1,921,0.6,0.3,19,17,10,3.0,0.12,2.4,0.19,...,0.079,1.2,0.186,1.6,0.076,1.8,0.6,0.4,-0.075,0.234
2,177,0.8,0.4,14,12,5,3.2,0.17,1.8,0.18,...,0.036,1.4,0.048,0.7,0.073,0.6,2.7,0.3,-0.177,0.389
3,415,0.7,0.5,24,11,2,1.3,0.17,1.5,0.18,...,0.063,1.1,0.151,0.8,0.022,1.1,0.5,0.1,-0.102,0.358
4,802,0.8,0.4,21,10,3,4.4,0.15,2.6,0.13,...,0.044,1.9,0.123,1.9,0.046,1.1,2.0,0.6,-0.034,0.18


In [4]:
# Names of the input (feature) columns, grouped by prefix for readability.
X_colums = [
    'n', 'p', 'f', 'x', 'y', 'z',
    'a1', 'a2', 'a3', 'a4',
    'b1', 'b2', 'b3', 'b4',
    'c1', 'c2', 'c3', 'c4',
    'g1', 'g2',
    'l1', 'l2', 'l3', 'l4',
    'e1', 'e2',
]
# Names of the two target columns.
Y_columns = ["cent_price_cor", "cent_trans_cor"]

# Plain NumPy views of the selected columns: features in X, targets in Y.
X = public_train_df[X_colums].values
Y = public_train_df[Y_columns].values

In [5]:
# Hold out part of the rows for evaluation (train_test_split's default test
# fraction is used). The fixed random_state makes the partition, and every
# metric computed from it, identical across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

In [20]:
# Lookup table: feature name -> position of that feature inside a row of X.
columns_index_map = dict()

for i, name in enumerate(X_colums):
    columns_index_map[name] = i


def gen_dataset(X, Y, index_map, n_samples, rng=None):
    """Inflate a dataset by sampling its distribution-valued features.

    Nine features of every input row are stored as a (mean, standard
    deviation) pair: a1/a2, a3/a4, b1/b2, b3/b4, c1/c2, c3/c4, g1/g2, l1/l2
    and l3/l4. For each input row, ``n_samples`` new rows are generated in
    which every pair is replaced by one draw from the corresponding normal
    distribution, while n, p, f, x, y, z, e1 and e2 are copied unchanged.
    The two targets of the source row are repeated for each generated row.

    Parameters
    ----------
    X : sequence of rows (e.g. a 2-D array)
        Feature rows, indexable by the positions stored in ``index_map``.
    Y : sequence of rows
        Targets aligned with ``X``; ``Y[i][0]`` and ``Y[i][1]`` are read.
    index_map : dict
        Maps a feature name to its position inside a row of ``X``.
    n_samples : int
        Number of rows to generate per input row.
    rng : optional
        Object exposing ``normal(loc, scale, size)``, for example
        ``np.random.default_rng(seed)``. When omitted, the global
        ``np.random`` state is used, exactly as before this parameter
        existed, so ``np.random.seed`` keeps working.

    Returns
    -------
    tuple of np.ndarray
        ``(X_new, y_first, y_second)``. ``X_new`` has shape
        ``(len(X) * n_samples, 17)``, also when no rows are generated;
        the two target arrays are 1-D with ``len(X) * n_samples`` entries.
    """
    rng = np.random if rng is None else rng

    passthrough_head = ("n", "p", "f", "x", "y", "z")
    # (mean, std) column pairs, listed in the order they are sampled so the
    # random stream is consumed in the same sequence as the original code.
    sampled_pairs = (
        ("a1", "a2"), ("a3", "a4"),
        ("b1", "b2"), ("b3", "b4"),
        ("c1", "c2"), ("c3", "c4"),
        ("g1", "g2"),
        ("l1", "l2"), ("l3", "l4"),
    )
    passthrough_tail = ("e1", "e2")

    first_sampled = len(passthrough_head)
    n_features = first_sampled + len(sampled_pairs) + len(passthrough_tail)

    blocks = []
    y_cent_price_cor_novo = []
    y_cent_trans_cor_novo = []

    for i, x in enumerate(X):
        # One (n_samples, n_features) block per source row, filled by column.
        block = np.empty((n_samples, n_features), dtype=float)
        block[:, :first_sampled] = [x[index_map[name]] for name in passthrough_head]
        for col, (mean_name, std_name) in enumerate(sampled_pairs, start=first_sampled):
            block[:, col] = rng.normal(
                x[index_map[mean_name]], x[index_map[std_name]], n_samples
            )
        block[:, -len(passthrough_tail):] = [x[index_map[name]] for name in passthrough_tail]
        blocks.append(block)

        # Every generated row inherits the targets of its source row.
        y_cent_price_cor_novo.extend([Y[i][0]] * n_samples)
        y_cent_trans_cor_novo.extend([Y[i][1]] * n_samples)

    # Keep the result 2-D even when there is nothing to stack.
    X_novo = np.vstack(blocks) if blocks else np.empty((0, n_features))

    return X_novo, np.array(y_cent_price_cor_novo), np.array(y_cent_trans_cor_novo)

In [26]:
# Expand each training row into 10 sampled rows; targets are repeated to match.
X_train_inflated, y_price_train, y_trans_train = gen_dataset(
    X=X_train,
    Y=Y_train,
    index_map=columns_index_map,
    n_samples=10,
)
# Display the three shapes: all must agree on their first dimension.
tuple(arr.shape for arr in (X_train_inflated, y_price_train, y_trans_train))

((89550, 17), (89550,), (89550,))

In [64]:
def predict(X, index_map, n_samples, model, rng=None):
    """Predict one value per input row by averaging over sampled variants.

    For each row of ``X``, ``n_samples`` feature vectors are built with the
    same 17-column layout that ``gen_dataset`` produces: n, p, f, x, y, z are
    copied, each (mean, std) pair a1/a2, a3/a4, b1/b2, b3/b4, c1/c2, c3/c4,
    g1/g2, l1/l2, l3/l4 is replaced by a normal draw, and e1, e2 are copied.
    ``model.predict`` is called once per input row on that batch and the
    mean of its output is stored as the prediction for the row.

    Parameters
    ----------
    X : sequence of rows (e.g. a 2-D array)
        Feature rows, indexable by the positions stored in ``index_map``.
    index_map : dict
        Maps a feature name to its position inside a row of ``X``.
    n_samples : int
        Number of sampled variants per input row; must be at least 1.
    model : object
        Anything exposing ``predict(rows)`` for a 2-D batch of rows.
    rng : optional
        Object exposing ``normal(loc, scale, size)``, for example
        ``np.random.default_rng(seed)``. When omitted, the global
        ``np.random`` state is used, exactly as before this parameter
        existed.

    Returns
    -------
    list
        One averaged prediction per row of ``X``, in input order.

    Raises
    ------
    ValueError
        If ``n_samples`` is smaller than 1 (the model would otherwise be
        handed an empty batch).
    """
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1")

    rng = np.random if rng is None else rng

    passthrough_head = ("n", "p", "f", "x", "y", "z")
    # (mean, std) column pairs, listed in the order they are sampled so the
    # random stream is consumed in the same sequence as the original code.
    sampled_pairs = (
        ("a1", "a2"), ("a3", "a4"),
        ("b1", "b2"), ("b3", "b4"),
        ("c1", "c2"), ("c3", "c4"),
        ("g1", "g2"),
        ("l1", "l2"), ("l3", "l4"),
    )
    passthrough_tail = ("e1", "e2")

    first_sampled = len(passthrough_head)
    n_features = first_sampled + len(sampled_pairs) + len(passthrough_tail)

    predictions = []

    for x in X:
        # One (n_samples, n_features) batch per source row, filled by column.
        X_novo = np.empty((n_samples, n_features), dtype=float)
        X_novo[:, :first_sampled] = [x[index_map[name]] for name in passthrough_head]
        for col, (mean_name, std_name) in enumerate(sampled_pairs, start=first_sampled):
            X_novo[:, col] = rng.normal(
                x[index_map[mean_name]], x[index_map[std_name]], n_samples
            )
        X_novo[:, -len(passthrough_tail):] = [x[index_map[name]] for name in passthrough_tail]

        # Average the model output over the sampled variants of this row.
        y_predict = model.predict(X_novo)
        predictions.append(np.mean(y_predict))

    return predictions