In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from pandas_profiling import ProfileReport
from sklearn.model_selection import KFold

import scipy.stats

In [2]:
# Base directory for every dataset this notebook reads or writes.
DATA_PATH = '../data/'

# Read the raw public training set and preview its first rows.
train_csv = DATA_PATH + 'raw/public-train.csv'
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,n,p,f,x,y,z,a1,a2,a3,a4,...,g1,g2,l1,l2,l3,l4,e1,e2,cent_price_cor,cent_trans_cor
0,709,0.7,0.2,16,12,7,3.8,0.24,2.3,0.28,...,0.186,0.5,0.147,1.5,0.089,1.6,1.5,2.6,-0.169,0.375
1,921,0.6,0.3,19,17,10,3.0,0.12,2.4,0.19,...,0.079,1.2,0.186,1.6,0.076,1.8,0.6,0.4,-0.075,0.234
2,177,0.8,0.4,14,12,5,3.2,0.17,1.8,0.18,...,0.036,1.4,0.048,0.7,0.073,0.6,2.7,0.3,-0.177,0.389
3,415,0.7,0.5,24,11,2,1.3,0.17,1.5,0.18,...,0.063,1.1,0.151,0.8,0.022,1.1,0.5,0.1,-0.102,0.358
4,802,0.8,0.4,21,10,3,4.4,0.15,2.6,0.13,...,0.044,1.9,0.123,1.9,0.046,1.1,2.0,0.6,-0.034,0.18


In [3]:
def get_percentile(mean_series, std_series, percentile):
    """Return, row by row, the given percentile of a normal distribution.

    Parameters
    ----------
    mean_series : pandas.Series
        Mean of the normal distribution for each row (used as ``loc``).
    std_series : pandas.Series
        Standard deviation for each row (used as ``scale``).
    percentile : float
        Probability handed to the normal inverse CDF (e.g. ``0.05``).

    Returns
    -------
    pandas.Series
        Float series with one percentile value per row, indexed like the
        index-aligned combination of the two inputs.
    """
    # Building a frame first keeps the original behaviour of aligning the two
    # inputs on their index before combining them.
    params = pd.DataFrame({'mean': mean_series, 'std': std_series})
    # norm.ppf broadcasts over arrays, so a single call replaces the row-wise
    # apply and avoids positional x[0]/x[1] lookups on a label-indexed row
    # (deprecated in recent pandas).
    values = scipy.stats.norm.ppf(
        percentile,
        loc=params['mean'].to_numpy(dtype=float),
        scale=params['std'].to_numpy(dtype=float),
    )
    return pd.Series(values, index=params.index, dtype=float)

In [4]:
def gerar_features(df):
    """Add the engineered feature columns to ``df`` in place.

    New columns, in insertion order:

    * ``espaco_total`` (``x*y*z``) and ``espaco_lateral`` (``x*y``);
    * ``n_traders`` (``n*p``) and ``n_traders_int`` (``n_traders*f``);
    * each of ``n``, ``n_traders`` and ``n_traders_int`` divided by each of
      the two space columns (``<prefix>_por_esp_<total|lateral>``);
    * for every (mean, std) column pair listed below, the 5th, 25th, 75th and
      95th percentiles returned by ``get_percentile``, stored in a column
      named ``<mean><std><percentile>`` (e.g. ``a1a20.05``).

    Progress is printed once per generated percentile column. Nothing is
    returned; the caller's frame is mutated.
    """
    # Spatial features
    df['espaco_total'] = df['x'] * df['y'] * df['z']
    df['espaco_lateral'] = df['x'] * df['y']

    # Count features
    df['n_traders'] = df['n'] * df['p']
    df['n_traders_int'] = df['n_traders'] * df['f']

    # Combinations: every count column divided by every space column.
    contagens = (
        ('agentes', 'n'),
        ('traders', 'n_traders'),
        ('traders_int', 'n_traders_int'),
    )
    espacos = (('total', 'espaco_total'), ('lateral', 'espaco_lateral'))
    for prefixo, col_contagem in contagens:
        for sufixo, col_espaco in espacos:
            df[prefixo + '_por_esp_' + sufixo] = df[col_contagem] / df[col_espaco]

    # Percentile features from (mean, std) column pairs.
    # NOTE(review): the data preview also shows c3/c4 columns, which are not
    # listed here - confirm whether leaving the c* pairs out is intentional.
    pares_media_dp = (
        ('a1', 'a2'), ('a3', 'a4'),
        ('b1', 'b2'), ('b3', 'b4'),
        ('g1', 'g2'),
        ('l1', 'l2'), ('l3', 'l4'),
    )
    percentis = (0.05, 0.25, 0.75, 0.95)

    for col_media, col_dp in pares_media_dp:
        for percentil in percentis:
            print(col_media, col_dp, percentil)
            coluna = col_media + col_dp + str(percentil)
            df[coluna] = get_percentile(df[col_media], df[col_dp], percentil)

In [5]:
# Add the engineered feature columns to the training frame (mutated in place).
gerar_features(df);

a1 a2 0.05
a1 a2 0.25
a1 a2 0.75
a1 a2 0.95
a3 a4 0.05
a3 a4 0.25
a3 a4 0.75
a3 a4 0.95
b1 b2 0.05
b1 b2 0.25
b1 b2 0.75
b1 b2 0.95
b3 b4 0.05
b3 b4 0.25
b3 b4 0.75
b3 b4 0.95
g1 g2 0.05
g1 g2 0.25
g1 g2 0.75
g1 g2 0.95
l1 l2 0.05
l1 l2 0.25
l1 l2 0.75
l1 l2 0.95
l3 l4 0.05
l3 l4 0.25
l3 l4 0.75
l3 l4 0.95


In [7]:
# Generate the folds: every row is assigned to exactly one of 5 validation
# folds (shuffled with a fixed seed so the split is reproducible).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# kf.split yields *positional* row indices, so the fold ids are collected in a
# positional array instead of being written through label-based .loc (which is
# only correct when df has a default RangeIndex). Filling an integer array also
# keeps the column integer-typed rather than float (0.0, 1.0, ...).
fold_ids = np.full(len(df), -1, dtype='int64')
for fold, (_, valid_idx) in enumerate(kf.split(df)):
    fold_ids[valid_idx] = fold
df['kfold'] = fold_ids

# Sanity check: number of rows per fold.
df.groupby('kfold')['n'].count()

kfold
0.0    2388
1.0    2388
2.0    2388
3.0    2388
4.0    2388
Name: n, dtype: int64

In [8]:
# Write the training frame (engineered features plus the kfold column) to the
# processed folder, without the index column.
df.to_csv(DATA_PATH + 'processed/train_folds.csv', index=False)

## Dados de teste para submissão

In [10]:
# Read the raw public test set, discard its 'id' column and preview the rows.
df_test = pd.read_csv(DATA_PATH + 'raw/public-test.csv')
df_test = df_test.drop(columns='id')
df_test.head()

Unnamed: 0,n,p,f,x,y,z,a1,a2,a3,a4,...,c3,c4,g1,g2,l1,l2,l3,l4,e1,e2
0,558,0.5,0.2,28,21,2,1.0,0.29,1.3,0.21,...,3.1,0.27,0.198,1.8,0.149,2.0,0.078,1.7,0.5,2.9
1,910,0.9,0.3,28,19,9,2.2,0.14,1.6,0.14,...,2.2,0.16,0.172,1.0,0.086,0.8,0.03,0.8,0.2,1.6
2,213,0.8,0.5,21,15,8,2.2,0.22,2.6,0.15,...,1.6,0.2,0.083,0.7,0.019,1.4,0.016,1.2,2.6,2.4
3,654,0.7,0.5,14,15,2,2.2,0.12,3.8,0.27,...,4.1,0.11,0.185,1.3,0.11,0.8,0.006,1.7,0.2,1.7
4,672,0.7,0.5,24,10,5,3.7,0.2,3.7,0.18,...,2.6,0.27,0.158,0.9,0.148,1.9,0.038,1.3,1.1,2.8


In [11]:
# Add the same engineered feature columns to the test frame (mutated in place).
gerar_features(df_test)

a1 a2 0.05
a1 a2 0.25
a1 a2 0.75
a1 a2 0.95
a3 a4 0.05
a3 a4 0.25
a3 a4 0.75
a3 a4 0.95
b1 b2 0.05
b1 b2 0.25
b1 b2 0.75
b1 b2 0.95
b3 b4 0.05
b3 b4 0.25
b3 b4 0.75
b3 b4 0.95
g1 g2 0.05
g1 g2 0.25
g1 g2 0.75
g1 g2 0.95
l1 l2 0.05
l1 l2 0.25
l1 l2 0.75
l1 l2 0.95
l3 l4 0.05
l3 l4 0.25
l3 l4 0.75
l3 l4 0.95


In [16]:
# Preview the test frame after feature generation.
df_test.head()

Unnamed: 0,n,p,f,x,y,z,a1,a2,a3,a4,...,g1g20.75,g1g20.95,l1l20.05,l1l20.25,l1l20.75,l1l20.95,l3l40.05,l3l40.25,l3l40.75,l3l40.95
0,558,0.5,0.2,28,21,2,1.0,0.29,1.3,0.21,...,1.412082,3.158737,-3.140707,-1.19998,1.49798,3.438707,-2.718251,-1.068633,1.224633,2.874251
1,910,0.9,0.3,28,19,9,2.2,0.14,1.6,0.14,...,0.84649,1.816854,-1.229883,-0.453592,0.625592,1.401883,-1.285883,-0.509592,0.569592,1.345883
2,213,0.8,0.5,21,15,8,2.2,0.22,2.6,0.15,...,0.555143,1.234398,-2.283795,-0.925286,0.963286,2.321795,-1.957824,-0.793388,0.825388,1.989824
3,654,0.7,0.5,14,15,2,2.2,0.12,3.8,0.27,...,1.061837,2.32331,-1.205883,-0.429592,0.649592,1.425883,-2.790251,-1.140633,1.152633,2.802251
4,672,0.7,0.5,24,10,5,3.7,0.2,3.7,0.18,...,0.765041,1.638368,-2.977222,-1.133531,1.429531,3.273222,-2.10031,-0.838837,0.914837,2.17631


In [18]:
# Write the test frame with its engineered features to the processed folder,
# without the index column.
df_test.to_csv(DATA_PATH + 'processed/test_input.csv', index=False)