In [252]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LassoCV
import xgboost as xgb
import lightgbm as lgb
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, BatchNormalization, Activation
from keras import backend as K
from sklearn.ensemble import RandomForestRegressor

In [257]:
data = pd.read_pickle('data/model/model3_w5.pkl')
target = pd.read_csv('data/target_competencia_ids.csv')
target_obj = pd.read_csv('data/target_competencia_ids.csv')

In [258]:
target['ref_hash'] = target['ref_hash'].apply(lambda x: int(str(x)[:-3]))
target.drop_duplicates('ref_hash', inplace=True)

In [259]:
features = {'n_auctions':0, 'diff_auctions': 'mean', 'mean_time_auction': 'mean', 'first_auction_sec': 'mean',
            'last_auction_sec': 'mean', 'ref_type_id_1': 0, 'ref_type_id_7': 0, 'source_id_0': 0, 'source_id_1': 0, 'source_id_2':0,
            'source_id_3':0, 'source_id_4':0, 'source_id_5':0, 'source_id_6':0,
            'source_id_7':0, 'source_id_8':0, 'source_id_9':0, 'mean_day': 'mean', 'day_0': 0,'day_1': 0, 'day_2': 0,
            'n_events': 0, 'mean_time_events': 'mean', 'wifi_events_mean': 'mean', 'diff_events': 0, 'first_event_sec': 0,
           'last_event_sec': 0, 'attributed_events_mean': 0, 'touchX_mean': 'mean', 'touchY_mean': 'mean', 'timeToClick_mean': 'mean',
           'latitude_mean': 'mean', 'longitude_mean': 'mean', 'n_clicks': 0, 'mean_time_install': 0, 'timeToClick_mean': 'mean',
           'first_install_sec': 0, 'last_install_sec': 0, 'n_installs': 0, 'mean_time_install': 'mean', 'attributed_installs_mean': 'mean',
           'wifi_installs_mean': 'mean', 'diff_installs': 'mean', 'mean_time_install': 'mean', 'wifi_connection': 'mean', 'first_click_sec': 'mean',
           'last_click_sec': 'mean', 'diff_clicks': 'mean', 'mean_time_click': 'mean'}

for feature, method in features.items():
    if feature not in data:
        continue
    if method == 0:
        data[feature] = data[feature].fillna(0)
    elif method == 'mean':
        data[feature] = data[feature].fillna(data[feature].mean())
    elif method == 'max':
        data[feature] = data[feature].fillna(data[feature].max())
    elif method == 'min':
        data[feature] = data[feature].fillna(data[feature].min())
    elif method == -1:
        data[feature] = data[feature].fillna(-1)

In [260]:
pd.merge(data, target, on='ref_hash', how='inner').shape

(4037, 48)

In [261]:
target.head(10)

Unnamed: 0,ref_hash,obj
0,1000169251625791246,0
2,1000395625957344683,0
4,1003027494996471685,0
6,1006670001679961544,0
8,1007573308966476713,0
10,1010070503877148763,0
12,1010265377387765028,0
14,1010531372912327058,0
16,1011610998357271358,0
18,1013543838965040946,0


In [177]:
data[data['ref_hash'] == 1013543838965040946]

Unnamed: 0,ref_hash,n_auctions,diff_auctions,mean_time_auction,first_auction_sec,last_auction_sec,ref_type_id_1,ref_type_id_7,source_id_0,source_id_1,...,wifi_events_mean,diff_events,mean_time_events,first_install_sec,last_install_sec,n_installs,attributed_installs_mean,wifi_installs_mean,diff_installs,mean_time_install


In [247]:
data

Unnamed: 0,ref_hash,n_auctions,diff_auctions,mean_time_auction,first_auction_sec,last_auction_sec,ref_type_id_1,ref_type_id_7,source_id_0,source_id_1,...,wifi_events_mean,diff_events,mean_time_events,first_install_sec,last_install_sec,n_installs,attributed_installs_mean,wifi_installs_mean,diff_installs,mean_time_install
0,1384623003476985856,1.0,0.000000,0.000000,258749.135354,258749.135354,1.0,0.0,0.0,0.0,...,0.332447,0.000,10176.207236,0.000,0.000,0.0,0.000731,0.803085,3508.214071,2867.662492
1,3714738743084512256,303.0,257703.955464,853.324356,1081.086166,258785.041630,1.0,0.0,0.0,0.0,...,0.332447,0.000,10176.207236,0.000,0.000,0.0,0.000731,0.803085,3508.214071,2867.662492
2,5697386557321863168,6.0,72611.156437,14522.231287,186163.557924,258774.714361,1.0,0.0,0.0,0.0,...,0.332447,0.000,10176.207236,0.000,0.000,0.0,0.000731,0.803085,3508.214071,2867.662492
3,5583037045722621952,35.0,236390.702008,6952.667706,22403.027827,258793.729835,1.0,0.0,0.0,0.0,...,0.332447,0.000,10176.207236,0.000,0.000,0.0,0.000731,0.803085,3508.214071,2867.662492
4,6383034009915294720,68.0,208095.721478,3105.906291,50732.855637,258828.577115,1.0,0.0,0.0,0.0,...,0.332447,0.000,10176.207236,0.000,0.000,0.0,0.000731,0.803085,3508.214071,2867.662492
5,2939733907799209984,49.0,251049.453170,5230.196941,7856.244607,258905.697777,1.0,0.0,0.0,0.0,...,0.332447,0.000,10176.207236,0.000,0.000,0.0,0.000731,0.803085,3508.214071,2867.662492
6,2476206265570189824,494.0,259095.099817,525.547870,24.269497,259119.369314,1.0,0.0,0.0,0.0,...,0.332447,0.000,10176.207236,0.000,0.000,0.0,0.000731,0.803085,3508.214071,2867.662492
7,7451197314471370752,483.0,204029.516448,423.297752,55133.668221,259163.184669,1.0,0.0,0.0,0.0,...,0.332447,0.000,10176.207236,0.000,0.000,0.0,0.000731,0.803085,3508.214071,2867.662492
8,4216292216842097152,8.0,248100.122366,35442.874624,10923.727886,259023.850252,1.0,0.0,0.0,0.0,...,1.000000,0.000,0.000000,0.000,0.000,0.0,0.000731,0.803085,3508.214071,2867.662492
9,7489206183522373632,34.0,234226.097022,7097.760516,24962.274311,259188.371333,1.0,0.0,0.0,0.0,...,0.332447,0.000,10176.207236,0.000,0.000,0.0,0.000731,0.803085,3508.214071,2867.662492


In [153]:
data.shape

(580977, 47)

In [154]:
target.shape

(4037, 2)

In [127]:
data = pd.merge(data, target, how='inner', on='ref_hash')

In [130]:
predictions_auctions = data[['ref_hash']]
predictions_installs = data[['ref_hash']]
predictions_auctions['ref_hash'] = predictions_auctions['ref_hash'].apply(lambda x: str(int(x)) + '_st')
predictions_installs['ref_hash'] = predictions_installs['ref_hash'].apply(lambda x: str(int(x)) + '_sc')
data.drop(columns=['ref_hash'], inplace=True)
model_auctions = lgb.Booster(model_file='modelos/lightgbm_auctions')
model_installs = lgb.Booster(model_file='modelos/lightgbm_installs')
predictions_auctions['obj'] = model_auctions.predict(data)
predictions_installs['obj'] = model_installs.predict(data)
predictions = pd.concat([predictions_auctions, predictions_installs])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [133]:
pd.merge(predictions, target_obj, on='ref_hash', how='inner')

Unnamed: 0,ref_hash,obj_x,obj_y
0,8027109759910869730_st,104556.496909,0
1,3805512975348983658_st,50106.689577,0
2,706875581985023190_st,45415.035473,0
3,9201763056911976665_st,24985.632252,0
4,2070001883938629880_st,35616.352910,0
5,2956299000597738624_st,24985.632252,0
6,5051062186658844309_st,37017.731235,0
7,3729857814892336524_st,34078.667389,0
8,8048087799114816623_st,103292.039148,0
9,7988921706433140919_st,37272.118432,0
