# Trabajo Práctico N° 2
## Objetivo:

Para cada dispositivo presentado por Jampp, determinar el tiempo que transcurrirá hasta que el mismo aparezca nuevamente en una subasta, y el tiempo hasta que el usuario del mismo decida instalar una nueva aplicación.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import auc, accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
import datetime as dt

## Analizo los datos de la primera ventana

In [2]:
# Load the auctions of the first window, reading both id columns as 8-bit integers.
auct = pd.read_csv(
    'data/auctions_ventana1.csv',
    dtype={'ref_type_id': np.int8, 'source_id': np.int8},
)

In [3]:
# Preview the first five rows of the freshly loaded auctions.
auct.head(n=5)

Unnamed: 0,date,device_id,ref_type_id,source_id
0,2019-04-20 23:57:27.912838,1109595589636746168,7,0
1,2019-04-20 23:57:28.381114,5896614299191635403,1,0
2,2019-04-20 23:57:28.515423,4172466725848941608,1,0
3,2019-04-20 23:57:28.700884,2616279795187318849,7,0
4,2019-04-20 23:57:28.868312,8034952072073026056,1,0


In [4]:
# Convert the textual 'date' column into pandas datetimes (format is inferred).
parsed_dates = pd.to_datetime(auct['date'])
auct['date'] = parsed_dates

In [5]:
# Preview the first five rows after the 'date' conversion.
auct.head(n=5)

Unnamed: 0,date,device_id,ref_type_id,source_id
0,2019-04-20 23:57:27.912838,1109595589636746168,7,0
1,2019-04-20 23:57:28.381114,5896614299191635403,1,0
2,2019-04-20 23:57:28.515423,4172466725848941608,1,0
3,2019-04-20 23:57:28.700884,2616279795187318849,7,0
4,2019-04-20 23:57:28.868312,8034952072073026056,1,0


## Veo cuántas veces aparece cada dispositivo en una subasta

In [6]:
# Number of auction rows per device; rows whose 'date' is missing are not counted.
# The result has two columns: 'device_id' and 'count_subs'.
dfApT = (
    auct.groupby('device_id')['date']
    .count()
    .rename('count_subs')
    .reset_index()
)

In [7]:
# Preview the first five per-device auction counts.
dfApT.head(n=5)

Unnamed: 0,device_id,count_subs
0,41863526108385,35
1,135153013040192,8
2,161514654074162,6
3,181891380775191,1
4,186034136943920,7


## Veo la cantidad de veces que aparece cada dispositivo según el source_id

In [8]:
# Constant helper column of ones: summing it inside a group gives that group's row count.
auct['apariciones'] = np.ones(len(auct), dtype=np.int64)

In [9]:
# Auction appearances per device, pivoted to one column per source_id.
#
# The 'apariciones' series is selected *before* unstacking so the resulting
# columns are a flat index of source_id values. The previous version unstacked
# a one-column DataFrame (two-level columns), called reset_index() and then
# dropped the outer level; that left an extra column labelled '' for
# device_id, so assigning one name per source afterwards raised a length
# mismatch. device_id is kept as the index, which is what the captured
# output of dfA.head() shows.
#
# Device/source combinations that never occur are filled with 0; because the
# gaps are NaN before filling, the counts are floats.
dfA = (
    auct.groupby(['device_id', 'source_id'])['apariciones']
    .sum()
    .unstack('source_id')
    .fillna(0)
)

In [10]:
# Flat feature names for the per-source count columns, assigned by position.
# NOTE(review): the first name uses a different pattern ('apariciones_source_0')
# than the remaining nine ('count_src_N'); kept as-is in case later cells rely on it.
source_count_names = ['apariciones_source_0']
source_count_names += [f'count_src_{src}' for src in range(1, 10)]
dfA.columns = source_count_names

In [11]:
# Preview the first five rows of the per-source appearance counts.
dfA.head(n=5)

Unnamed: 0_level_0,apariciones_source_0,count_src_1,count_src_2,count_src_3,count_src_4,count_src_5,count_src_6,count_src_7,count_src_8,count_src_9
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
41863526108385,0.0,0.0,0.0,28.0,0.0,3.0,0.0,0.0,4.0,0.0
135153013040192,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161514654074162,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
181891380775191,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
186034136943920,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Veo la cantidad de veces que aparece cada dispositivo en una subasta según el ref_type

In [12]:
# Number of auction rows for each ref_type_id value.
ref_type_col = auct['ref_type_id']
ref_type_col.value_counts()

1    13313564
7     2331211
Name: ref_type_id, dtype: int64

In [13]:
# Auction appearances per device, pivoted to one column per ref_type_id.
#
# The 'apariciones' series is selected before unstacking so the columns are a
# flat index of ref_type_id values. The previous version unstacked a
# one-column DataFrame, called reset_index() and dropped the outer column
# level, which left three labels ('', 1, 7); assigning the two names below
# then raised a length mismatch. device_id stays as the index, matching the
# captured output of this cell.
#
# Missing device/ref_type combinations are filled with 0 (counts are floats).
dfApRef = (
    auct.groupby(['device_id', 'ref_type_id'])['apariciones']
    .sum()
    .unstack('ref_type_id')
    .fillna(0)
)
# Derive the names from the ids actually present instead of hard-coding them:
# for the ids 1 and 7 found in this window this yields 'count_ref_1' and
# 'count_ref_7', and it keeps working if a window contains other ref types.
dfApRef.columns = [f'count_ref_{ref_type}' for ref_type in dfApRef.columns]
dfApRef.head()

Unnamed: 0_level_0,count_ref_1,count_ref_7
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1
41863526108385,35.0,0.0
135153013040192,8.0,0.0
161514654074162,6.0,0.0
181891380775191,1.0,0.0
186034136943920,7.0,0.0


## Promedio de apariciones de un dispositivo por día

In [14]:
# Calendar day (time of day dropped) of every auction, stored in 'fecha'.
auction_days = auct['date'].dt.date
auct['fecha'] = auction_days

In [17]:
# Auction appearances per device and calendar day: one row per device, one
# column per day in 'fecha', plus a regular 'device_id' column.
#
# The 'apariciones' series is selected before unstacking so the columns stay a
# single level. The previous version unstacked a one-column DataFrame, called
# reset_index() and then dropped the outer column level, which renamed the
# device_id column to '' (the inner level of a reset index column is empty).
# With a flat column index, reset_index() simply adds a column named
# 'device_id', as shown in the captured output.
#
# Days on which a device did not appear are filled with 0 (counts are floats).
dfApDay = (
    auct.groupby(['device_id', 'fecha'])['apariciones']
    .sum()
    .unstack('fecha')
    .fillna(0)
    .reset_index()
)

In [20]:
# Preview the first five rows of the per-day appearance counts.
# NOTE(review): the output captured for this cell lists ten rows, which a
# five-row preview would not produce; the cell was presumably edited after
# it was last run.
dfApDay.head(n=5)

fecha,device_id,2019-04-18,2019-04-19,2019-04-20
0,41863526108385,0.0,1.0,34.0
1,135153013040192,0.0,0.0,8.0
2,161514654074162,6.0,0.0,0.0
3,181891380775191,0.0,0.0,1.0
4,186034136943920,2.0,0.0,5.0
5,283297668933729,1.0,0.0,0.0
6,345999128501141,45.0,2.0,1.0
7,360710529886978,9.0,1.0,3.0
8,365882020742330,0.0,4.0,1.0
9,407152743717620,1.0,0.0,0.0
