In [1]:
import pandas as pd
import numpy as np

#  Carga de datos

In [2]:
auctions = pd.read_csv("auctions.csv", dtype={"ref_type_id": np.uint8, "source_id": np.uint8})
auctions["date"] =  pd.to_datetime(auctions["date"], errors = "coerce")

In [3]:
clicks = pd.read_csv('clicks.csv', low_memory=False)
clicks["created"] =  pd.to_datetime(clicks["created"], errors = "coerce")

In [None]:
events = pd.read_csv('events.csv', low_memory=False)
events["date"] =  pd.to_datetime(events["date"], errors = "coerce")

In [4]:
installs = pd.read_csv('installs.csv', low_memory = False)
installs["created"] =  pd.to_datetime(installs["created"], errors = "coerce") 

# Partición de los datos por ventanas

In [5]:
#la fecha final debe ser las 00hs del día siguiente a la fecha final de la ventana
def create_window(df, initial_date, final_date, date_feature_name):
    initial = pd.to_datetime(initial_date)
    final = pd.to_datetime(final_date)
    return df.loc[ (df[date_feature_name] >= initial) \
                  & (df[date_feature_name] < final) ]

In [6]:
auctions_18_20 =  create_window(auctions, "2019-04-18", "2019-04-21", "date")
auctions_19_21 =  create_window(auctions, "2019-04-19", "2019-04-22", "date")
auctions_20_22 =  create_window(auctions, "2019-04-20", "2019-04-23", "date")
auctions_21_23 =  create_window(auctions, "2019-04-21", "2019-04-24", "date")
auctions_22_24 =  create_window(auctions, "2019-04-22", "2019-04-25", "date")

In [None]:
events_18_20 =  create_window(events, "2019-04-18", "2019-04-21", "date")
events_19_21 =  create_window(events, "2019-04-19", "2019-04-22", "date")
events_20_22 =  create_window(events, "2019-04-20", "2019-04-23", "date")
events_21_23 =  create_window(events, "2019-04-21", "2019-04-24", "date")
events_22_24 =  create_window(events, "2019-04-22", "2019-04-25", "date")

In [7]:
clicks_18_20 =  create_window(clicks, "2019-04-18", "2019-04-21", "created")
clicks_19_21 =  create_window(clicks, "2019-04-19", "2019-04-22", "created")
clicks_20_22 =  create_window(clicks, "2019-04-20", "2019-04-23", "created")
clicks_21_23 =  create_window(clicks, "2019-04-21", "2019-04-24", "created")
clicks_22_24 =  create_window(clicks, "2019-04-22", "2019-04-25", "created")

In [8]:
installs_18_20 =  create_window(installs, "2019-04-18", "2019-04-21", "created")
installs_19_21 =  create_window(installs, "2019-04-19", "2019-04-22", "created")
installs_20_22 =  create_window(installs, "2019-04-20", "2019-04-23", "created")
installs_21_23 =  create_window(installs, "2019-04-21", "2019-04-24", "created")
installs_22_24 =  create_window(installs, "2019-04-22", "2019-04-25", "created")

# Inicialización de df de features

In [9]:
#le pasamos a esta funcion el dataframe de la ventana a la cual deseemos agregarle features posteriormente
def initialize_trainning_df(df, index_name):
    trainning_df = pd.DataFrame(df[index_name].unique())
    trainning_df.columns = ['ref_hash']
    trainning_df = trainning_df.set_index('ref_hash')
    return trainning_df

In [15]:
example = initialize_trainning_df(installs_18_20, "ref_hash")
example.head(3)

5230323462636548010
5097163995161606833
6328027616411983332


# Sección para añadir features

> *¡Atención! Antes de usar la función add_feature, setear como indice el id de los dispositivos en el df que contiene el feature que se desea agregar*

In [10]:
def charge_df(csv_name, dict_dtypes = None):
    return pd.read_csv(csv_name, dict_dtypes)

In [70]:
def add_feature(df_features, df_to_join, feature_name, fill = 0):
    df_features = df_features.merge(df_to_join[[feature_name]], how = 'left', left_index = True, right_index = True).\
    fillna(fill)
    return df_features

Para no tener que volver a computar las operaciones, una vez agregados los features podemos guardar el estado final del df de features

In [12]:
def save_changes(df, file_name):
    df.to_csv(file_name, index = True)

## Auctions

In [71]:
auc_18_20_features = initialize_trainning_df(auctions_18_20, "device_id")
auc_18_20_features.head(1)

1109595589636746168


> ***Feature 1: cantidad de veces que el usuario aparece en una subasta***

In [72]:
auctions_18_20['appearances_in_auctions'] = 1
feature1 = auctions_18_20.groupby('device_id').agg({'appearances_in_auctions': 'count'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [73]:
auc_18_20_features = add_feature(auc_18_20_features, feature1, "appearances_in_auctions")

In [83]:
auc_18_20_features.head(2)

Unnamed: 0_level_0,appearances_in_auctions,number_of_clicks
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1
1109595589636746168,128,0.0
5896614299191635403,46,0.0


In [84]:
save_changes(auc_18_20_features, "auc_18_20_features.csv")

> ***Feature 2: cantidad de clicks realizados por el usuario***

In [68]:
clicks_18_20['number_of_clicks'] = 1
feature2 = clicks_18_20.groupby('ref_hash').agg({'number_of_clicks': 'count'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [76]:
auc_18_20_features = add_feature(auc_18_20_features, feature2, "number_of_clicks")

In [85]:
save_changes(auc_18_20_features, "auc_18_20_features.csv")

## Installs

Feature 1