In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

In [3]:
dtypes_dict = {'created':'str',  'ref_type': 'category', 'device_brand': np.float32, \
                'device_countrycode': 'category', 'device_language': 'category', \
                'ref_hash': 'category', 'kind': 'str', 'user_agent': 'str'}

installs = pd.read_csv('data/installs.csv.gzip', dtype=dtypes_dict, low_memory=False)

installs = installs.drop('device_countrycode', axis=1)
installs['created'] = pd.to_datetime(installs['created'], format='%Y-%m-%d %H:%M:%S')

In [4]:
installs = installs.drop(columns=['device_language', 'ip_address', 'event_uuid', 'device_brand', \
                         'device_model', 'click_hash', 'session_user_agent'])

In [5]:
# Normalizacion de valores a binario

installs['wifi'] = installs['wifi'].map({True: 1, False: 0})
installs['wifi'] = installs['wifi'].fillna(-1)

installs['attributed'] = installs['attributed'].map({True: 1, False: 0})
installs['implicit'] = installs['implicit'].map({True: 1, False: 0})

In [6]:
installs['ref_type_189'] = (installs['ref_type'].astype(str).str.contains("1891515180541284343") == True).astype(int)
installs['ref_type_149'] = (installs['ref_type'].astype(str).str.contains("1494519392962156891") == True).astype(int)

installs = installs.drop('ref_type', axis=1)

In [7]:
# features de dias

installs['weekday'] = installs['created'].dt.weekday

installs = pd.get_dummies(installs, columns=['weekday'])

# installs = installs.rename(index=str, columns={"weekday_0": "lunes", "weekday_1": "martes", "weekday_2": "miercoles", \
#                              "weekday_3": "jueves", "weekday_4": "viernes", "weekday_5": "sabado", \
#                              "weekday_6": "domingo"})

In [8]:
# feature horarios

bins = [0, 6, 13, 17, 25]
labels = ['morning','afternoon','evening','night']
hours = installs['created'].dt.hour
installs['time'] = pd.cut(hours-5+24 *(hours<5),bins=bins,labels=labels,right=False)

installs["time_day"] = labelencoder.fit_transform(installs["time"])
installs = pd.get_dummies(installs, columns=['time'])

In [9]:
# installs kinds to features, creo que no sirve

installs['kind_open'] = (installs['kind'].astype(str).str.lower().str.contains("open|abertura") == True).astype(int)
installs['kind_home'] = (installs['kind'].astype(str).str.lower().str.contains("home") == True).astype(int)
installs['kind_start'] = (installs['kind'].astype(str).str.lower().str.contains("start") == True).astype(int)
installs['kind_view'] = (installs['kind'].astype(str).str.lower().str.contains("view") == True).astype(int)
installs['kind_sign'] = (installs['kind'].astype(str).str.lower().str.contains("sign|registration|account") == True).astype(int)
installs['kind_login'] = (installs['kind'].astype(str).str.lower().str.contains("login|begin") == True).astype(int)
installs['kind_launch'] = (installs['kind'].astype(str).str.lower().str.contains("launch") == True).astype(int)
installs['kind_install'] = (installs['kind'].astype(str).str.lower().str.contains("install") == True).astype(int)
installs['kind_purchase'] = (installs['kind'].astype(str).str.lower().str.contains("purchase|buy") == True).astype(int)
installs['kind_add'] = (installs['kind'].astype(str).str.lower().str.contains("add") == True).astype(int)

In [10]:
# user_agent features

installs['agent_zaful'] = (installs['user_agent'].astype(str).str.lower().str.contains("zaful") == True).astype(int)
installs['agent_cornershop'] = (installs['user_agent'].astype(str).str.lower().str.contains("cornershop") == True).astype(int)
installs['agent_binomo'] = (installs['user_agent'].astype(str).str.lower().str.contains("binomo") == True).astype(int)
installs['agent_postmates'] = (installs['user_agent'].astype(str).str.lower().str.contains("postmates") == True).astype(int)

installs['agent_tiktok'] = (installs['user_agent'].astype(str).str.lower().str.contains("tiktok") == True).astype(int)
installs['agent_sindelantal'] = (installs['user_agent'].astype(str).str.lower().str.contains("sindelantal") == True).astype(int)
installs['agent_gardenscapes'] = (installs['user_agent'].astype(str).str.lower().str.contains("gardenscapes") == True).astype(int)
installs['agent_dalvik'] = (installs['user_agent'].astype(str).str.lower().str.contains("dalvik") == True).astype(int)
installs['agent_wish'] = (installs['user_agent'].astype(str).str.lower().str.contains("wish") == True).astype(int)
installs['agent_trivago'] = (installs['user_agent'].astype(str).str.lower().str.contains("trivago") == True).astype(int)
installs['agent_grability'] = (installs['user_agent'].astype(str).str.lower().str.contains("grability") == True).astype(int)
installs['agent_mercadopago'] = (installs['user_agent'].astype(str).str.lower().str.contains("mercadopago") == True).astype(int)

In [11]:
# installs['user_agent'].loc[~installs['user_agent'].astype(str).str.lower() \
#              .str.contains("mercadopago|tiktok|sindelantal|gardenscapes|dalvik|wish|trivago|grability") == True].value_counts()

In [11]:
app_popular_list = installs['application_id'].value_counts().head(10).index

In [12]:
print(app_popular_list)

Int64Index([121, 36, 210, 14, 65, 68, 122, 155, 226, 21], dtype='int64')


In [13]:
# enumero las 10 aplicaciones mas populares

values_dict = {'121': 10, '36': 9, '210': 8, '14': 7, '65':6, '68': 5, '122':4, '155':3, '226':2, '21': 1}

installs['popular_apps'] = installs['application_id'].map(values_dict)
installs['popular_apps'].fillna(0, inplace=True)

In [14]:
installs = pd.get_dummies(installs, columns=['popular_apps'])

In [15]:
installs = installs.drop(columns=['kind', 'user_agent', 'trans_id'])

# **Ventanas para train y test**

In [16]:
time_start_1 = '2019-04-18'
time_start_2 = '2019-04-19'
time_start_3 = '2019-04-20'
time_start_4 = '2019-04-21'
time_start_5 = '2019-04-22'

In [17]:
# Ventanas para train y testing

window_1 = installs.loc[installs['created'].between('2019-04-18', '2019-04-21')]
window_2 = installs.loc[installs['created'].between('2019-04-19', '2019-04-22')]
window_3 = installs.loc[installs['created'].between('2019-04-20', '2019-04-23')]
window_4 = installs.loc[installs['created'].between('2019-04-21', '2019-04-24')]
window_5 = installs.loc[installs['created'].between('2019-04-22', '2019-04-25')]

In [23]:
# Contar la cantidad de apariciones que tiene un usuario por ventada

window_1.loc[:,'count_installs'] = window_1.groupby('ref_hash')['ref_hash'].transform('count')
window_2.loc[:,'count_installs'] = window_2.groupby('ref_hash')['ref_hash'].transform('count')
window_3.loc[:,'count_installs'] = window_3.groupby('ref_hash')['ref_hash'].transform('count')
window_4.loc[:,'count_installs'] = window_4.groupby('ref_hash')['ref_hash'].transform('count')
window_5.loc[:,'count_installs'] = window_5.groupby('ref_hash')['ref_hash'].transform('count')

In [25]:
# primera y ultima instalacion por ref_hash

window_1.loc[:,'last_install'] = window_1.groupby('ref_hash')['created'].transform('max')
window_1.loc[:,'first_install'] = window_1.groupby('ref_hash')['created'].transform('min')

window_2.loc[:,'last_install'] = window_2.groupby('ref_hash')['created'].transform('max')
window_2.loc[:,'first_install'] = window_2.groupby('ref_hash')['created'].transform('min')

window_3.loc[:,'last_install'] = window_3.groupby('ref_hash')['created'].transform('max')
window_3.loc[:,'first_install'] = window_3.groupby('ref_hash')['created'].transform('min')

window_4.loc[:,'last_install'] = window_4.groupby('ref_hash')['created'].transform('max')
window_4.loc[:,'first_install'] = window_4.groupby('ref_hash')['created'].transform('min')

window_5.loc[:,'last_install'] = window_5.groupby('ref_hash')['created'].transform('max')
window_5.loc[:,'first_install'] = window_5.groupby('ref_hash')['created'].transform('min')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


In [26]:
# diferencia de tiempo entre la primera y la ultima instalacion

window_1.loc[:,'diff_between_installs'] = (window_1['last_install'] - window_1['first_install']).dt.total_seconds()

window_2.loc[:,'diff_between_installs'] = (window_2['last_install'] - window_2['first_install']).dt.total_seconds()

window_3.loc[:,'diff_first_installs'] = (window_3['last_install'] - window_3['first_install']).dt.total_seconds()

window_4.loc[:,'diff_first_installs'] = (window_4['last_install'] - window_4['first_install']).dt.total_seconds()

window_5.loc[:,'diff_first_installs'] = (window_5['last_install'] - window_5['first_install']).dt.total_seconds()

In [28]:
# Promedio de diferencias de tiempo entre todas las instalaciones
window_1.loc[:,'prom_time_install'] = 0
window_1.loc[window_1['count_installs'] > 1, 'prom_time_install'] = ((window_1['last_install'] - window_1['first_install'])/ \
                                 (window_1['count_installs'] -1)).dt.total_seconds()

window_2.loc[:,'prom_time_install'] = 0
window_2.loc[window_2['count_installs'] > 1,'prom_time_install'] = ((window_2['last_install'] - window_2['first_install'])/ \
                                 (window_2['count_installs'] - 1)).dt.total_seconds()

window_3.loc[:,'prom_time_install'] = 0
window_3.loc[window_3['count_installs'] > 1,'prom_time_install'] = ((window_3['last_install'] - window_3['first_install'])/ \
                                 (window_3['count_installs'] - 1)).dt.total_seconds()

window_4.loc[:,'prom_time_install'] = 0
window_4.loc[window_4['count_installs'] > 1, 'prom_time_install'] = ((window_4['last_install'] - window_4['first_install'])/ \
                                 (window_4['count_installs'] - 1)).dt.total_seconds()

window_5.loc[:,'prom_time_install'] = 0
window_5.loc[window_5['count_installs'] > 1, 'prom_time_install'] = ((window_5['last_install'] - window_5['first_install'])/ \
                                 (window_5['count_installs'] -1 )).dt.total_seconds()

In [29]:
# diferencia de tiempo entre el comienzo de la ventana y la primera y ultima instalacion

window_1.loc[:,'time_first_install'] = (window_1['first_install'] - pd.Timestamp(time_start_1)).dt.total_seconds()
window_1.loc[:,'time_last_install'] = (window_1['last_install'] - pd.Timestamp(time_start_1)).dt.total_seconds()

window_2.loc[:,'time_first_install'] = (window_2['first_install'] - pd.Timestamp(time_start_2)).dt.total_seconds()
window_2.loc[:,'time_last_install'] = (window_2['last_install'] - pd.Timestamp(time_start_2)).dt.total_seconds()

window_3.loc[:,'time_first_install'] = (window_3['first_install'] - pd.Timestamp(time_start_3)).dt.total_seconds()
window_3['time_last_install'] = (window_3['last_install'] - pd.Timestamp(time_start_3)).dt.total_seconds()

window_4.loc[:,'time_first_install'] = (window_4['first_install'] - pd.Timestamp(time_start_4)).dt.total_seconds()
window_4.loc[:,'time_last_install'] = (window_4['last_install'] - pd.Timestamp(time_start_4)).dt.total_seconds()

window_5.loc[:,'time_first_install'] = (window_5['first_install'] - pd.Timestamp(time_start_5)).dt.total_seconds()
window_5.loc[:,'time_last_install'] = (window_5['last_install'] - pd.Timestamp(time_start_5)).dt.total_seconds()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [30]:

window_1 = window_1.sort_values('created')
window_2 = window_2.sort_values('created')
window_3 = window_3.sort_values('created')
window_4 = window_4.sort_values('created')
window_5 = window_5.sort_values('created')

In [31]:
window_1 = window_1.groupby('ref_hash').head(1)
window_2 = window_2.groupby('ref_hash').head(1)
window_3 = window_3.groupby('ref_hash').head(1)
window_4 = window_4.groupby('ref_hash').head(1)
window_5 = window_5.groupby('ref_hash').head(1)

In [32]:
print(len(window_1))
print(len(window_2))
print(len(window_3))
print(len(window_4))
print(len(window_5))

132297
129651
131181
133278
132834


In [33]:
window_1 = window_1.drop(columns=['created', 'last_install', 'first_install'])
window_2 = window_2.drop(columns=['created', 'last_install', 'first_install'])
window_3 = window_3.drop(columns=['created', 'last_install', 'first_install'])
window_4 = window_4.drop(columns=['created', 'last_install', 'first_install'])
window_5 = window_5.drop(columns=['created', 'last_install', 'first_install'])

In [34]:
window_1.head()

Unnamed: 0,application_id,ref_hash,attributed,implicit,wifi,ref_type_189,ref_type_149,weekday_0,weekday_1,weekday_2,...,agent_wish,agent_trivago,agent_grability,agent_mercadopago,popular_apps_0.0,count_installs,diff_between_installs,prom_time_install,time_first_install,time_last_install
94738,70,4432995619177048534,0,0,-1.0,1,0,0,0,0,...,0,0,0,0,1,1,0.0,0.0,1.56,1.56
94737,70,5904733559638204455,0,0,-1.0,1,0,0,0,0,...,0,0,0,0,1,1,0.0,0.0,1.851,1.851
111877,65,896373747754111825,0,1,0.0,1,0,0,0,0,...,0,0,0,0,1,2,4.006,4.006,5.152,9.158
76146,27,3399210824535017892,0,0,1.0,0,1,0,0,0,...,0,1,0,0,1,1,0.0,0.0,5.589,5.589
435184,339,1541425881979513687,0,0,1.0,0,1,0,0,0,...,0,0,0,0,1,1,0.0,0.0,6.795,6.795


### to_csv

In [35]:
window_1.to_csv(path_or_buf='windows_installs/window_1.csv',index=False)
window_2.to_csv(path_or_buf='windows_installs/window_2.csv',index=False)
window_3.to_csv(path_or_buf='windows_installs/window_3.csv',index=False)
window_4.to_csv(path_or_buf='windows_installs/window_4.csv',index=False)
window_5.to_csv(path_or_buf='windows_installs/window_5.csv',index=False)