# Proyecto Integrado 2
---

### 1. Objetivo del proyecto 

Una empresa emergente vende productos alimenticios. Se debe investigar el comportamiento del usuario para la aplicación de la empresa.

---


### 2. Carga de dataset y procesamiento de datos 


In [1]:
# Importación de librerías neceserarias
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from plotly import graph_objects as go
import plotly.express as px
import re
from datetime import time

In [2]:
#Asignación de variable al dataset

def load_data():
    try:
        # Intentar cargar los datos de la forma local
        logs = pd.read_csv('logs_exp_us.csv', sep='\t') # Reemplaza con la ruta local real
        return logs
    except FileNotFoundError:
        # Si la carga local falla, intentar la carga desde la instancia de revisión
        try:
            logs = pd.read_csv('/datasets/logs_exp_us.csv', sep='\t')
            return logs
        except FileNotFoundError:
            print("No se pudo encontrar el archivo en ninguna de las ubicaciones.")
            return None

# Llamar a la función para cargar los datos
logs = load_data()

In [3]:
#Visualización de la información del dataset
logs.info()
logs.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244126 entries, 0 to 244125
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   EventName       244126 non-null  object
 1   DeviceIDHash    244126 non-null  int64 
 2   EventTimestamp  244126 non-null  int64 
 3   ExpId           244126 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 7.5+ MB


Unnamed: 0,EventName,DeviceIDHash,EventTimestamp,ExpId
0,MainScreenAppear,4575588528974610257,1564029816,246
1,MainScreenAppear,7416695313311560658,1564053102,246
2,PaymentScreenSuccessful,3518123091307005509,1564054127,248
3,CartScreenAppear,3518123091307005509,1564054127,248
4,PaymentScreenSuccessful,6217807653094995999,1564055322,248
5,CartScreenAppear,6217807653094995999,1564055323,248
6,OffersScreenAppear,8351860793733343758,1564066242,246
7,MainScreenAppear,5682100281902512875,1564085677,246
8,MainScreenAppear,1850981295691852772,1564086702,247
9,MainScreenAppear,5407636962369102641,1564112112,246


In [4]:
#Cambiar el nombre de las columnas 
new_names = {'EventName': 'Event_Name',
            'DeviceIDHash': 'User_ID',
            'EventTimestamp': 'Event_Timestamp',
            'ExpId': 'Exp_Id'}

# Renombrar columnas usando el diccionario 
logs = logs.rename(columns=new_names)

logs.columns

Index(['Event_Name', 'User_ID', 'Event_Timestamp', 'Exp_Id'], dtype='object')

In [5]:
#Cambio de tipo de dato de la columna 'Event_Name' a category 
logs['Event_Name'] = logs['Event_Name'].astype('category')

In [6]:
#Búsqueda de valores ausentes 
logs.isna().sum()

Event_Name         0
User_ID            0
Event_Timestamp    0
Exp_Id             0
dtype: int64

In [7]:
#Búsqueda de filas duplicadas 
logs.duplicated().sum()

np.int64(413)

In [14]:
#Eliminación de filas duplicadas
logs.drop_duplicates().reset_index(inplace= True)
logs

Unnamed: 0,Event_Name,User_ID,Event_Timestamp,Exp_Id,Date
0,MainScreenAppear,4575588528974610257,2019-07-25 04:43:36,246,2019-07-25
1,MainScreenAppear,7416695313311560658,2019-07-25 11:11:42,246,2019-07-25
2,PaymentScreenSuccessful,3518123091307005509,2019-07-25 11:28:47,248,2019-07-25
3,CartScreenAppear,3518123091307005509,2019-07-25 11:28:47,248,2019-07-25
4,PaymentScreenSuccessful,6217807653094995999,2019-07-25 11:48:42,248,2019-07-25
...,...,...,...,...,...
244121,MainScreenAppear,4599628364049201812,2019-08-07 21:12:25,247,2019-08-07
244122,MainScreenAppear,5849806612437486590,2019-08-07 21:13:59,246,2019-08-07
244123,MainScreenAppear,5746969938801999050,2019-08-07 21:14:43,246,2019-08-07
244124,MainScreenAppear,5746969938801999050,2019-08-07 21:14:58,246,2019-08-07


In [9]:
#Agregar columna con fecha 
logs['Event_Timestamp'] = pd.to_datetime(logs['Event_Timestamp'], unit= 's')
logs['Date'] = logs['Event_Timestamp'].values.astype(dtype='datetime64[D]')

logs.head()


Unnamed: 0,Event_Name,User_ID,Event_Timestamp,Exp_Id,Date
0,MainScreenAppear,4575588528974610257,2019-07-25 04:43:36,246,2019-07-25
1,MainScreenAppear,7416695313311560658,2019-07-25 11:11:42,246,2019-07-25
2,PaymentScreenSuccessful,3518123091307005509,2019-07-25 11:28:47,248,2019-07-25
3,CartScreenAppear,3518123091307005509,2019-07-25 11:28:47,248,2019-07-25
4,PaymentScreenSuccessful,6217807653094995999,2019-07-25 11:48:42,248,2019-07-25


---
### 3. Estudiar y comprobar los datos

In [21]:
#Número de eventos que hay 
event_number= logs.groupby('Event_Name')['User_ID'].count()
event_number

  event_number= logs.groupby('Event_Name')['User_ID'].count()


Event_Name
CartScreenAppear            42731
MainScreenAppear           119205
OffersScreenAppear          46825
PaymentScreenSuccessful     34313
Tutorial                     1052
Name: User_ID, dtype: int64

In [18]:
#Número de usuarios 
print('Número de usuarios:', logs['User_ID'].nunique())

Número de usuarios: 7551


In [20]:
#Promedio de eventos por usuario
logs.groupby('Event_Name').agg({'User_ID': 'nunique'}).mean()

  logs.groupby('Event_Name').agg({'User_ID': 'nunique'}).mean()


User_ID    4039.0
dtype: float64