# Análisis de Datos

In [17]:
import pandas as pd
import numpy as np

In [18]:
# Load the data
views = pd.read_csv("./src/datasets/diginetica/train-item-views.csv", sep=";")
categories = pd.read_csv("./src/datasets/diginetica/product-categories.csv", sep=";")


In [19]:
# --- Quick look ---
print("Views dataset:")
print(views.head(), "\n")
print("Categories dataset:")
print(categories.head(), "\n")


Views dataset:
   sessionId  userId  itemId  timeframe   eventdate
0          1     NaN   81766     526309  2016-05-09
1          1     NaN   31331    1031018  2016-05-09
2          1     NaN   32118     243569  2016-05-09
3          1     NaN    9654      75848  2016-05-09
4          1     NaN   32627    1112408  2016-05-09 

Categories dataset:
   itemId  categoryId
0  139578        1096
1  417975        1096
2  291805        1096
3  396921        1096
4  159257        1096 



In [20]:
# --- Basic info ---
print("Views info:")
views.info()
print("\nCategories info:")
categories.info()

Views info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235380 entries, 0 to 1235379
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   sessionId  1235380 non-null  int64  
 1   userId     372991 non-null   float64
 2   itemId     1235380 non-null  int64  
 3   timeframe  1235380 non-null  int64  
 4   eventdate  1235380 non-null  object 
dtypes: float64(1), int64(3), object(1)
memory usage: 47.1+ MB

Categories info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184047 entries, 0 to 184046
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   itemId      184047 non-null  int64
 1   categoryId  184047 non-null  int64
dtypes: int64(2)
memory usage: 2.8 MB


In [21]:
# --- Descriptive statistics ---
print("\nViews describe:")
print(views.describe(include="all"))

print("\nCategories describe:")
print(categories.describe(include="all"))


Views describe:


           sessionId         userId        itemId     timeframe   eventdate
count   1.235380e+06  372991.000000  1.235380e+06  1.235380e+06     1235380
unique           NaN            NaN           NaN           NaN         152
top              NaN            NaN           NaN           NaN  2016-05-30
freq             NaN            NaN           NaN           NaN       17320
mean    2.392015e+05   86963.792668  8.569671e+04  3.465888e+05         NaN
std     1.608633e+05   65592.218564  1.064063e+05  3.175156e+05         NaN
min     1.000000e+00       2.000000  1.000000e+00  1.200000e+01         NaN
25%     1.015720e+05   30832.000000  1.397800e+04  8.172950e+04         NaN
50%     2.124630e+05   71553.000000  4.189400e+04  2.417915e+05         NaN
75%     3.603910e+05  132402.000000  1.122100e+05  5.398880e+05         NaN
max     6.006870e+05  249347.000000  7.338480e+05  1.199992e+06         NaN

Categories describe:
              itemId     categoryId
count  184047.000000  184047.0

In [22]:
# --- Unique counts ---
n_sessions = views["sessionId"].nunique()
n_items = views["itemId"].nunique()
n_users = views["userId"].nunique()
avg_session_len = views.groupby("sessionId")["itemId"].count().mean()
print(f"\n# Sessions: {n_sessions}")
print(f"# Items: {n_items}")
print(f"# Users: {n_users} (many will be NA/anonymized)")
print(f"Avg. session length: {avg_session_len:.2f}")


# Sessions: 310324
# Items: 122993
# Users: 87934 (many will be NA/anonymized)
Avg. session length: 3.98


In [23]:
# --- Event date range ---
print("\nEvent date range:")
print(views["eventdate"].min(), "→", views["eventdate"].max())


Event date range:
2016-01-01 → 2016-06-01


In [24]:
# --- Categories ---
n_categories = categories["categoryId"].nunique()
items_with_category = categories["itemId"].nunique()

print(f"\n# Categories: {n_categories}")
print(f"# Items with category info: {items_with_category} "
      f"({items_with_category / n_items:.2%} of total items)")


# Categories: 1217
# Items with category info: 184047 (149.64% of total items)


In [25]:
# --- Most frequent categories ---
print("\nTop 10 categories by item count:")
print(categories["categoryId"].value_counts().head(10))



Top 10 categories by item count:
categoryId
807    3851
842    3627
368    2363
634    2245
822    2121
371    2093
88     1706
684    1670
47     1649
402    1379
Name: count, dtype: int64
