# Análisis de Datos

El dataset Diginetica posee los siguientes atributos:
    
sessionid: ID de la sesión.
userid: ID del usuario (El paper se prueba con anonymous session-based recommendations, todos los id's son NA) 
itemid: ID del item.
timeframe: tiempo que el usuario permanecio en el item.
eventdate: fecha del evento (cuando se interactuo).

In [17]:
import pandas as pd
import numpy as np

In [18]:
# Load the data
views = pd.read_csv("./src/datasets/diginetica/train-item-views.csv", sep=";")
categories = pd.read_csv("./src/datasets/diginetica/product-categories.csv", sep=";")


In [19]:
# --- Quick look ---
print("Views dataset:")
print(views.head(), "\n")
print("Categories dataset:")
print(categories.head(), "\n")


Views dataset:
   sessionId  userId  itemId  timeframe   eventdate
0          1     NaN   81766     526309  2016-05-09
1          1     NaN   31331    1031018  2016-05-09
2          1     NaN   32118     243569  2016-05-09
3          1     NaN    9654      75848  2016-05-09
4          1     NaN   32627    1112408  2016-05-09 

Categories dataset:
   itemId  categoryId
0  139578        1096
1  417975        1096
2  291805        1096
3  396921        1096
4  159257        1096 



In [20]:
# --- Basic info ---
print("Views info:")
views.info()
print("\nCategories info:")
categories.info()

Views info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235380 entries, 0 to 1235379
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   sessionId  1235380 non-null  int64  
 1   userId     372991 non-null   float64
 2   itemId     1235380 non-null  int64  
 3   timeframe  1235380 non-null  int64  
 4   eventdate  1235380 non-null  object 
dtypes: float64(1), int64(3), object(1)
memory usage: 47.1+ MB

Categories info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184047 entries, 0 to 184046
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   itemId      184047 non-null  int64
 1   categoryId  184047 non-null  int64
dtypes: int64(2)
memory usage: 2.8 MB


In [21]:
# --- Descriptive statistics ---
print("\nViews describe:")
print(views.describe(include="all"))

print("\nCategories describe:")
print(categories.describe(include="all"))


Views describe:


           sessionId         userId        itemId     timeframe   eventdate
count   1.235380e+06  372991.000000  1.235380e+06  1.235380e+06     1235380
unique           NaN            NaN           NaN           NaN         152
top              NaN            NaN           NaN           NaN  2016-05-30
freq             NaN            NaN           NaN           NaN       17320
mean    2.392015e+05   86963.792668  8.569671e+04  3.465888e+05         NaN
std     1.608633e+05   65592.218564  1.064063e+05  3.175156e+05         NaN
min     1.000000e+00       2.000000  1.000000e+00  1.200000e+01         NaN
25%     1.015720e+05   30832.000000  1.397800e+04  8.172950e+04         NaN
50%     2.124630e+05   71553.000000  4.189400e+04  2.417915e+05         NaN
75%     3.603910e+05  132402.000000  1.122100e+05  5.398880e+05         NaN
max     6.006870e+05  249347.000000  7.338480e+05  1.199992e+06         NaN

Categories describe:
              itemId     categoryId
count  184047.000000  184047.0

In [22]:
# --- Unique counts ---
n_sessions = views["sessionId"].nunique()
n_items = views["itemId"].nunique()
n_users = views["userId"].nunique()
avg_session_len = views.groupby("sessionId")["itemId"].count().mean()
print(f"\n# Sessions: {n_sessions}")
print(f"# Items: {n_items}")
print(f"# Users: {n_users} (many will be NA/anonymized)")
print(f"Avg. session length: {avg_session_len:.2f}")


# Sessions: 310324
# Items: 122993
# Users: 87934 (many will be NA/anonymized)
Avg. session length: 3.98


In [23]:
# --- Event date range ---
print("\nEvent date range:")
print(views["eventdate"].min(), "→", views["eventdate"].max())


Event date range:
2016-01-01 → 2016-06-01


In [24]:
# --- Categories ---
n_categories = categories["categoryId"].nunique()
items_with_category = categories["itemId"].nunique()

print(f"\n# Categories: {n_categories}")
print(f"# Items with category info: {items_with_category} "
      f"({items_with_category / n_items:.2%} of total items)")


# Categories: 1217
# Items with category info: 184047 (149.64% of total items)


In [25]:
# --- Most frequent categories ---
print("\nTop 10 categories by item count:")
print(categories["categoryId"].value_counts().head(10))



Top 10 categories by item count:
categoryId
807    3851
842    3627
368    2363
634    2245
822    2121
371    2093
88     1706
684    1670
47     1649
402    1379
Name: count, dtype: int64


## MovieLens 100K

In [None]:
!pip install pandas numpy recommenders

Collecting recommenders
  Downloading recommenders-1.2.1-py3-none-any.whl.metadata (13 kB)
Collecting category-encoders<3,>=2.6.0 (from recommenders)
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting cornac<3,>=1.15.2 (from recommenders)
  Downloading cornac-2.3.4-cp312-cp312-manylinux1_x86_64.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.4/51.4 kB[0m [31m972.6 kB/s[0m eta [36m0:00:00[0m
Collecting locust<3,>=2.12.2 (from recommenders)
  Downloading locust-2.41.3-py3-none-any.whl.metadata (9.6 kB)
Collecting memory-profiler<1,>=0.61.0 (from recommenders)
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Collecting retrying<2,>=1.3.4 (from recommenders)
  Downloading retrying-1.4.2-py3-none-any.whl.metadata (5.5 kB)
Collecting scikit-surprise>=1.1.3 (from recommenders)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4

In [None]:
from recommenders.datasets import movielens
import pandas as pd

In [None]:
df = movielens.load_pandas_df(
    size="100k",
    # genres_col="genre",
    # header=["userID", "itemID", "rating"]
)

print(df.shape)
df.sample(5, random_state=42)

100%|██████████| 4.81k/4.81k [00:00<00:00, 21.8kKB/s]


(100000, 4)


Unnamed: 0,userID,itemID,rating,timestamp
75721,877,381,4.0,882677345
80184,815,602,3.0,878694269
19864,94,431,4.0,891721716
76699,416,875,2.0,876696938
92991,500,182,2.0,883873556


In [None]:
df.describe()

Unnamed: 0,userID,itemID,rating,timestamp,datetime
count,100000.0,100000.0,100000.0,100000.0,100000
mean,462.48475,425.53013,3.52986,883528900.0,1997-12-31 00:40:51.488619904
min,1.0,1.0,1.0,874724700.0,1997-09-20 03:05:10
25%,254.0,175.0,3.0,879448700.0,1997-11-13 19:18:29.500000
50%,447.0,322.0,4.0,882826900.0,1997-12-22 21:42:24
75%,682.0,631.0,4.0,888260000.0,1998-02-23 18:53:04
max,943.0,1682.0,5.0,893286600.0,1998-04-22 23:10:38
std,266.61442,330.798356,1.125674,5343856.0,


In [None]:
# Number of users, items, ratings
n_users = df["userID"].nunique()
n_items = df["itemID"].nunique()
n_ratings = len(df)

print(f"Users: {n_users}, Items: {n_items}, Ratings: {n_ratings}")

# Density (sparsity of rating matrix)
density = n_ratings / (n_users * n_items)
print(f"Density: {density:.5f}")

# Average ratings per user/item
print("Avg ratings per user:", df.groupby("userID")["rating"].count().mean())
print("Avg ratings per item:", df.groupby("itemID")["rating"].count().mean())

# Rating distribution
print("\nRating distribution:")
print(df["rating"].value_counts(normalize=True).sort_index())

# Time coverage
df["datetime"] = pd.to_datetime(df["timestamp"], unit="s")
print("\nTime range:", df["datetime"].min(), "→", df["datetime"].max())


Users: 943, Items: 1682, Ratings: 100000
Density: 0.06305
Avg ratings per user: 106.04453870625663
Avg ratings per item: 59.45303210463734

Rating distribution:
rating
1.0    0.06110
2.0    0.11370
3.0    0.27145
4.0    0.34174
5.0    0.21201
Name: proportion, dtype: float64

Time range: 1997-09-20 03:05:10 → 1998-04-22 23:10:38


# Amazon Reviews 2023 Digital Music

In [None]:
import pandas as pd
import json

# Read reviews line by line into a list of dicts
reviews = []
with open("Digital_Music.jsonl", "r") as f:
    for line in f:
        try:
            reviews.append(json.loads(line))
        except json.JSONDecodeError:
            # Skip malformed lines
            continue

reviews = pd.DataFrame(reviews)
print("Reviews shape:", reviews.shape)
print(reviews.head())


Reviews shape: (130434, 10)
   rating                             title  \
0     5.0                              Nice   
1     5.0                         Excellent   
2     5.0                     Great service   
3     1.0                           No good   
4     3.0  Cool concept, so-so execution...   

                                                text images        asin  \
0  If i had a dollar for how many times I have pl...     []  B004RQ2IRG   
1  awesome sound - cant wait to see them in perso...     []  B0026UZEI0   
2  This is a great cd. Good music and plays well....     []  B0055JSYHC   
3  These are not real German singers, they have a...     []  B000F9SMUQ   
4  I first heard this playing in a Nagoya shop an...     []  B0049D1WVK   

  parent_asin                       user_id      timestamp  helpful_vote  \
0  B004RQ2IRG  AFUOYIZBU3MTBOLYKOJE5Z35MBDA  1618972613292             0   
1  B0026UZEI0  AHGAOIZVODNHYMNCBV4DECZH42UQ  1308167525000             0   
2  B0055JS

In [None]:
import pandas as pd

# Users, items, reviews
n_users = reviews["user_id"].nunique()
n_items = reviews["asin"].nunique()
n_reviews = len(reviews)

print(f"Users: {n_users}")
print(f"Items: {n_items}")
print(f"Reviews: {n_reviews}")

# Average activity
print("Avg. reviews per user:", reviews.groupby("user_id")["rating"].count().mean())
print("Avg. reviews per item:", reviews.groupby("asin")["rating"].count().mean())

# Rating distribution
print("\nRating distribution:")
print(reviews["rating"].value_counts(normalize=True).sort_index())

# Time coverage
reviews["reviewTime"] = pd.to_datetime(reviews["timestamp"], unit="ms")
print("\nTime range:", reviews["reviewTime"].min(), "→", reviews["reviewTime"].max())


Users: 100952
Items: 70519
Reviews: 130434
Avg. reviews per user: 1.2920397812821935
Avg. reviews per item: 1.8496291779520413

Rating distribution:
rating
1.0    0.047043
2.0    0.024219
3.0    0.049006
4.0    0.108323
5.0    0.771409
Name: proportion, dtype: float64

Time range: 1997-09-18 04:38:56 → 2023-09-06 22:41:39.945000


In [None]:
reviews.describe()

Unnamed: 0,rating,timestamp,helpful_vote,reviewTime
count,130434.0,130434.0,130434.0,130434
mean,4.532837,1469880000000.0,1.04505,2016-07-30 11:56:35.961944064
min,1.0,874557500000.0,0.0,1997-09-18 04:38:56
25%,5.0,1400349000000.0,0.0,2014-05-17 17:49:37.249999872
50%,5.0,1475705000000.0,0.0,2016-10-05 21:58:05
75%,5.0,1573689000000.0,1.0,2019-11-13 23:42:09.836499968
max,5.0,1694040000000.0,259.0,2023-09-06 22:41:39.945000
std,1.027994,136973000000.0,3.485248,


In [None]:
meta = []
with open("meta_Digital_Music.jsonl", "r") as f:
    for line in f:
        try:
            meta.append(json.loads(line))
        except json.JSONDecodeError:
            continue

meta = pd.DataFrame(meta)
print("Meta shape:", meta.shape)
print(meta.head())


Meta shape: (70537, 14)
   main_category                                              title  \
0  Digital Music                                  Baja Marimba Band   
1  Digital Music   '80s Halloween-All Original Artists & Recordings   
2  Digital Music                                            TRIO +1   
3  Digital Music  Gold and Silver: Lehar, Delibes, Lanner, Johan...   
4  Digital Music  Grateful Dead Dave's Picks Volume 25 Live at B...   

   average_rating  rating_number features  \
0             4.9              8       []   
1             5.0              3       []   
2             5.0              1       []   
3             5.0              1       []   
4             4.9             20       []   

                            description   price  \
0                                    []     NaN   
1                                    []   14.98   
2                            [CD ALBUM]   57.99   
3                                    []   29.91   
4  [Sold out. Numbered 

In [None]:
print("Metadata shape:", meta.shape)
print("Metadata columns:", meta.columns.tolist())

Metadata shape: (70537, 15)
Metadata columns: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'categories_str']


In [None]:
def clean_category(x):
    if isinstance(x, list) and len(x) > 0:
        # If it's a list of lists with single chars, join them
        if isinstance(x[0], list):
            return " > ".join(["".join(part) for part in x])
        # If it's already strings, just join
        return " > ".join(x)
    return None

meta["categories_str"] = meta["categories"].apply(clean_category)

print("Top 10 categories after cleaning:")
print(meta["categories_str"].value_counts().head(10))


Top 10 categories after cleaning:
categories_str
Digital Music > Music By Price > $5.00 to $5.99          2
Digital Music > Music By Price > $8.00 to $8.99          2
Digital Music > International Music > Far East & Asia    1
Digital Music > Country                                  1
Digital Music > Rock                                     1
Name: count, dtype: int64


In [None]:
meta.describe()

Unnamed: 0,average_rating,rating_number,price
count,70537.0,70537.0,40125.0
mean,4.552297,151.024895,40.199933
std,0.695553,1555.166987,63.837289
min,1.0,1.0,0.01
25%,4.4,1.0,12.99
50%,4.8,3.0,23.9
75%,5.0,15.0,42.85
max,5.0,131983.0,2200.0
