In [None]:
import pandas as pd
import json
from collections import Counter

# File paths
train_file = '/kaggle/input/otto-recommender-system/train.jsonl'
test_file = '/kaggle/input/otto-recommender-system/test.jsonl'
output_file = '/kaggle/working/submission.csv'

# Event types handled by this script, in the order rows are written per session.
EVENT_TYPES = ('clicks', 'carts', 'orders')
# Number of most frequent items kept (and printed) per event type.
TOP_K = 20
# Number of labels written per submission row; this cell submits only the
# first 5 entries of each top-20 list.
LABELS_PER_ROW = 5


def iter_sessions(path):
    """Yield one parsed JSON object per non-blank line of a JSON Lines file.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Yields:
        The value decoded by ``json.loads`` for each non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding so decoding does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            # A blank (e.g. trailing) line would make json.loads raise.
            if line.strip():
                yield json.loads(line)


def count_aids_by_type(path, event_types=EVENT_TYPES):
    """Count how often each ``aid`` occurs, separately per event type.

    Args:
        path: Path of a JSON Lines file whose records have an ``events`` list
            of dicts carrying ``aid`` and ``type`` keys.
        event_types: Event type names to count; events of any other type are
            ignored.

    Returns:
        A dict mapping each name in ``event_types`` to a ``Counter`` of aids.
    """
    counts = {event_type: Counter() for event_type in event_types}
    for session in iter_sessions(path):
        for event in session['events']:
            counter = counts.get(event['type'])
            if counter is not None:
                counter[event['aid']] += 1
    return counts


def top_aids(counter, k):
    """Return the ``k`` most frequent keys of ``counter``, most frequent first."""
    return [aid for aid, _ in counter.most_common(k)]


def build_predictions(session_ids, labels_by_type):
    """Build the submission rows ``<session>_<type>,<space separated labels>``.

    Args:
        session_ids: Iterable of session identifiers.
        labels_by_type: Mapping of event type name to the list of labels
            predicted for that type; its iteration order fixes the row order
            within a session.

    Returns:
        A list of row strings, one per (session, event type) pair.
    """
    # The label text is identical for every session, so join it once per
    # event type instead of once per output row.
    label_text = {
        event_type: " ".join(map(str, labels))
        for event_type, labels in labels_by_type.items()
    }
    return [
        f"{session_id}_{event_type},{text}"
        for session_id in session_ids
        for event_type, text in label_text.items()
    ]


def write_submission(path, rows):
    """Write the ``session_type,labels`` header followed by one row per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write("session_type,labels\n")
        f.writelines(row + "\n" for row in rows)


# In a notebook kernel __name__ is "__main__", so the pipeline still runs when
# the cell is executed; the guard only keeps it from running on import.
if __name__ == "__main__":
    # Step 1: Count frequencies in the train dataset
    print("Reading and processing train.jsonl...")
    counts = count_aids_by_type(train_file)
    clicks_count = counts['clicks']
    carts_count = counts['carts']
    orders_count = counts['orders']

    # Get top 20 items for each type
    top_20_clicks = top_aids(clicks_count, TOP_K)
    top_20_carts = top_aids(carts_count, TOP_K)
    top_20_orders = top_aids(orders_count, TOP_K)

    print("Top 20 items for clicks:", top_20_clicks)
    print("Top 20 items for carts:", top_20_carts)
    print("Top 20 items for orders:", top_20_orders)

    # Step 2: Create predictions based on test.jsonl
    print("Reading and processing test.jsonl...")
    predictions = build_predictions(
        (session['session'] for session in iter_sessions(test_file)),
        {
            'clicks': top_20_clicks[:LABELS_PER_ROW],
            'carts': top_20_carts[:LABELS_PER_ROW],
            'orders': top_20_orders[:LABELS_PER_ROW],
        },
    )

    # Step 3: Write predictions to submission.csv
    print(f"Writing predictions to {output_file}...")
    write_submission(output_file, predictions)

    # Step 4: Validate submission file by reading it back and comparing the
    # parsed row count with the number of rows that were written.
    submission = pd.read_csv(output_file)
    expected_rows = len(predictions)
    actual_rows = submission.shape[0]

    if actual_rows != expected_rows:
        print(f"Error: Submission file has {actual_rows} rows, but {expected_rows} rows are expected.")
    else:
        print(f"Submission file is valid with {actual_rows} rows.")

    print("Sample rows from submission:")
    print(submission.head())


Kết quả: 0.00252 (5) - 0.00702 (20)

In [None]:
import pandas as pd
import json
from collections import Counter
import os

# Step 1: Input and output file locations
train_file = '/kaggle/input/otto-recommender-system/train.jsonl'
test_file = '/kaggle/input/otto-recommender-system/test.jsonl'
output_file = '/kaggle/working/submission.csv'

# Event types handled by this script, in the order rows are written per session.
EVENT_TYPES = ('clicks', 'carts', 'orders')
# Number of most frequent items predicted for every session and event type.
TOP_K = 20


def iter_sessions(path):
    """Yield one parsed JSON object per non-blank line of a JSON Lines file.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Yields:
        The value decoded by ``json.loads`` for each non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding so decoding does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            # A blank (e.g. trailing) line would make json.loads raise.
            if line.strip():
                yield json.loads(line)


def count_aids_by_type(path, event_types=EVENT_TYPES):
    """Count how often each ``aid`` occurs, separately per event type.

    Args:
        path: Path of a JSON Lines file whose records have an ``events`` list
            of dicts carrying ``aid`` and ``type`` keys.
        event_types: Event type names to count; events of any other type are
            ignored.

    Returns:
        A dict mapping each name in ``event_types`` to a ``Counter`` of aids.
    """
    counts = {event_type: Counter() for event_type in event_types}
    for session in iter_sessions(path):
        for event in session['events']:
            counter = counts.get(event['type'])
            if counter is not None:
                counter[event['aid']] += 1
    return counts


def top_aids(counter, k):
    """Return the ``k`` most frequent keys of ``counter``, most frequent first."""
    return [aid for aid, _ in counter.most_common(k)]


def build_predictions(session_ids, labels_by_type):
    """Build the submission rows ``<session>_<type>,<space separated labels>``.

    Args:
        session_ids: Iterable of session identifiers.
        labels_by_type: Mapping of event type name to the list of labels
            predicted for that type; its iteration order fixes the row order
            within a session.

    Returns:
        A list of row strings, one per (session, event type) pair.
    """
    # The label text is identical for every session, so join it once per
    # event type instead of once per output row.
    label_text = {
        event_type: " ".join(map(str, labels))
        for event_type, labels in labels_by_type.items()
    }
    return [
        f"{session_id}_{event_type},{text}"
        for session_id in session_ids
        for event_type, text in label_text.items()
    ]


def write_submission(path, rows):
    """Write the ``session_type,labels`` header followed by one row per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write("session_type,labels\n")
        f.writelines(row + "\n" for row in rows)


# In a notebook kernel __name__ is "__main__", so the pipeline still runs when
# the cell is executed; the guard only keeps it from running on import.
if __name__ == "__main__":
    # Read train.jsonl and count item frequency for each action type
    print("Đang đọc và xử lý file train.jsonl...")
    counts = count_aids_by_type(train_file)
    clicks_count = counts['clicks']
    carts_count = counts['carts']
    orders_count = counts['orders']

    # Step 2: Take the 20 most popular items for each action type
    top_20_clicks = top_aids(clicks_count, TOP_K)
    top_20_carts = top_aids(carts_count, TOP_K)
    top_20_orders = top_aids(orders_count, TOP_K)

    print("Top 20 sản phẩm cho clicks:", top_20_clicks)
    print("Top 20 sản phẩm cho carts:", top_20_carts)
    print("Top 20 sản phẩm cho orders:", top_20_orders)

    # Step 3: Read test.jsonl and create the predictions for each action type
    print("Đang đọc và xử lý file test.jsonl...")
    predictions = build_predictions(
        (session['session'] for session in iter_sessions(test_file)),
        {
            'clicks': top_20_clicks,
            'carts': top_20_carts,
            'orders': top_20_orders,
        },
    )

    # Step 4: Write the result to submission.csv
    print(f"Đang ghi kết quả ra file {output_file}...")
    write_submission(output_file, predictions)

    print("Dự đoán hoàn tất! File submission.csv đã được tạo.")

    # Check the submission file by reading it back
    submission = pd.read_csv(output_file)
    print(submission.head())
    print("File submission.csv đã sẵn sàng để nộp!")


Đang đọc và xử lý file train.jsonl...
Top 20 sản phẩm cho clicks: [1460571, 108125, 29735, 485256, 1733943, 184976, 832192, 1502122, 554660, 1603001, 986164, 166037, 322370, 1236775, 231487, 959208, 332654, 1196256, 95488, 620545]
Top 20 sản phẩm cho carts: [485256, 152547, 33343, 166037, 1733943, 231487, 29735, 1022566, 832192, 544144, 554660, 322370, 1562705, 986164, 1083665, 332654, 1629608, 1236775, 756588, 613493]
Top 20 sản phẩm cho orders: [231487, 166037, 1733943, 1445562, 1022566, 801774, 1629608, 756588, 332654, 1603001, 409620, 1257293, 1125638, 986164, 1083665, 450505, 544144, 1025795, 125278, 29735]
Đang đọc và xử lý file test.jsonl...


Đang đọc và xử lý file train.jsonl...
Top 20 sản phẩm cho clicks: [1460571, 108125, 29735, 485256, 1733943, 184976, 832192, 1502122, 554660, 1603001, 986164, 166037, 322370, 1236775, 231487, 959208, 332654, 1196256, 95488, 620545]
Top 20 sản phẩm cho carts: [485256, 152547, 33343, 166037, 1733943, 231487, 29735, 1022566, 832192, 544144, 554660, 322370, 1562705, 986164, 1083665, 332654, 1629608, 1236775, 756588, 613493]
Top 20 sản phẩm cho orders: [231487, 166037, 1733943, 1445562, 1022566, 801774, 1629608, 756588, 332654, 1603001, 409620, 1257293, 1125638, 986164, 1083665, 450505, 544144, 1025795, 125278, 29735]

### Nhận xét

### Giải thích cách triển khai (dành cho học sinh cấp 2):

Chương trình này làm việc với **hệ thống gợi ý sản phẩm trực tuyến** và thực hiện các bước sau:

---

### **1. Đọc dữ liệu:**
- **Dữ liệu đầu vào** nằm trong hai tệp:
  - `train.jsonl`: Dùng để học cách gợi ý sản phẩm từ lịch sử hoạt động.
  - `test.jsonl`: Dùng để dự đoán sản phẩm người dùng quan tâm.
- Mỗi dòng trong tệp là một đoạn dữ liệu chứa:
  - **Session (phiên):** Người dùng lướt web trong một khoảng thời gian.
  - **Events (sự kiện):** Các hành động của người dùng, gồm:
    - **clicks**: Nhấn vào sản phẩm.
    - **carts**: Thêm sản phẩm vào giỏ hàng.
    - **orders**: Đặt mua sản phẩm.

---

### **2. Đếm tần suất sản phẩm:**
- Mỗi sản phẩm được đánh dấu bằng một mã số (**aid**).
- Chương trình dùng **Counter** (công cụ đếm):
  - Để xem sản phẩm nào được **click**, **thêm vào giỏ hàng**, hoặc **đặt hàng** nhiều nhất.
- **Mục tiêu:** Tìm ra **top 20 sản phẩm phổ biến nhất** cho từng loại hành động.

---

### **3. Dự đoán sản phẩm:**
- Với mỗi **phiên** trong `test.jsonl`:
  - Chương trình sẽ dự đoán:
    - 20 sản phẩm được click nhiều nhất.
    - 20 sản phẩm được thêm vào giỏ hàng nhiều nhất.
    - 20 sản phẩm được đặt hàng nhiều nhất.
- **Lý do chọn top 20:** Dự đoán rằng người dùng sẽ quan tâm nhiều hơn đến những sản phẩm phổ biến.

---

### **4. Ghi kết quả:**
- **Output:** File `submission.csv` chứa các dự đoán.
- Cấu trúc file:
  - **session_type:** Tên phiên + hành động (clicks, carts, orders).
  - **labels:** Danh sách 20 sản phẩm được dự đoán.

---

### **Lý do chương trình hoạt động tốt:**
- **Đơn giản nhưng hiệu quả:** Dựa trên dữ liệu lịch sử để chọn sản phẩm phổ biến.
- **Nhanh chóng:** Chỉ cần đếm và sắp xếp.
- **Phù hợp cho bài toán lớn:** Vì không cần tính toán phức tạp.

---

### **Ví dụ minh họa:**

1. **Dữ liệu mẫu trong `train.jsonl`:**
   ```json
   {"session": "A", "events": [{"aid": 101, "type": "clicks"}, {"aid": 102, "type": "carts"}]}
   {"session": "B", "events": [{"aid": 103, "type": "clicks"}, {"aid": 101, "type": "orders"}]}
   ```

2. **Kết quả đếm tần suất:**
   - **clicks:** {101: 1, 103: 1}
   - **carts:** {102: 1}
   - **orders:** {101: 1}

3. **Dự đoán mẫu cho `test.jsonl`:**
   ```json
   {"session": "C", "events": []}
   ```

   **Dự đoán:**
   ```
   C_clicks,101 103
   C_carts,102
   C_orders,101
   ```

4. **File output (`submission.csv`):**
   ```csv
   session_type,labels
   C_clicks,101 103
   C_carts,102
   C_orders,101
   ```

---

Chương trình này giống như việc đoán món ăn phổ biến nhất ở căn tin: **xem ai thích món gì nhất** để phục vụ đúng nhu cầu! 🎓