In [2]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import urllib.request
import gzip
import shutil
import json

# Source archives (URLs point at the McAuley Lab "amazon_2023" Video_Games files).
url1 = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/Video_Games.jsonl.gz"
url2 = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Video_Games.jsonl.gz"

# Local file names, relative to the current working directory.
compressed_review = "Video_Games.jsonl.gz"
compressed_metadata = "meta_Video_Games.jsonl.gz"
review = "Video_Games.jsonl"
metadata = "meta_Video_Games.jsonl"


def _download_if_missing(url, dest):
    """Download `url` to `dest` unless `dest` already exists.

    The payload is written to `dest + ".part"` and renamed only after the
    transfer finishes, so an interrupted download never leaves a truncated
    file that a later run would mistake for a complete one.
    Delete `dest` manually to force a fresh download.
    """
    if os.path.exists(dest):
        return
    partial = dest + ".part"
    urllib.request.urlretrieve(url, partial)
    os.replace(partial, dest)


def _gunzip_if_missing(src, dest):
    """Decompress gzip file `src` into `dest` unless `dest` already exists.

    Uses the same write-then-rename scheme as `_download_if_missing` so a
    partially extracted file is never left under the final name.
    """
    if os.path.exists(dest):
        return
    partial = dest + ".part"
    with gzip.open(src, 'rb') as f_in, open(partial, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial, dest)


print("파일 다운로드 중...")
_download_if_missing(url1, compressed_review)
_download_if_missing(url2, compressed_metadata)
print("✅ 다운로드 완료")

print()

print("압축 해제 중...")
_gunzip_if_missing(compressed_review, review)
_gunzip_if_missing(compressed_metadata, metadata)
print("✅ 압축 해제 완료")

파일 다운로드 중...
✅ 다운로드 완료

압축 해제 중...
✅ 압축 해제 완료


In [4]:
# Preview the first two review records, pretty-printed as JSON.
with open(review, "r", encoding="utf-8") as review_file:
    # zip() advances range(2) first, so at most two lines are consumed;
    # a file with fewer lines simply ends the loop early.
    for _, raw_line in zip(range(2), review_file):
        record = json.loads(raw_line)
        print(json.dumps(record, indent=2, ensure_ascii=False))

{
  "rating": 4.0,
  "title": "It’s pretty sexual. Not my fav",
  "text": "I’m playing on ps5 and it’s interesting.  It’s unique, massive, and has a neat story.  People are freaking out angry about this game.  I don’t think it’s a top 10 game but it’s definitely a good game on ps5 (played at launch).",
  "images": [],
  "asin": "B07DJWBYKP",
  "parent_asin": "B07DK1H3H5",
  "user_id": "AGCI7FAH4GL5FI65HYLKWTMFZ2CQ",
  "timestamp": 1608186804795,
  "helpful_vote": 0,
  "verified_purchase": true
}
{
  "rating": 5.0,
  "title": "Good. A bit slow",
  "text": "Nostalgic fun.  A bit slow.  I hope they don’t stretch it out too far.  It’s good tho",
  "images": [],
  "asin": "B00ZS80PC2",
  "parent_asin": "B07SRWRH5D",
  "user_id": "AGCI7FAH4GL5FI65HYLKWTMFZ2CQ",
  "timestamp": 1587051114941,
  "helpful_vote": 1,
  "verified_purchase": false
}


In [5]:
# Preview the first item-metadata record, pretty-printed as JSON.
with open(metadata, "r", encoding="utf-8") as meta_file:
    first_line = meta_file.readline()
    # readline() returns an empty string at EOF, so an empty file prints nothing.
    if first_line:
        record = json.loads(first_line)
        print(json.dumps(record, indent=2, ensure_ascii=False))

{
  "main_category": "Video Games",
  "title": "Dash 8-300 Professional Add-On",
  "average_rating": 5.0,
  "rating_number": 1,
  "features": [
    "Features Dash 8-300 and 8-Q300 ('Q' rollout livery)",
    "Airlines - US Airways, South African Express, Bahamasair, Augsburg Airways, Lufthansa Cityline, British Airways (Union Jack), British European, FlyBe, Intersky, Wideroe, Iberia, Tyrolean, QantasLink, BWIA",
    "Airports include - London City, Frankfurt, Milan and Amsterdam Schipol",
    "Includes PSS PanelConfig and LoadEdit tools"
  ],
  "description": [
    "The Dash 8-300 Professional Add-On lets you pilot a real commuter special. Fly two versions of the popular Dash 8-300 in a total of 17 different liveries. The Dash 8-300 is one of the most popular short-haul aircraft available and this superbly modelled version from acclaimed aircraft developers PSS is modelled in two versions with a total of 17 different liveries. The package also includes scenery for three European airport

In [6]:
import pandas as pd

# Select the products whose `rating_number` is at least MIN_RATINGS and
# store their id, title and category list in a DataFrame.
MIN_RATINGS = 1000

asin = []      # holds each record's `parent_asin`
title = []
category = []  # holds each record's `categories` value

with open(metadata, "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        # .get() yields None when the key is missing or the JSON value is null;
        # coalesce to 0 so the comparison cannot raise TypeError on such records.
        rating_count = data.get("rating_number") or 0
        if rating_count >= MIN_RATINGS:
            asin.append(data.get("parent_asin"))
            title.append(data.get("title"))
            category.append(data.get("categories"))

metadata_df = pd.DataFrame({
    'asin': asin,
    'title': title,
    'category': category
})

In [7]:
# Display the first rows of the selected product metadata.
metadata_df.head()

Unnamed: 0,asin,title,category
0,B07H93H878,eXtremeRate Soft Touch Top Shell Front Housing...,"[Video Games, Xbox One, Accessories, Faceplate..."
1,B07DL353F8,Tales of Vesperia - Definitive Edition - PlayS...,"[Video Games, PlayStation 4, Games]"
2,B08WLJ8WF2,eXtremeRate Replacement D-pad R1 L1 R2 L2 Trig...,"[Video Games, PlayStation 5, Accessories, Acce..."
3,B09SRXBT8Y,Nintendo Switch Sports (Nintendo Switch) (Euro...,"[Video Games, Nintendo Switch, Games]"
4,B09CG15F86,Razer Doubleshot PBT Keycap Upgrade Set for Me...,"[Video Games, PC, Accessories, Gaming Keyboards]"


In [8]:
# Collect (user_id, parent_asin, rating) for every review of a selected product.
#
# The lookup set is built from metadata_df rather than from the `asin` list:
# `asin` is rebound a few lines below to the per-review list, so on a re-run of
# this cell `set(asin)` would be built from review rows instead of the
# metadata selection.
asin_set = set(metadata_df['asin'])

user_id = []
asin = []
rating = []

with open(review, "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        parent_asin = data.get("parent_asin")
        if parent_asin in asin_set:
            user_id.append(data.get("user_id"))
            asin.append(parent_asin)
            rating.append(data.get("rating"))

review_df = pd.DataFrame({
    'user_id': user_id,
    'asin': asin,
    'rating': rating
})

In [9]:
# Display the first rows of the filtered review table.
review_df.head()

Unnamed: 0,user_id,asin,rating
0,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,B07DK1H3H5,4.0
1,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,B07SRWRH5D,5.0
2,AFTC6ZR5IKNRDG5JCPVNVMU3XV2Q,B0BCHWZX95,5.0
3,AHXSBZT52TCPZUBVCBRICTHWUCBA,B073SC6V1D,3.0
4,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,B004RMK57U,5.0


In [13]:
# Keep only the reviews written by users who reviewed at least 20 distinct products.
items_by_user = review_df.groupby('user_id')['asin']
user_item_counts = items_by_user.nunique()  # distinct `asin` values per user

# Index of user_ids that meet the threshold.
qualified_users = user_item_counts.loc[user_item_counts.ge(20)].index

# Row filter: keep a review when its author is in the qualified set.
filtered_df = review_df.loc[review_df['user_id'].isin(qualified_users)]

filtered_df.head()

Unnamed: 0,user_id,asin,rating
469,AFWRGOGF4AI2IHRX7KZ2IYL63RXA,B09GM4283G,5.0
470,AFWRGOGF4AI2IHRX7KZ2IYL63RXA,B0BZFWMYSQ,5.0
471,AFWRGOGF4AI2IHRX7KZ2IYL63RXA,B08GSL374K,5.0
472,AFWRGOGF4AI2IHRX7KZ2IYL63RXA,B08588JHQH,5.0
473,AFWRGOGF4AI2IHRX7KZ2IYL63RXA,B07V2BBMK4,3.0


In [14]:
# Write both tables as CSV (without the index column) under the mounted Drive folder.
output_dir = '/content/drive/MyDrive/rec_test'

# to_csv does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(output_dir, exist_ok=True)

filtered_df.to_csv(os.path.join(output_dir, 'review.csv'), index=False)
metadata_df.to_csv(os.path.join(output_dir, 'metadata.csv'), index=False)