## Data Collection and Preprocessing

- 데이터 수집부터 전처리, Tagging, Topic Modeling까지의 전체 과정을 테스트하는 코드입니다.

In [None]:
import os
import sys
import re
import json
import time

import pandas as pd
import praw

import google.generativeai as genai # Google Gemini API

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from datetime import datetime
from openai import OpenAI # OpenAI API

In [95]:
# Environment settings

# Reddit API credentials are read from the environment so that secrets are
# never stored in the notebook itself. A missing variable raises KeyError
# naming the variable that must be set.
# NOTE(review): the credentials that used to be hard-coded in this cell should
# be treated as leaked and rotated.
client_id = os.environ["REDDIT_CLIENT_ID"] # reddit API
client_secret = os.environ["REDDIT_CLIENT_SECRET"] # reddit API
user_agent = os.environ.get("REDDIT_USER_AGENT", "mr_xuan_") # reddit API

# API client setup
reddit = praw.Reddit(
    client_id = client_id,
    client_secret = client_secret,
    user_agent = user_agent
)

### Data Collection

In [96]:
# Data collection settings
from datetime import timezone  # cell-local import: needed for UTC-aware dates

target_subreddit = "consoles"
search_keyword = "nintendo switch"
post_limit = 1

# Column layout of the collected frame. It is passed to DataFrame explicitly
# so the columns exist even when the search yields no comments at all.
review_columns = [
    "Type", "ID", "Parent_ID", "Author", "User_ID",
    "Body", "Score", "Date", "URL",
]
review_lst = []

for submission in reddit.subreddit(target_subreddit).search(search_keyword, limit=post_limit):
    # Resolve the "more comments" placeholders without a limit so that every
    # comment of the submission is collected.
    submission.comments.replace_more(limit=None)

    for comment in submission.comments.list():
        comment_info = {
            "Type": "Comment",
            "ID": comment.id,
            "Parent_ID": comment.parent_id,
            # A comment whose author is gone has `author` set to None.
            "Author" : comment.author.name if comment.author else "[deleted]",
            "User_ID" : getattr(comment, "author_fullname", "N/A"),
            "Body": comment.body,
            "Score": comment.score,
            # `created_utc` is a UTC epoch; convert it explicitly in UTC so the
            # stored date does not depend on the machine's local timezone.
            "Date": datetime.fromtimestamp(comment.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
            "URL": f"https://www.reddit.com{comment.permalink}"
        }
        review_lst.append(comment_info)

df = pd.DataFrame(review_lst, columns=review_columns)

# df.to_excel('./reddit_reviews.xlsx')

### NLP preprocessing

In [97]:
def clean_text(text):
    """Strip URLs and Reddit user mentions from *text* and normalise whitespace.

    Args:
        text: Raw comment body. Any non-string value (e.g. NaN/None) is
            treated as an empty comment.

    Returns:
        The cleaned, single-line string ("" for non-string input).
    """
    if not isinstance(text, str):
        return ""
    # URLs: anchored on a word boundary and on "://" / "www." so that ordinary
    # words containing "www" or "http" (e.g. "awwww") are left intact.
    text = re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)
    # User mentions ("u/name" or "/u/name"): the lookbehind keeps words such as
    # "menu/settings" from being truncated at their inner "u/".
    text = re.sub(r'(?<!\w)/?u/\S+', '', text)
    # \s+ also covers newlines, so line breaks and repeated spaces collapse
    # into single spaces in one pass.
    return re.sub(r'\s+', ' ', text).strip()

def nlp_preprocess(df, min_words=4):
    """Filter out unusable comments and add cleaned-text columns.

    Rows are dropped when the author is "AutoModerator", when the body is
    exactly "[deleted]" or "[removed]", or when the body contains
    "mass deleted" (case-insensitive). The remaining bodies are cleaned with
    ``clean_text`` and rows with fewer than *min_words* words are dropped.

    Args:
        df: DataFrame with at least the columns "Author" and "Body".
        min_words: Minimum number of whitespace-separated words a cleaned
            body must have to be kept.

    Returns:
        A new DataFrame (the input is not modified) with the added columns
        "cleaned_Body" and "word_count". The original index labels are kept.
    """
    body = df['Body']
    keep = (
        (df['Author'] != 'AutoModerator')
        & ~body.isin(['[deleted]', '[removed]'])
        # Literal, case-insensitive substring match.
        & ~body.astype(str).str.contains("mass deleted", case=False, regex=False)
    )

    # Work on an explicit copy: assigning columns to a filtered slice triggers
    # pandas' SettingWithCopyWarning and leaves the write target ambiguous.
    df = df[keep].copy()

    df['cleaned_Body'] = df['Body'].apply(clean_text)

    # Drop comments that are too short to carry any signal.
    df['word_count'] = df['cleaned_Body'].apply(lambda text: len(str(text).split()))
    return df[df['word_count'] >= min_words]

In [98]:
# Replace the raw comment frame with its filtered / cleaned version.
df = df.pipe(nlp_preprocess)

# df.to_excel('check_reddit_reviews.xlsx', index = False)

### LLM-Tagging

In [None]:
# Basic setup

# API keys are taken from the environment when available so that real keys do
# not have to be pasted into the notebook; the original "API_KEY" placeholder
# is kept as the fallback value.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "API_KEY") # Google Generative AI API Key
genai.configure(api_key=GOOGLE_API_KEY)

# Gemini model configured to answer with a JSON payload.
model = genai.GenerativeModel(
    'gemini-2.5-flash-lite',
    generation_config={
        'response_mime_type' : 'application/json'
        }
)

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "API_KEY") # OpenAI API Key
client = OpenAI(api_key = OPENAI_API_KEY)

In [100]:
def get_gemini_tags(review_text):
    """Ask the module-level Gemini ``model`` to profile a single review.

    The review is embedded into a fixed instruction prompt that requests a
    JSON object describing validity, usage context, persona, key priority,
    attributes and sentiment.

    Args:
        review_text: The (cleaned) review text to analyse.

    Returns:
        The raw response text from the model, or None when the request (or
        reading the response text) raises; the error is printed.
    """
    prompt = f"""
        You are an expert User Researcher for a new handheld gaming console.
        Analyze the following user review to build a "User Profile" for clustering.

        Review: "{review_text}"

        Task:
        1. **Noise Filter**: If the review is vague, meaningless (e.g., "Wizardry man", "Just wow"), or irrelevant, set "is_valid" to false.
        2. **Extraction & Inference**: Based on the review, infer the user's characteristics.
            - **Usage Context**: Where/How do they use it? (e.g., Commuting, Bed, Docked mode, Travel, General Gaming). If unknown, "Unknown".
            - **User Persona**: What kind of gamer are they? (e.g., "Hardcore/Tech-savvy" if they talk about specs/FPS, "Casual" if they talk about fun/comfort, "Value-oriented" if about price).
            - **Key Priority**: What is the ONE thing they value most? (e.g., Performance, Portability, Battery, Price, Graphics).
            - **Attributes**: Specific technical specs mentioned (e.g., "OLED Screen", "Fan noise").

        Output format (JSON):
        {{
            "is_valid": boolean,
            "usage_context": "String (e.g., Commuting)",
            "user_persona": "String (e.g., Hardcore Gamer)",
            "key_priority": "String (e.g., Frame Rate)",
            "attributes": ["List", "of", "specs"],
            "sentiment": "Positive/Negative/Neutral"
        }}
    """

    # Both the request and the `.text` access stay inside the try block:
    # either one failing is reported and turned into a None result.
    try:
        gemini_reply = model.generate_content(prompt)
        tagged_json = gemini_reply.text
    except Exception as e:
        print(f"Error : {e}")
        tagged_json = None
    return tagged_json

def get_openai_tags(review_text):
    """Ask the module-level OpenAI ``client`` to profile a single review.

    Args:
        review_text: The (cleaned) review text to analyse.

    Returns:
        The message content of the first completion choice (requested as a
        JSON object), or None when the request raises; the error is printed.
    """
    # System prompt: pins down the role and the exact output format.
    system_instruction = """
        You are an expert User Researcher for a new handheld gaming console.
        Your task is to analyze user reviews to build a "User Profile" for clustering.
        
        Output must be a valid JSON object with the following keys:
        1. "is_valid": boolean. Set to false if the review is vague (e.g., "Wizardry man", "lol"), meaningless, or spam.
        2. "usage_context": string. Where/How is it used? (e.g., "Commuting", "Bed", "Docked", "Travel"). If unknown, use "Unknown".
        3. "user_persona": string. Infer the gamer type (e.g., "Hardcore/Tech-savvy", "Casual", "Value-oriented", "Parent").
        4. "key_priority": string. The single most important value for this user (e.g., "Performance", "Portability", "Battery", "Price").
        5. "attributes": list of strings. Specific technical specs mentioned.
        6. "sentiment": string. "Positive", "Negative", or "Neutral".
    """

    # The review's closing quote sits on its own before the literal's closing
    # triple quote. Written as `"{review_text}"""` the three quotes terminate
    # the literal and the review is sent with an unbalanced opening quote.
    user_prompt = f"""
        Review: "{review_text}"
    """

    try:
        response = client.chat.completions.create(
            model = "gpt-4o-mini",
            messages = [
                {"role" : "system", "content" : system_instruction},
                {"role" : "user", "content" : user_prompt}
            ],
            temperature = 0,
            response_format = {'type' : 'json_object'}
        )

        return response.choices[0].message.content
    except Exception as e:
        print(f"Error : {e}")
        return None

In [101]:
# Run the tagging (OpenAI)

print("Enhanced Tagging 시작... (사용자 성향 분석 중)")

# Only a small sample is tagged. Keep a handle on that sample so the tags are
# joined back onto exactly the rows that were sent to the model; concatenating
# against the full `df` would pad the export with untagged rows.
sample_df = df.head(30).reset_index(drop=True)
results = []

for review_text in sample_df['cleaned_Body']:
    json_str = get_openai_tags(review_text)
    if not json_str:
        results.append({"is_valid" : False, "error" : "No Response"})
    else:
        try:
            data = json.loads(json_str)
        except json.JSONDecodeError:
            data = {"is_valid" : False, "error" : "JSON Parse Error"}
        # Each result must be a JSON object to become one row of tag columns;
        # anything else (list, string, number) is recorded as a parse error.
        if not isinstance(data, dict):
            data = {"is_valid" : False, "error" : "JSON Parse Error"}
        results.append(data)

    # Pause between requests.
    time.sleep(1)

df_tag = pd.DataFrame(results)
# Both frames are indexed 0..n-1 in iteration order, so axis=1 concat lines
# each review up with its own tags.
df_tagged = pd.concat([sample_df, df_tag], axis=1)

df_tagged.to_excel('reddit_reviews_tagged.xlsx', index = False)

Enhanced Tagging 시작... (사용자 성향 분석 중)


In [90]:
# Run the tagging (Gemini)

print("Enhanced Tagging 시작... (사용자 성향 분석 중)")

# Test run on the first 30 rows only. The sample is taken by position: the
# index labels of `df` are no longer contiguous after filtering, so comparing
# a label against 30 (or taking it modulo 5) does not count rows.
sample_df = df.head(30).reset_index(drop=True)

# The 429 error recorded for this cell reports a free-tier limit of 10
# requests per minute for this model, so requests are spaced just over 6 s
# apart instead of pausing only once every 5 rows.
seconds_between_requests = 6.5
results = []

for position, review_text in enumerate(sample_df['cleaned_Body']):
    if position % 5 == 0:
        print(f"Preprocessing row {position}...")
    if position > 0:
        time.sleep(seconds_between_requests)

    json_str = get_gemini_tags(review_text)

    if not json_str:
        results.append({"is_valid": False, "error" : "No Response"})
        continue

    try:
        data = json.loads(json_str)
    except json.JSONDecodeError:
        data = {"is_valid": False, "error" : "JSON Error"}
    # Each result must be a JSON object to become one row of tag columns;
    # anything else (list, string, number) is recorded as a JSON error.
    if not isinstance(data, dict):
        data = {"is_valid": False, "error" : "JSON Error"}
    results.append(data)

df_tag = pd.DataFrame(results)
# Both frames are indexed 0..n-1 in iteration order, so axis=1 concat lines
# each review up with its own tags.
df_tagged = pd.concat([sample_df, df_tag], axis=1)

df_tagged.to_excel('reddit_reviews_tagged.xlsx', index = False)

Enhanced Tagging 시작... (사용자 성향 분석 중)
Preprocessing row 0...
Error : 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 10, model: gemini-2.5-flash-lite
Please retry in 46.8812928s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash-lite"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, retry_delay {
  seconds: 46
}
]
Error : 429 You exce