In [1]:
import os
import sys
import re
import json
import time
import spacy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import praw

import google.generativeai as genai # Google Gemini API

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from datetime import datetime
from openai import OpenAI # OpenAI API
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation # LDA Topic Modeling

import gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai # Google Gemini API


### CR Extraction (Customer Requirements)

In [13]:
# Load the tagged Reddit review dataset

df = pd.read_excel(io = './reddit_reviews_tagged_enhanced_decompose_2.xlsx')

df.head(n = 5)

Unnamed: 0,Type,Body,Date,URL,cleaned_Body,word_count,is_valid,purpose_classification,evidence_snippet,reasoning
0,Comment,Any recommendations for laptops mainly for onl...,2022-03-17 10:25:31,https://www.reddit.com/r/laptops/comments/7eon...,recommendation laptop online college course to...,7,True,['Academic/Education'],recommendation laptop online college course,The review mentions use for an 'online college...
1,Comment,I found a lenovo gaming laptop with a 3060 GPU...,2022-09-11 17:39:45,https://www.reddit.com/r/laptops/comments/7eon...,find lenovo game laptop amd ryzen cpu fpr come...,15,True,['Gaming/Hobby'],find lenovo game laptop amd ryzen cpu,The reviewer is looking for a Lenovo 'game lap...
2,Comment,Alright I have narrowed it down to TWO laptops...,2023-03-16 00:32:41,https://www.reddit.com/r/laptops/comments/7eon...,narrow laptop haha travel year want start edit...,35,True,['Professional'],want start edit photo video... able handle ado...,The reviewer explicitly mentions starting phot...
3,Comment,I am looking at a new laptop and I see ads for...,2024-08-18 21:20:12,https://www.reddit.com/r/laptops/comments/7eon...,look new laptop one run celeron n5100 processo...,26,True,['Everyday/Casual'],want simple basic task house,The reviewer explicitly mentions wanting the l...
4,Comment,My laptop turns off in like 3 minutes once I u...,2024-09-06 04:35:56,https://www.reddit.com/r/laptops/comments/7eon...,laptop turn minute unplug power supply design ...,15,True,['General/Unspecified'],laptop turn minute unplug power supply design ...,The review mentions a functional issue related...


In [14]:
# Strip the list-literal wrapper from purpose_classification,
# e.g. "['Professional']" -> "Professional".
#
# Done as a vectorized, mask-based update instead of a positional loop:
#   * rows are addressed through a boolean mask, so the update hits the right
#     rows even when df does not carry the default 0..n-1 index;
#   * na = False treats missing / non-string cells as "not wrapped" instead of
#     failing on .startswith.

purpose_labels = df['purpose_classification']
is_wrapped = (
    purpose_labels.str.startswith('[', na = False)
    & purpose_labels.str.endswith(']', na = False)
)
# [2:-2] drops the leading "['" and the trailing "']".
df.loc[is_wrapped, 'purpose_classification'] = purpose_labels[is_wrapped].str[2:-2]

In [18]:
# Number of reviews per purpose label
df.loc[:, 'purpose_classification'].value_counts()

purpose_classification
General/Unspecified    406
Gaming/Hobby           165
Professional           113
Everyday/Casual         91
Academic/Education      49
Name: count, dtype: int64

In [44]:
# ------------------------------------------------------------------------
# [1] Prompt builder for per-purpose Customer Requirement extraction
# ------------------------------------------------------------------------

def get_cr_extraction_prompt(purpose):
    """
    Build the requirement-extraction system prompt for one usage purpose.

    The purpose label is substituted into every {purpose} slot of the
    template; doubled braces are escapes that render as the literal braces
    of the JSON output schema.
    """
    prompt_template = """
        ### Identity: The Voice of the "{purpose}" User Group
        You are the representative of the "{purpose}" community. You understand their lifestyle, daily challenges, and the specific performance standards they demand from a laptop. 

        ### Mission:
        Read the following review segments. Your goal is to advocate for their needs by translating their raw experiences into clear, actionable Customer Requirements (CR) that product designers can act upon.

        ### Analysis Lens:
        - As a "{purpose}" user, what is a "deal-breaker" for you in these reviews?
        - What specific pain points mentioned by others do you deeply resonate with?
        - How should a laptop be engineered to perfectly satisfy someone in your position?

        ### Guidelines:
        1. User-Centric Translation: Convert subjective complaints (e.g., "too heavy") into objective requirements (e.g., "Weight must be under 1.5kg for portability").
        2. No Redundancy: If multiple peers mention the same issue, consolidate them into one high-impact requirement.

        ### Output Schema (JSON only):
        {{
            "representative_persona": "Briefly describe your identity as a {purpose} user",
            "requirements": [
                {{
                    "requirement": "The standardized requirement from your perspective",
                    "evidence_summary": "The core sentiment from the reviews"
                }}
            ]
        }}
    """
    return prompt_template.format(purpose = purpose)

# ------------------------------------------------------------------------
# [2] CR extraction function
# ------------------------------------------------------------------------

def _join_review_segments(review_segments):
    """Return the usable review texts joined one per line.

    None and float NaN entries are skipped; every other value is converted
    with str() so a stray non-string entry cannot make str.join raise
    TypeError.
    """
    if review_segments is None:
        return ""
    usable = []
    for segment in review_segments:
        # NaN is the only float that is not equal to itself.
        is_nan = isinstance(segment, float) and segment != segment
        if segment is None or is_nan:
            continue
        usable.append(str(segment))
    return "\n".join(usable)


def run_cr_extraction(purpose, review_segments, client, model = "gpt-5.1", service_tier = "flex"):
    """
    Call the LLM to extract customer requirements from review texts.

    Args:
        purpose: Usage-purpose label; used for the system prompt and echoed
            in the user prompt.
        review_segments: Iterable of review texts. None / NaN entries are
            ignored and non-string entries are converted with str().
        client: Object exposing chat.completions.create(...), e.g. the
            OpenAI client created elsewhere in this notebook.
        model: Model name passed to the API (default keeps the original value).
        service_tier: Service tier passed to the API (default keeps the
            original value).

    Returns:
        The message content of the first choice in the response, or None when
        there is no review text to send or the API call raised an exception
        (the error is printed, not re-raised).
    """
    system_prompt = get_cr_extraction_prompt(purpose)

    # Joined outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError before Python 3.12.
    joined_reviews = _join_review_segments(review_segments)
    if not joined_reviews.strip():
        # Nothing to analyse; avoid an API call on an empty prompt.
        print(f"No review text for purpose '{purpose}'; skipping LLM call.")
        return None

    user_prompt = f"""
        Target Purpose : "{purpose}"
        Review Segments : \n{joined_reviews}
    """

    try:
        response = client.chat.completions.create(
            model = model,
            service_tier = service_tier,
            messages = [
                {"role" : "system", "content" : system_prompt},
                {"role" : "user", "content" : user_prompt}
            ], temperature = 0,
            response_format = {"type" : "json_object"}
        )

        return response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller carry on.
        print(f"Error during LLM call for purpose '{purpose}': {e}")
        return None

In [None]:
# Configure the OpenAI client.
# The key is read from the OPENAI_API_KEY environment variable so that it is
# never stored in (or committed with) the notebook.

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
if not OPENAI_API_KEY:
    # Fail here rather than on every later request with an empty key.
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")
client = OpenAI(api_key = OPENAI_API_KEY)

In [None]:
# Smoke test: run the extraction for the 'Professional' purpose only

professional = df[df['purpose_classification'] == 'Professional']
professional_result = run_cr_extraction(
    purpose = 'Professional',
    review_segments = professional['Body'].tolist(),
    client = client
)

# run_cr_extraction returns None when the LLM call failed; json.loads(None)
# would raise TypeError, so only parse and export an actual response.
if professional_result is None:
    print("CR extraction for 'Professional' returned no result; nothing exported.")
else:
    data = json.loads(professional_result)
    pd.DataFrame(data['requirements']).to_excel('./professional_cr_extraction_results.xlsx', index = False)

In [None]:
# ------------------------------------------------------------------------
# [3] Run CR extraction for every purpose
# ------------------------------------------------------------------------

# Per-purpose frames are collected in a list and concatenated once at the end,
# instead of growing result_df with pd.concat on every iteration.
cr_frames = []
# dropna(): a missing label can never match the equality filter below, so
# iterating it would only produce an empty review list.
for purpose in df['purpose_classification'].dropna().unique():
    if purpose == 'General/Unspecified':
        continue
    purpose_reviews = df[df['purpose_classification'] == purpose]
    cr_result = run_cr_extraction(
        purpose = purpose,
        review_segments = purpose_reviews['Body'].tolist(),
        client = client
    )

    if cr_result:
        try:
            cr_data = json.loads(cr_result)
            cr_df = pd.DataFrame(cr_data['requirements'])
        except json.JSONDecodeError as e:
            print(f"JSON decoding error for purpose '{purpose}': {e}")
        except (KeyError, TypeError, ValueError) as e:
            # Valid JSON without a usable 'requirements' list: report it and
            # keep the results already collected for the other purposes.
            print(f"Unexpected response structure for purpose '{purpose}': {e!r}")
        else:
            cr_df['purpose'] = purpose
            cr_frames.append(cr_df)
    time.sleep(2)  # pause between calls to stay under the API rate limit

result_df = pd.concat(cr_frames, ignore_index=True) if cr_frames else pd.DataFrame()

# result_df.to_excel('./all_purposes_cr_extraction_results_01.xlsx', index = False)