source: https://medium.com/@lucafiaschi/turning-customers-feedback-into-action-an-llm-blueprint-for-app-review-analysis-7f5d39d08f6e

In [1]:
import json
import pandas as pd
import numpy as np

from joblib import Parallel, delayed
import datetime
import time

import openai
import os
from google import genai

from yaml_helper import YamlParser

In [136]:
df = pd.read_csv("data/headhway_review_rating.csv")

In [140]:
4-5: 10114
1-3: 3967

SyntaxError: illegal target for annotation (4240712040.py, line 1)

In [138]:
df.loc[df.rating < 4]

Unnamed: 0,rating,content
0,1,this app is the worst! please avoid it at any ...
1,1,i was charged my membership fee 3 times for on...
2,1,i thought this was an app that would let me si...
3,1,this is a terrible company. they lure you in ...
4,1,content is very superficial .. and then the ty...
...,...,...
14069,1,Poor books.after a few nooksstarts yo think al...
14070,1,Took $120 out after i canceled before the tria...
14071,2,"Pretty sure the ""writers"" are AI. Lots of non-..."
14079,1,Too complicated and too expensive. You have to...


In [3]:
df.loc[df.rating < 4].to_csv("data/headhway_review_rating_sample.csv", index=False)

In [4]:
df = df.loc[df.rating < 4].reset_index(drop=True)

In [103]:
df = pd.read_csv("data/bad_df.csv")

In [104]:
creds = YamlParser("creds/openai.yml").read()
OPENAI_API_KEY = creds["OPENAI_API_KEY"]

GEMINI_API_KEY = YamlParser("creds/gemini.yml").read()["GEMINI_KEY"]

In [105]:
try:
    with open("prompts/message_sentiments_prompt.md", 'r', encoding='utf-8') as file:
        MARKDOWN_PROMPT = file.read()
        # Now the 'markdown_content' variable holds the entire content of the file.
        # You can now print it, process it, etc.
        # For example:
        # print(markdown_content)
except FileNotFoundError:
    print("Error: The file could not be found.")
except Exception as e:
    print(f"An error occurred: {e}")

print(MARKDOWN_PROMPT)

## Classification Instructions  
You are a review classification expert. Given a customer review, analyze it according to the following steps:  
      
### Instructions Details  
    1. Content Screening:  
      - First, check if the review contains any content that should be flagged (inappropriate_content, hate_speech, spam, threat, private_information)  
      - If flagged content is found, note this in the metadata but continue with classification  
    2. Primary Classification:
      - For each relevant review, identify a specific category and subcategory that applies based on the hierarchy table posted below.  
      - Multiple classifications are not allowed. If the review touches multiple areas please select just the category that is most relevant.
    3. Evaluate Sentiment:  
      - Determine the sentiment of the review (positive, negative, neutral, mixed)  
      - If the review is negative, provide a list of key points, keywords
        - keywords must be from list of keyw

In [106]:
def retry_on_error(max_retries=5, sleep_time=5):
    """
    Handy decorator to fight the crashes in the API
    """

    def decorator_retry(func):
        def wrapper(*args, **kwargs):
            retries = 0
            while retries < max_retries:
                try:
                    result = func(*args, **kwargs)
                    return result
                except Exception as e:
                    print(f"An error occurred: {e}")
                    retries += 1
                    time.sleep(sleep_time)
            return None

        return wrapper

    return decorator_retry

In [107]:
# --- Setup Logging ---
import logging

date_string = datetime.datetime.now().date().isoformat()
log_file_path = f"logs/{date_string}.log"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file_path),
        # logging.StreamHandler() # Also print to console
    ]
)

In [108]:
openai.api_key = OPENAI_API_KEY
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# client = openai.OpenAI(
#     api_key=GEMINI_API_KEY,
#     base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
# )
@retry_on_error()
def classify_topic(text, id, number=1, sleep_time=0.001):

    prompt = MARKDOWN_PROMPT%(text)
    response = client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    result = json.loads(response.choices[0].message.content.strip())
    result['content'] = text
    # time.sleep(sleep_time)
    logging.info(f"{number} - {result['content']}")
    return result

In [109]:
a = classify_topic(text="bad app")

In [127]:

@retry_on_error(1,1)
def classify_topic(text, review_id, number=1, sleep_time=0.001):
    client = genai.Client(api_key=GEMINI_API_KEY)
    prompt = MARKDOWN_PROMPT%(text)
    response = client.models.generate_content(
        model="gemini-2.5-flash-lite-preview-06-17", contents=prompt
        ).text

    result = {"response": response}
    result['content'] = text
    result['review_id'] = review_id
    # time.sleep(sleep_time)
    logging.info(f"{number} - {result['content']}")
    return result

In [117]:
a = classify_topic(text="bad app")

An error occurred: classify_topic() missing 1 required positional argument: 'review_id'


In [118]:
def parse_json_str(text):
    return json.loads(text.removeprefix('```json\n').removesuffix('\n```'))

In [119]:
parse_json_str(a['response'])

TypeError: 'NoneType' object is not subscriptable

In [130]:
df = pd.read_csv("data/bad_df.csv")
df['sentence']

0                                   This app is the worst!
1                             Please avoid it at any cost.
2        I’ve been charged a few times without any cons...
3        The content is absolutely not worth the price ...
4        I was charged my membership fee 3 times for on...
                               ...                        
15381    The worse experience with an app ever and a bi...
15382    Some voices are more inherently enjoyable to l...
15383    Also the delivery is paramount to the depth at...
15384    I’ve found none as of yet to be nailing it in ...
15385    Not terrible by any means just not narrator le...
Name: sentence, Length: 15386, dtype: object

In [132]:
res = Parallel(n_jobs=-1, verbose=2)(delayed(classify_topic)(text=row.get('sentence'), review_id=row.get('id'), number=index) for index, row in df.iterrows())

df_result = pd.DataFrame(res)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:   34.4s
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 3257 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 4877 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 5808 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 6821 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 7914 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 9089 tasks      | 

In [None]:
# res = Parallel(n_jobs=1, verbose=2)(delayed(print)("try:\n" + text) for text in df.content)

In [135]:
df_result.to_csv("data/bad_df_evalueted.csv", index=False)

In [134]:
df_result

Unnamed: 0,response,content,review_id
0,"```json\n{\n ""content_flags"": [],\n ""sen...",This app is the worst!,00000056-dae4-2802-81db-b70600000000
1,"```json\n{\n ""content_flags"": [],\n ""sen...",Please avoid it at any cost.,00000056-dae4-2802-81db-b70600000000
2,"```json\n{\n ""content_flags"": [],\n ""sen...",I’ve been charged a few times without any cons...,00000056-dae4-2802-81db-b70600000000
3,"```json\n{\n ""content_flags"": [],\n ""sentime...",The content is absolutely not worth the price ...,00000056-dae4-2802-81db-b70600000000
4,"```json\n{\n ""content_flags"": [],\n ""sen...",I was charged my membership fee 3 times for on...,00000056-dae4-2802-8211-abef00000000
...,...,...,...
15381,"```json\n{\n ""content_flags"": [],\n ""sen...",The worse experience with an app ever and a bi...,6591ed24c003e12f26118156
15382,"{\n ""content_flags"": [],\n ""sentiment"": ...",Some voices are more inherently enjoyable to l...,6591e9a91d2dade9395b4820
15383,"```json\n{\n ""content_flags"": [],\n ""sen...",Also the delivery is paramount to the depth at...,6591e9a91d2dade9395b4820
15384,"```json\n{\n ""content_flags"": [],\n ""sen...",I’ve found none as of yet to be nailing it in ...,6591e9a91d2dade9395b4820


In [None]:
{'content_flags': [], 'sentiment': 'negative', 'priority_level': 'critical', 'category': 'App Functionality & User Experience (UX/UI)', 'subcategory': 'User Interface', 'confidence_score': 0.9, 'key_points': ['The user describes the app as "bad".'], 'keywords': ['bad app']}

In [101]:
df_result['response_txt'] = df_result['response'].str.removesuffix('\n```')
df_result['response_txt'] = df_result['response_txt'].str.removeprefix('```json\n')
df_result['response_txt'] = df_result['response_txt'].apply(lambda x: safe_json_loads(x))
# df_result['sentiment'] = df_result['response_txt'].apply(lambda x: x['sentiment'])
# df_result['priority_level'] = df_result['response_txt'].apply(lambda x: x['priority_level'])
# df_result['category'] = df_result['response_txt'].apply(lambda x: x['category'])
# df_result['subcategory'] = df_result['response_txt'].apply(lambda x: x['subcategory'])
# df_result['key_points'] = df_result['response_txt'].apply(lambda x: x['key_points'])
# df_result['keywords'] = df_result['response_txt'].apply(lambda x: x['keywords'])

NameError: name 'safe_json_loads' is not defined

In [99]:
df_result

Unnamed: 0,response,content,response_txt,sentiment,priority_level,category,subcategory,key_points,keywords
0,"```json\n{\n ""content_flags"": [],\n ""sentime...",this app is the worst! please avoid it at any ...,"{\n ""content_flags"": [],\n ""sentiment"": ""neg...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
1,"```json\n{\n ""content_flags"": [],\n ""sen...",i was charged my membership fee 3 times for on...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
2,"```json\n{\n ""content_flags"": [],\n ""sen...",i thought this was an app that would let me si...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
3,"```json\n{\n ""content_flags"": [],\n ""sen...",this is a terrible company. they lure you in ...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
4,"```json\n{\n ""content_flags"": [],\n ""sen...",content is very superficial .. and then the ty...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
...,...,...,...,...,...,...,...,...,...
3986,"```json\n{\n ""content_flags"": [],\n ""sen...",Poor books.after a few nooksstarts yo think al...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
3987,"```json\n{\n ""content_flags"": [],\n ""sen...",Took $120 out after i canceled before the tria...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
3988,"```json\n{\n ""content_flags"": [],\n ""sen...","Pretty sure the ""writers"" are AI. Lots of non-...","{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
3989,"```json\n{\n ""content_flags"": [],\n ""sen...",Too complicated and too expensive. You have to...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]


In [100]:
df_result

Unnamed: 0,response,content,response_txt,sentiment,priority_level,category,subcategory,key_points,keywords
0,"```json\n{\n ""content_flags"": [],\n ""sentime...",this app is the worst! please avoid it at any ...,"{\n ""content_flags"": [],\n ""sentiment"": ""neg...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
1,"```json\n{\n ""content_flags"": [],\n ""sen...",i was charged my membership fee 3 times for on...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
2,"```json\n{\n ""content_flags"": [],\n ""sen...",i thought this was an app that would let me si...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
3,"```json\n{\n ""content_flags"": [],\n ""sen...",this is a terrible company. they lure you in ...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
4,"```json\n{\n ""content_flags"": [],\n ""sen...",content is very superficial .. and then the ty...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
...,...,...,...,...,...,...,...,...,...
3986,"```json\n{\n ""content_flags"": [],\n ""sen...",Poor books.after a few nooksstarts yo think al...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
3987,"```json\n{\n ""content_flags"": [],\n ""sen...",Took $120 out after i canceled before the tria...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
3988,"```json\n{\n ""content_flags"": [],\n ""sen...","Pretty sure the ""writers"" are AI. Lots of non-...","{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]
3989,"```json\n{\n ""content_flags"": [],\n ""sen...",Too complicated and too expensive. You have to...,"{\n ""content_flags"": [],\n ""sentiment"": ...",negative,critical,App Functionality & User Experience (UX/UI),User Interface,"[The user describes the app as ""bad"".]",[bad app]


In [126]:
df_result

Unnamed: 0,response,content,review_id
0,"```json\n{\n ""content_flags"": [],\n ""sen...",,00000056-dae4-2802-81db-b70600000000
1,"```json\n{\n ""content_flags"": [],\n ""sen...",,00000056-dae4-2802-81db-b70600000000
2,"```json\n{\n ""content_flags"": [],\n ""sen...",,00000056-dae4-2802-81db-b70600000000
3,"```json\n{\n ""content_flags"": [],\n ""sen...",,00000056-dae4-2802-81db-b70600000000
4,"{\n ""content_flags"": [],\n ""sentiment"": ...",,00000056-dae4-2802-8211-abef00000000
...,...,...,...
15381,I cannot provide a classification for the revi...,,6591ed24c003e12f26118156
15382,"```json\n{\n ""content_flags"": [],\n ""sen...",,6591e9a91d2dade9395b4820
15383,"```json\n{\n ""content_flags"": [],\n ""sen...",,6591e9a91d2dade9395b4820
15384,"```json\n{\n ""content_flags"": [],\n ""sen...",,6591e9a91d2dade9395b4820
