In [1]:
import re
import pandas as pd
import numpy as np
import spacy
from rapidfuzz import process, fuzz
from nltk.stem import WordNetLemmatizer

# spaCy pipeline used by preprocess_text to split explanation text into tokens.
nlp = spacy.load("en_core_web_sm")
# NLTK WordNet lemmatizer; preprocess_text applies it to each token's text.
lemmatizer = WordNetLemmatizer()




In [None]:
#### KEYWORDS #####

### category_name : [LABELS]
# Each key is a category label; its list holds the keyword phrases that
# categorize_explanation searches for (fuzzily) in an explanation.
# NOTE(review): the keys look like abbreviations of aggregation strategies
# (e.g. Ave = average, Pop = popularity) -- confirm against the paper.
# Keys are case-sensitive: code that refers to them must use the exact spelling.
categories = {
    'APP': [
        'threshold',
        'above a rating',
    ],
    'Ave': [
        'high average',
        'average rating',
        'average score',
    ],
    'ADD': [
        'sum',
        'add up',
    ],
    'MPL': [
        'highest rating',
        'overall highest',
        'liked the most',
    ],
    'Div': [
        'diversity',
        'wide range',
        'high variance',
        'spread',
        'diverse',
    ],
    'Sim': [
        'similar user',
        'similar rating',
        'similar group',
        'similar taste',
        'similar item',
    ],
    'Pop': [
        'liked by multiple users',
        'rated highly by multiple',
        'many users have given high ratings to',
        'generally high rating',
        'highly rated',
        'consistently high',
        'high ratings across',
        'popular',
    ],
    'LMS': [
        'highest of lowest values',
    ],
}

In [None]:




def preprocess_text(text):
    '''Lowercase the text, tokenize it with spaCy and lemmatize each token.

    The lowercased text is run through the module-level ``nlp`` pipeline; the
    text of every resulting token is passed to ``lemmatizer.lemmatize`` and
    the results are joined with single spaces.
    '''
    lemmas = []
    for tok in nlp(text.lower()):
        lemmas.append(lemmatizer.lemmatize(tok.text))
    return " ".join(lemmas)

def extract_numbers(text):
    '''Return every standalone run of digits in ``text`` as an int, in order of appearance.

    A run only counts when it is delimited by word boundaries, so digits glued
    to letters (e.g. ``a1``) are ignored.
    '''
    numbers = []
    for match in re.finditer(r'\b\d+\b', text):
        numbers.append(int(match.group()))
    return numbers

def is_negated(text, keyword):
    '''Explicit negation check.

    Returns True when ``keyword`` appears in ``text`` directly after one of the
    negation cues "not", "never", "without" or "did not", separated only by
    whitespace (e.g. 'I did not average'). The keyword is matched literally
    and must end at a word boundary.
    '''
    negation_cue = r'\b(not|never|without|did\snot)\s+'
    pattern = negation_cue + re.escape(keyword) + r'\b'
    return re.search(pattern, text) is not None

def fuzzy_match(text, category_keywords, threshold=85):
    '''Fuzzy keyword search.

    Returns True as soon as one keyword in ``category_keywords`` reaches a
    ``fuzz.partial_ratio`` score of at least ``threshold`` against ``text``,
    and False when none does (including when the list is empty).
    '''
    return any(
        fuzz.partial_ratio(text, candidate) >= threshold
        for candidate in category_keywords
    )


# Cache of raw keyword -> preprocessed keyword, so that every keyword is sent
# through preprocess_text once instead of once per explanation.
_PREPROCESSED_KEYWORDS = {}


def _preprocess_keyword(keyword):
    '''Return the preprocessed form of ``keyword``, computing it on first use only.'''
    if keyword not in _PREPROCESSED_KEYWORDS:
        _PREPROCESSED_KEYWORDS[keyword] = preprocess_text(keyword)
    return _PREPROCESSED_KEYWORDS[keyword]


def categorize_explanation(text):
    '''Main function: assign category labels to one explanation.

    Steps
    - preprocess the explanation and every keyword of ``categories``
    - for keywords containing digits, compare the numbers found in text and keyword
    - fuzzy search for the keyword, skipping the category if the keyword is negated
    - apply the exclusion rules between contradicting categories

    Parameters
    ----------
    text : the explanation; missing values (as detected by ``pd.isna``) are allowed.

    Returns
    -------
    str
        "N/A" for a missing explanation, "Other" when no category matched,
        otherwise the matched category names joined by ", " in sorted order
        (sorted so that the label string is reproducible across runs).
    '''
    if pd.isna(text):
        return "N/A"

    text = preprocess_text(text)  # preproc

    matched_categories = set()

    for category, keywords in categories.items():
        for raw_keyword in keywords:
            keyword = _preprocess_keyword(raw_keyword)

            ## check for numbers (only relevant for keywords that contain digits)
            if re.search(r'\b\d+\b', keyword):
                text_nums = extract_numbers(text)
                keyword_nums = extract_numbers(keyword)
                if text_nums and keyword_nums:
                    # numbers are compared pairwise, in order of appearance
                    if all(t >= k for t, k in zip(text_nums, keyword_nums)):
                        matched_categories.add(category)
                        break

            if fuzzy_match(text, [keyword]):
                if is_negated(text, keyword):
                    # A negated hit ends the search for this category: the
                    # remaining keywords of the category are not tried.
                    break
                matched_categories.add(category)
                break  # one hit per category is enough

    # Exclusion rules. Some categories are defined in a contradicting manner
    # and should not be combined: a popularity keyword next to an averaging
    # (or 'APP') match does not count as popularity.
    # The key is spelled 'APP' in `categories`; the lookup is case-sensitive.
    if 'Pop' in matched_categories and ('Ave' in matched_categories or 'APP' in matched_categories):
        matched_categories.discard('Pop')

    # Averaging takes precedence over adding up.
    if 'ADD' in matched_categories and 'Ave' in matched_categories:
        matched_categories.discard('ADD')

    # sorted(): iteration order of a set of strings is not stable across runs
    return ', '.join(sorted(matched_categories)) if matched_categories else "Other"



In [None]:
## APPLY CATEGORIZATION FUNCTION ##

## Load the explanations of every group size and tag each row with its item count
e25 = pd.read_csv('your_file_containing_explanations_25item_groups').assign(Item_Group='25 items')
e50 = pd.read_csv('your_file_containing_explanations_50item_groups').assign(Item_Group='50 items')
e75 = pd.read_csv('your_file_containing_explanations_75item_groups').assign(Item_Group='75 items')

## Stack the three tables into a single frame
e_comb = pd.concat([e25, e50, e75])

## Long form: one row per (groupId, Item_Group, Model) with the explanation text in 'Exp'
e_m = e_comb.melt(
    id_vars=['groupId', 'Item_Group'],
    value_vars=['Llama', 'Mistral', 'Gemma', 'Phi'],
    var_name='Model',
    value_name='Exp',
)

## Running 1-based id, then the actual categorization of every explanation
e_m = e_m.assign(
    id=e_m.index + 1,
    labels=e_m['Exp'].apply(categorize_explanation),
)

# have a look
e_m.head(2)

In [None]:
### CREATE RESULTS AS SHOWN IN TABLE ####

# One row per (explanation, label): split the ', '-separated labels and explode them
e_m_exploded = (
    e_m
    .assign(label=lambda frame: frame['labels'].str.split(', '))
    .explode('label')
)

# Occurrences of each label per model and item group
label_counts = (
    e_m_exploded
    .groupby(['Model', 'Item_Group', 'label'])
    .size()
    .reset_index(name='count')
)

# Table layout: (label, item group) rows x model columns; missing combinations become 0
label_counts_pivot = pd.pivot_table(
    label_counts,
    index=['label', 'Item_Group'],
    columns='Model',
    values='count',
    fill_value=0,
)

# Counts expressed as a percentage of 500.
# NOTE(review): 500 is presumably the number of explanations per model and
# item group -- confirm against the input files before reusing on other data.
label_counts_pivot_percent = label_counts_pivot.div(500).mul(100)
label_counts_pivot_percent