In [1]:

from langchain.chains import RetrievalQA
from langchain.llms import Ollama
from langchain_ollama import ChatOllama
import random
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field
import ast
import os
from enum import Enum
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import collections as c
import pandas as pd
import re

import ast

# Seed Python's `random` module and NumPy's global generator so that draws made
# through either of them repeat from one kernel run to the next.
random.seed(123456) 
np.random.seed(1)


In [None]:
## Loading datasets

####################
# Dataset selection: these three values are interpolated into the file name below.
total = 2000
num_members = [4]


num_items = [75]  #### [either 25, 50 or 75]
##################
## Datasets are named after num_members, num_items and total groups
# NOTE(review): num_members and num_items are lists, so the f-string renders them
# with brackets ("..._[4]members_[75]items_...") -- confirm the file on disk is
# really named that way.


#file = f'group_data/v2-groups_{num_members}members_{num_items}items_totalgroups{total}.csv'
file = f'v3-groups_{num_members}members_{num_items}items_totalgroups{total}.csv'

# One row per user: user_id, one rating column per item, and the groupId.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a saved
# DataFrame index; code that treats every non-id column as an item will pick it
# up -- TODO confirm and drop it if so.
df = pd.read_csv(file)
df.head(2) ## example users

Unnamed: 0,user_id,item_18154,item_27769,item_6453,item_47239,item_6557,item_13201,item_79785,item_67012,item_16798,...,item_9390,item_21333,item_11624,item_59952,item_13193,item_25125,item_55990,item_90364,item_4403,groupId
0,user_4856,23,1,85,10,7,35,4,15,30,...,52,32,2,78,66,56,16,78,67,5001
1,user_33653,65,80,66,89,93,15,95,51,36,...,18,89,53,16,38,35,43,32,41,5001


In [4]:
## All helper functions + aggregation strategy implementations

def transform_df(df):
    """Reshape a wide ratings frame into long format.

    Args:
        df: DataFrame with 'groupId' and 'user_id' columns; every other
            column is treated as an item whose cells are ratings.

    Returns:
        DataFrame with columns 'groupId', 'user_id', 'item' and 'rating',
        holding one row per (user, item) pair.
    """
    return pd.melt(
        df,
        id_vars=['groupId', 'user_id'],
        var_name='item',
        value_name='rating',
    )

def listmaker(value):
    """Coerce a value into a list.

    Args:
        value: Any object.

    Returns:
        ``[value]`` when ``value`` is a string, ``value`` itself when it is
        already a list, and an empty list for every other type.
    """
    # Wrapping a string in a list cannot fail, so no error handling is needed.
    if isinstance(value, str):
        return [value]
    if isinstance(value, list):
        return value
    return []


def ADD(df):
    """Additive strategy: rank items by the sum of their ratings.

    Args:
        df: Long-format ratings with 'groupId', 'item' and 'rating' columns.

    Returns:
        List of up to 10 item names, highest summed rating first.
    """
    totals = (
        df.groupby(['groupId', 'item'])['rating']
        .sum()
        .reset_index(name='sum_rating')
    )
    ranked = totals.sort_values(by='sum_rating', ascending=False)
    top_items = ranked['item'].head(10)
    return list(top_items)
    

def APP(df, threshold=60):
    """Approval strategy: rank items by how many ratings exceed a threshold.

    Args:
        df: Long-format ratings with 'groupId', 'item' and 'rating' columns.
        threshold: A rating counts as an approval when strictly greater
            than this value.

    Returns:
        List of up to 10 item names, most approvals first. Items without
        any approval take part in the ranking with a count of 0.
    """
    group_item = ['groupId', 'item']

    approvals = (
        df[df['rating'] > threshold]
        .groupby(group_item)
        .size()
        .reset_index(name='count_above_threshold')
    )

    # Every (group, item) pair, so that never-approved items are kept with 0.
    every_pair = pd.MultiIndex.from_product(
        [df['groupId'].unique(), df['item'].unique()],
        names=group_item,
    )
    approvals = (
        approvals.set_index(group_item)
        .reindex(every_pair, fill_value=0)
        .reset_index()
    )

    ranked = approvals.sort_values(by='count_above_threshold', ascending=False)
    return list(ranked['item'].head(10))

def LMS(df):
    """Least-misery strategy: rank items by their lowest rating.

    Args:
        df: Long-format ratings with 'groupId', 'item' and 'rating' columns.

    Returns:
        List of up to 10 item names, highest minimum rating first.
    """
    lowest = (
        df.groupby(['groupId', 'item'])['rating']
        .min()
        .reset_index(name='min_rating')
    )
    ranked = lowest.sort_values(by='min_rating', ascending=False)
    top_items = ranked['item'].head(10)
    return list(top_items)

def MPL(df):
    """Most-pleasure strategy: rank items by their highest rating.

    Args:
        df: Long-format ratings with 'groupId', 'item' and 'rating' columns.

    Returns:
        List of up to 10 item names, highest maximum rating first.
    """
    highest = (
        df.groupby(['groupId', 'item'])['rating']
        .max()
        .reset_index(name='max_rating')
    )
    ranked = highest.sort_values(by='max_rating', ascending=False)
    top_items = ranked['item'].head(10)
    return list(top_items)



from numpy import dot
from numpy.linalg import norm


def cosine_sim(vector1, vector2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        vector1: First vector (any sequence accepted by numpy).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        The dot product divided by the product of the two Euclidean norms,
        as a Python float.
    """
    numerator = dot(vector1, vector2)
    denominator = norm(vector1) * norm(vector2)
    return float(numerator / denominator)

def clean_model_list(val):
    """Flatten a model-produced value into a flat list of strings.

    Args:
        val: A list (possibly nested), a string, or any other object.

    Returns:
        A flat list built as follows:
        - a list is flattened recursively, element by element;
        - a string that starts with '["' and ends with '"]' yields a single
          element: the text between those two delimiters, unsplit;
        - a string that is a Python list literal is parsed and flattened;
        - any other string containing a comma is split on commas and each
          piece is stripped;
        - everything else becomes ``[str(val)]``.
    """
    if isinstance(val, list):
        return [entry for element in val for entry in clean_model_list(element)]

    if not isinstance(val, str):
        return [str(val)]

    # Checked before parsing: the inner text is kept as one element.
    if val.startswith('["') and val.endswith('"]'):
        return [val[2:-2]]

    try:
        parsed = ast.literal_eval(val)
        if isinstance(parsed, list):
            return clean_model_list(parsed)
    except (SyntaxError, ValueError):
        # Not a Python literal; fall through to the plain-text handling.
        pass

    if ',' in val:
        return [piece.strip() for piece in val.split(',')]
    return [str(val)]


def ensure_list(value):
    """Return ``value`` as a list, parsing it first when it is a string.

    Args:
        value: A list, a string holding a Python literal, or anything else.

    Returns:
        The list itself (or the list parsed from the string). An empty list
        is returned when the string cannot be parsed -- a warning is printed
        in that case -- or when the result is not a list.
    """
    parsed = value
    if isinstance(parsed, str):
        try:
            parsed = ast.literal_eval(parsed)
        except (ValueError, SyntaxError):
            print(f"Warning: Failed to convert {value} to list")
            return []

    if isinstance(parsed, list):
        return parsed
    return []


def extract_numbers(items):
    """Pull the trailing integer out of each string.

    Args:
        items: Iterable of strings such as 'item_18154'.

    Returns:
        List of ints, one per string that ends in digits, in input order.
        Strings without trailing digits are skipped.
    """
    numbers = []
    for item in items:
        trailing_digits = re.search(r'(\d+)$', item)
        if trailing_digits:
            numbers.append(int(trailing_digits.group(1)))
    return numbers

In [None]:
##### LLMS ######


## Load LLMs themselves
# Generation settings shared by all four models.
# The output-length cap is passed as `num_predict`, which is Ollama's option for
# the maximum number of generated tokens. ChatOllama declares no `max_tokens`
# field, so the cap was not being applied under that name.
llm_settings = dict(temperature=0.1, num_predict=1000, seed=1234)

llm_llama = ChatOllama(model='llama3.1:8b-instruct-q8_0', **llm_settings)
llm_mistral = ChatOllama(model='mistral-nemo:12b-instruct-2407-q4_K_M', **llm_settings)
llm_gemma = ChatOllama(model='gemma3:12b', **llm_settings)
llm_phi = ChatOllama(model='phi4', **llm_settings)

## Parser to double check JSON ##
# Shape of the answer requested from every LLM: a list of recommended items plus
# a textual explanation. Documented with comments only, so that the model
# definition itself (and anything derived from it) is left untouched.
class RecommendationExpl(BaseModel):
    # The recommended items; the prompt asks for exactly 10 item names.
    recommendation: list = Field(description="python list of the final group recommendations")
    # The model's own account of how it produced the recommendation.
    explanation: str = Field(description='your explanation of your recommendation procedure')

# Last step of every chain: turns the model's reply into a dict that is read
# through the "recommendation" and "explanation" keys.
parser = JsonOutputParser(pydantic_object=RecommendationExpl)


### Prompt itself #####
# Single prompt shared by all models. `{desc}` is replaced by the group's ratings;
# the doubled braces `{{ }}` are literal braces of the JSON example.
# NOTE(review): the template contains the typo "presente"; it is left as is
# because editing the prompt text changes what the models receive.
prompt = PromptTemplate(
    template="""

    You are an expert in making and explaining group recommendations based on the knowledge base provided below. You do not write python code.
    You explain the process behind making the recommendation to the group in such a way that someone without recommender systems knowledge can understand. Come up with a simple way to explain to the group how you came up with your recommendations.
    That information includes users (user_id) and information on items they like (item_x). 
    The rating is a scale from 0 to 100. When referring to items, use item_value. The userId itself is just to refer to a user. For the recommendation, you simply mention the item.

    The per-item ratings are presente below: \n
    ## begin ratings ##
    {desc}
    ## end ratings ##
    
    You make a recommendation to this group of users by providing 10 items based on your recommendation approach. 
    Your recommendation contains exactly 10 items and is formatted as a python list containing strings. Refer to items using their name (item_value)
    
    Provide your answer strictly as a JSON object with the following format:
{{
  "recommendation": ["item","item","item","item","item","item","item","item","item","item"],
  "explanation": "explanation and example of your recommendation procedure"
}}
Think about the answer internally, but only output the final JSON object. Do not include any additional text or python code. 

    """ ,
    input_variables=["desc"]
)


#### LLM chains -> prompt | model | json parser
# Every chain is the same pipeline with a different model in the middle.
chain_llama, chain_mistral, chain_gemma, chain_phi = (
    prompt | model | parser
    for model in (llm_llama, llm_mistral, llm_gemma, llm_phi)
)

In [None]:
########## LLM LOOP ##########
# For each group: query the four LLM chains, compute the aggregation-strategy and
# random baselines, and append one row per group to the result file and to the
# explanation file. Running the cell again resumes after the last saved group.


result_file = "your_file_name_for_recommendations"
expl_file = "your_file_name_for_explanations"

result_exists = os.path.isfile(result_file)
expl_exists = os.path.isfile(expl_file)

# Name -> chain and name -> strategy. Insertion order fixes the column order of
# the output files: llama, mistral, gemma, phi, ADD, MPL, LMS, APP, random.
llm_chains = {
    "llama": chain_llama,
    "mistral": chain_mistral,
    "gemma": chain_gemma,
    "phi": chain_phi,
}
aggregation_strategies = {"ADD": ADD, "MPL": MPL, "LMS": LMS, "APP": APP}

# Columns that hold ratings. An "Unnamed: N" column is what pandas produces for a
# CSV saved together with its index; it is not an item, so it must not be shown
# to the LLMs, aggregated, or drawn for the random baseline.
item_columns = [
    col for col in df.columns
    if col not in ("user_id", "groupId") and not str(col).startswith("Unnamed")
]

max_counter = df['groupId'].max()  # not used below; kept in case later cells read it

## Start at the beginning ...
group = df['groupId'].min()

### ... unless we do not start from scratch: continue where we previously left off
if result_exists:
    done = pd.read_csv(result_file)
    # A header-only file has no maximum (NaN); fall back to the first group.
    if not done.empty:
        group = done['groupId'].max() + 1


print(group)

# Only ids that actually occur in the data are visited, so a gap in the numbering
# never triggers LLM calls on an empty group.
pending_groups = sorted(gid for gid in df['groupId'].unique() if gid >= group)

for group in pending_groups:
    # Locate group
    df_members = df.loc[df['groupId'] == group]

    try:
        ## generate responses from each LLM using the group scenario.
        ## Only user_id and the item ratings are presented; groupId is not necessary.
        llm_input = {"desc": df_members[["user_id", *item_columns]].to_dict(orient='list')}
        responses = {name: chain.invoke(llm_input) for name, chain in llm_chains.items()}

        ### extract top-10s and explanations from output
        vectors = {key: extract_numbers(ensure_list(res["recommendation"])) for key, res in responses.items()}
        explanations = {key: res["explanation"] for key, res in responses.items()}

        # generate top-10s using each social choice-based aggregation strategy
        # (the long-format frame is built once and shared by all strategies)
        ratings_long = transform_df(df_members[["groupId", "user_id", *item_columns]])
        for name, strategy in aggregation_strategies.items():
            vectors[name] = extract_numbers(strategy(ratings_long))

        ## Creating random top-10 for the random baseline
        vectors['random'] = extract_numbers(random.sample(item_columns, min(10, len(item_columns))))
        vector_lengths = [len(v) for v in vectors.values()]

        ## All lists must have the same length; otherwise the group is skipped
        if len(set(vector_lengths)) != 1:
            print(vector_lengths) ### print if there are issues (one of the LLMs not returning 10 items?)
            continue

        ## creating temporary dataframes to add to the previous iterations
        # df with top-10s
        df_temp = pd.DataFrame([{
            "groupId": group,
            "group_size": df_members["user_id"].nunique(),
            **{f"vector_{key}": vec for key, vec in vectors.items()}
        }])
        # df with explanations
        df_e = pd.DataFrame([{
            "groupId": group,
            **{key.capitalize(): exp for key, exp in explanations.items()}
        }])
        ## save new data to the files (header only when the file is new)
        df_temp.to_csv(result_file, mode='a', index=False, header=not result_exists)
        df_e.to_csv(expl_file, mode='a', index=False, header=not expl_exists)

        # If it is the first run, the file now exists!
        result_exists = expl_exists = True

    except Exception as e:
        # Best effort: report the failure and move on to the next group.
        print(f"Error processing group {group}: {e}") ## print if other issues
