<a href="https://colab.research.google.com/github/CuriousGu/llm_zeroshot_calibration/blob/dev/llm/evaluating_prompts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1.0 - Initial Settings

In [1]:
%%capture
!pip install openai==0.28

In [2]:
# LLM
import openai

# Data Processing
import pandas as pd
import re

# settings
from google.colab import userdata, drive
from tqdm import tqdm
import os

In [3]:
# finding folder path
def find_folder(root, target):
  for root, dirs, _ in os.walk(root):
    if target in dirs:
      return os.path.join(root, target)
  return ValueError("None\nDownload the Dataset or change the target Directory")

In [4]:
drive.mount("/content/drive")

root = "/"
target = "datasets_imdb"
PATH = find_folder(root, target)

Mounted at /content/drive


# 1.1 - Testing Prompts

I will test 4 prompts generated by ChatGPT with 15 random users. All of them will be evaluated, so the better one will be used in large scale.

## 1.1 - LLM

In [5]:
openai.api_key = userdata.get('OPENAI_API_KEY')

def request_openai(prompt, openai=openai):

  response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      temperature=0.2,
      max_tokens=300,
      top_p=0.9,
      messages=[
          {
              "role": "system",
              "content": "You are a recommender"
          },
          {
              "role": "user",
              "content": prompt
          }
      ]
  )

  return response['choices'][0]['message']['content']

## 1.2 - Reading Data

In [6]:
headers = ["userId", "movieId", "rating", "timestamp", "title"]
trainset = pd.read_csv(f"{PATH}/refined/train_llm.csv", names=headers)
testset = pd.read_csv(f"{PATH}/refined/test_llm.csv", names=headers)

In [7]:
users_test = trainset.userId.unique()[:15]
trainset = trainset[trainset.userId.isin(users_test)]

In [8]:
user_movies_dict = trainset.groupby('userId')['title'].apply(list).to_dict()

## 1.3 - Prompts

In [9]:
testing_prompts = {
  "p1": "Please suggest 15 movies or TV shows. Format the output as follows: 'production (release year)\nproduction (release year)\nproduction (release year)...'",
  "p2": "I'm looking for movies and TV shows, can you provide 15 recommendations released prior to 2015? Please list 15 productions with their release years only in the format: 'production (release year)\nproduction (release year)...'",
  "p3": "Could you recommend some classic movies and TV shows from before 2015? I need the output formatted as follows: 'movie (release year)\ntv show (release year)\nmovie (release year)...'",
  "p4": "I'm interested in discovering older content. Can you suggest 15 movies or TV shows released before 2015? Please list the productions along with their release years only, in the format: 'production (release year)\nproduction (release year)...'"
}

## 1.4 - Requesting Recs

In [26]:
llm_recs = list()

total_iterations = len(testing_prompts) * len(user_movies_dict)
progress_bar = tqdm(total=total_iterations, desc='Progress')

for prompt_id, prompt in testing_prompts.items():
  for k, v in user_movies_dict.items():
    temp_dict = dict()

    movies = "\n".join(v)
    prompt += f"\nI've watched:\n{movies}"
    response = request_openai(prompt)

    temp_dict['prompt_id'] = prompt_id
    temp_dict["response"] = response
    temp_dict["userId"] = k

    llm_recs.append(temp_dict)
    progress_bar.update(1)

progress_bar.close()

Progress: 100%|██████████| 60/60 [03:15<00:00,  3.26s/it]


In [28]:
df_recs = pd.DataFrame(llm_recs)
df_recs.tail(5)

Unnamed: 0,prompt_id,response,userId
55,p4,The recommendations are:\n1. The Shawshank Red...,2854
56,p4,The recommendations are:\n1. The Shawshank Red...,3231
57,p4,The recommendations are:\n1. The Godfather (19...,3300
58,p4,1. The Shawshank Redemption (1994)\n2. Pulp Fi...,7623
59,p4,Here are 15 movies or TV shows released before...,7886


## 1.5 - Saving Recs

In [10]:
df_recs.to_csv(f"{PATH}/refined/recs_llm/evaluating_prompts.csv", index=False)

# 2.0 - Evaluating Better Prompt

## 2.1 - Visual Analysis

In [10]:
df_recs = pd.read_csv(f"{PATH}/refined/recs_llm/evaluating_prompts.csv")

for k in testing_prompts.keys():
  p_recs = df_recs[df_recs.prompt_id == k]['response']
  print(f"\n\n{'#'*10} {k} {'#'*10}\n\n")
  for x in df_recs[df_recs.prompt_id == 'p1']['response']:
    print(x)



########## p1 ##########


The Shawshank Redemption (1994)
Friends (1994)
Pulp Fiction (1994)
The Lion King (1994)
Forrest Gump (1994)
The Matrix (1999)
The Sopranos (1999)
American Beauty (1999)
Fight Club (1999)
The Sixth Sense (1999)
The Green Mile (1999)
The Office (UK) (2001)
The Lord of the Rings: The Fellowship of the Ring (2001)
Harry Potter and the Sorcerer's Stone (2001)
Shrek (2001)
The Princess Bride (1987)
The Shawshank Redemption (1994)
Forrest Gump (1994)
Pulp Fiction (1994)
The Matrix (1999)
The Silence of the Lambs (1991)
Fight Club (1999)
The Godfather (1972)
Back to the Future (1985)
Jurassic Park (1993)
Friends (1994)
The X-Files (1993)
The Simpsons (1989)
Seinfeld (1989)
Buffy the Vampire Slayer (1997)
The Sixth Sense (1999)
The Matrix (1999)
American Beauty (1999)
Fight Club (1999)
The Green Mile (1999)
The Shawshank Redemption (1994)
Pulp Fiction (1994)
Forrest Gump (1994)
The Lion King (1994)
Jurassic Park (1993)
Friends (1994)
The X-Files (1993)
The Simpsons 

In [11]:
for index, row in df_recs.iterrows():
  movie_name, prompt = row['response'], row['prompt_id']
  print(f"#### {prompt} #### \n{movie_name}\n\n")

#### p1 #### 
The Shawshank Redemption (1994)
Friends (1994)
Pulp Fiction (1994)
The Lion King (1994)
Forrest Gump (1994)
The Matrix (1999)
The Sopranos (1999)
American Beauty (1999)
Fight Club (1999)
The Sixth Sense (1999)
The Green Mile (1999)
The Office (UK) (2001)
The Lord of the Rings: The Fellowship of the Ring (2001)
Harry Potter and the Sorcerer's Stone (2001)
Shrek (2001)


#### p1 #### 
The Princess Bride (1987)
The Shawshank Redemption (1994)
Forrest Gump (1994)
Pulp Fiction (1994)
The Matrix (1999)
The Silence of the Lambs (1991)
Fight Club (1999)
The Godfather (1972)
Back to the Future (1985)
Jurassic Park (1993)
Friends (1994)
The X-Files (1993)
The Simpsons (1989)
Seinfeld (1989)
Buffy the Vampire Slayer (1997)


#### p1 #### 
The Sixth Sense (1999)
The Matrix (1999)
American Beauty (1999)
Fight Club (1999)
The Green Mile (1999)
The Shawshank Redemption (1994)
Pulp Fiction (1994)
Forrest Gump (1994)
The Lion King (1994)
Jurassic Park (1993)
Friends (1994)
The X-Files (19

I'll disqualify p3 and p4.

- p4 output does not have a clear pattern: headers can be included or not. It'll be harder to adjust the returns.

- p3 did not respected the amount of recs required.

## 2.2 - Reading all movies names


In [12]:
movies_names = pd.read_csv(f"{PATH}/raw/movielens_20m_datasets/movie.csv")
movies_names.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## 2.3 - Functions

In [13]:
def item_is_relevant(user_id, item_id, df):
    aux = df[df["userId"] == user_id]
    if item_id in list(aux['title']):
        return True
    return False

In [14]:
def calculate_MAP(prediction_user_map, movies):

  progress_bar = tqdm(total=len(prediction_user_map), desc='Progress', position=0)
  MAP = 0

  for user_id in prediction_user_map:
    map_user, user_relevants = 0, 0
    for index, item in enumerate(prediction_user_map[user_id]):
      relevance = item_is_relevant(user_id, item, movies)
      user_relevants += int(relevance)
      map_user += user_relevants / (index + 1) if relevance else 0

    MAP += map_user / user_relevants if user_relevants != 0 else 0

    progress_bar.update(1)
  progress_bar.close()

  return round(MAP / len(prediction_user_map), 3)

In [15]:
def fix_response(response: str) -> list:
  # if it's an indexed list, I'll split and remove the integers
  if "." in response:
    movies = re.findall(r'\d+\.\s+(.+)', response)
    return movies
  else:
    pattern = r'((?:The\s)?(?:\b[A-Z][a-z]*\s?)+)\s\((\d{4})\)'
    movies = re.findall(pattern, response)
    movies = [f"{item} ({year})" for item, year in movies]

    return movies

In [16]:
# if the movie is not in IMDB dataset, the rec will be dropped
def in_dataset(user_interactions: str, all_movies):
  movies_in_dataset = [movie for movie in user_interactions if movie in list(all_movies.title)]

  return movies_in_dataset[:10]

## 2.4 - Eval prompt 1

In [17]:
df_recs_a = df_recs[df_recs["prompt_id"] == "p1"]
df_recs_a.head(5)

Unnamed: 0,prompt_id,response,userId
0,p1,The Shawshank Redemption (1994)\nFriends (1994...,810
1,p1,The Princess Bride (1987)\nThe Shawshank Redem...,897
2,p1,The Sixth Sense (1999)\nThe Matrix (1999)\nAme...,1302
3,p1,The Godfather (1972)\nFriends (1994)\nForrest ...,1738
4,p1,The Godfather (1972)\nFriends (1994)\nThe Shaw...,1932


In [18]:
prediction_user_map_a = dict()

for index, row in df_recs_a.iterrows():
  movies = row['response']
  movies = fix_response(movies)
  movies = in_dataset(movies, movies_names)

  prediction_user_map_a[row["userId"]] = movies

In [19]:
prediction_user_map_a

{810: ['Pulp Fiction (1994)',
  'Forrest Gump (1994)',
  'American Beauty (1999)',
  'Fight Club (1999)',
  'Shrek (2001)'],
 897: ['Forrest Gump (1994)',
  'Pulp Fiction (1994)',
  'Fight Club (1999)',
  'Jurassic Park (1993)'],
 1302: ['American Beauty (1999)',
  'Fight Club (1999)',
  'Pulp Fiction (1994)',
  'Forrest Gump (1994)',
  'Jurassic Park (1993)'],
 1738: ['Forrest Gump (1994)',
  'Pulp Fiction (1994)',
  'Jurassic Park (1993)',
  'Titanic (1997)',
  'Braveheart (1995)',
  'Home Alone (1990)'],
 1932: ['Forrest Gump (1994)',
  'Pulp Fiction (1994)',
  'Jurassic Park (1993)',
  'Titanic (1997)'],
 1971: ['Forrest Gump (1994)',
  'Jurassic Park (1993)',
  'Home Alone (1990)',
  'Pulp Fiction (1994)'],
 2116: ['Forrest Gump (1994)',
  'Pulp Fiction (1994)',
  'Jurassic Park (1993)',
  'Goodfellas (1990)'],
 2162: ['Pulp Fiction (1994)',
  'Forrest Gump (1994)',
  'Jurassic Park (1993)',
  'Titanic (1997)'],
 2577: ['Casablanca (1942)',
  'American Pie (1999)',
  'American Bea

In [20]:
map_p1 = calculate_MAP(prediction_user_map_a, testset)

Progress: 100%|██████████| 15/15 [00:00<00:00, 693.91it/s]


## 2.5 - Eval prompt 2

In [22]:
df_recs_b = df_recs[df_recs["prompt_id"] == "p2"]
df_recs_b.head(5)

Unnamed: 0,prompt_id,response,userId
15,p2,1. The Matrix (1999)\n2. Fight Club (1999)\n3....,810
16,p2,The Shawshank Redemption (1994)\nPulp Fiction ...,897
17,p2,The recommendations are:\n\n1. The Shawshank R...,1302
18,p2,The Sixth Sense (1999)\nThe Matrix (1999)\nFig...,1738
19,p2,The Matrix (1999)\nThe Shawshank Redemption (1...,1932


In [23]:
prediction_user_map_b = dict()

for index, row in df_recs_b.iterrows():
  movies = row['response']
  movies = fix_response(movies)
  movies = in_dataset(movies, movies_names)

  prediction_user_map_b[row["userId"]] = movies

In [24]:
prediction_user_map_b

{810: ['Fight Club (1999)',
  'American Beauty (1999)',
  'Pulp Fiction (1994)',
  'Forrest Gump (1994)',
  "Schindler's List (1993)",
  'Jurassic Park (1993)',
  'Terminator 2: Judgment Day (1991)',
  'Goodfellas (1990)',
  'Star Wars: Episode IV - A New Hope (1977)'],
 897: ['Pulp Fiction (1994)', 'Forrest Gump (1994)', 'Fight Club (1999)'],
 1302: ['Pulp Fiction (1994)',
  'Forrest Gump (1994)',
  'Fight Club (1999)',
  'American Beauty (1999)',
  'Saving Private Ryan (1998)',
  'Jurassic Park (1993)',
  "Schindler's List (1993)"],
 1738: [],
 1932: ['Pulp Fiction (1994)',
  'Forrest Gump (1994)',
  'Jurassic Park (1993)',
  'Fight Club (1999)',
  'Braveheart (1995)',
  'Titanic (1997)',
  'Goodfellas (1990)',
  'Saving Private Ryan (1998)'],
 1971: ['Pulp Fiction (1994)',
  'Forrest Gump (1994)',
  'Goodfellas (1990)',
  'Jurassic Park (1993)',
  'Fight Club (1999)',
  'Braveheart (1995)',
  'Saving Private Ryan (1998)'],
 2116: ['Pulp Fiction (1994)',
  'Forrest Gump (1994)',
  'J

In [25]:
map_p2 = calculate_MAP(prediction_user_map_b, testset)

Progress: 100%|██████████| 15/15 [00:00<00:00, 632.23it/s]


## 2.6 -  Results

In [29]:
print(f"TEST 1\nMAP (prompt 1) = {map_p1}\nMAP (prompt 2) = {map_p2}")

TEST 1
MAP (prompt 1) = 0.397
MAP (prompt 2) = 0.525


## 2.7 - How many recs were in the dataset?

In [35]:
test_1_total_recs_count = 0
for index, row in df_recs_a.iterrows():
  movies = row['response']
  movies = fix_response(movies)
  test_1_total_recs_count += len(movies)


test_1_total_recs_in_dataset_count = 0
for recs_in_dataset in prediction_user_map_a.values():
  test_1_total_recs_in_dataset_count += len(recs_in_dataset)

perc_1_a = round(test_1_total_recs_in_dataset_count/test_1_total_recs_count*100, 2)
print(f"PROMPT 1:\n{perc_1_a}% of the recs were in IMDB dataset")

PROMPT 1:
30.94% of the recs were in IMDB dataset


In [37]:
test_1_total_recs_count = 0
for index, row in df_recs_b.iterrows():
  movies = row['response']
  movies = fix_response(movies)
  test_1_total_recs_count += len(movies)


test_1_total_recs_in_dataset_count = 0
for recs_in_dataset in prediction_user_map_b.values():
  test_1_total_recs_in_dataset_count += len(recs_in_dataset)

perc_1_b = round(test_1_total_recs_in_dataset_count/test_1_total_recs_count*100, 2)
print(f"PROMPT 1:\n{perc_1_b}% of the recs were in IMDB dataset")

PROMPT 1:
37.32% of the recs were in IMDB dataset


# 3.0 - Refining Prompts

Based on the p1 and p2 results, I'll do a new request informing the dataset name thru the input

## 3.1 - New requests

In [38]:
testing_prompts = {
  "p5": "Please suggest 15 movies or TV shows (based on the IMDB dataset). Format the output as follows: 'production (release year)\nproduction (release year)\nproduction (release year)...'",
  "p6": "I'm looking for movies and TV shows, can you provide 15 recommendations released prior to 2015 based on the IMDB dataset? Please list 15 productions with their release years only in the format: 'production (release year)\nproduction (release year)...'",
}

In [41]:
llm_recs = list()

total_iterations = len(testing_prompts) * len(user_movies_dict)
progress_bar = tqdm(total=total_iterations, desc='Progress')

for prompt_id, prompt in testing_prompts.items():
  for k, v in user_movies_dict.items():
    temp_dict = dict()

    movies = "\n".join(v)
    prompt += f"\nI've watched:\n{movies}"
    response = request_openai(prompt)

    temp_dict['prompt_id'] = prompt_id
    temp_dict["response"] = response
    temp_dict["userId"] = k

    llm_recs.append(temp_dict)
    progress_bar.update(1)

progress_bar.close()

Progress:  10%|█         | 3/30 [00:16<02:26,  5.44s/it]
Progress: 100%|██████████| 30/30 [01:47<00:00,  3.59s/it]


In [43]:
df_recs = pd.DataFrame(llm_recs)
df_recs.tail(5)

Unnamed: 0,prompt_id,response,userId
25,p6,The recommendations based on the IMDB dataset ...,2854
26,p6,The Shawshank Redemption (1994)\nPulp Fiction ...,3231
27,p6,The Shawshank Redemption (1994)\nPulp Fiction ...,3300
28,p6,The Shawshank Redemption (1994)\nPulp Fiction ...,7623
29,p6,The recommendations based on the IMDB dataset ...,7886


## 3.2 - Saving Recs

In [68]:
df_recs.to_csv(f"{PATH}/refined/recs_llm/evaluating_prompts_imdb_prompt.csv", index=False)

## 3.3 - Visual Analysis

In [None]:
df_recs_c = df_recs[df_recs["prompt_id"] == "p5"]
df_recs_c.head(5)

In [48]:
df_recs_c = df_recs[df_recs["prompt_id"] == "p5"]
df_recs_c.head(5)

for response in df_recs_c.response:
  print(f"\n\n{response}")



1. The Lord of the Rings: The Fellowship of the Ring (2001)
2. Harry Potter and the Sorcerer's Stone (2001)
3. Monsters, Inc. (2001)
4. Shrek (2001)
5. Spirited Away (2001)
6. Donnie Darko (2001)
7. A Beautiful Mind (2001)
8. Ocean's Eleven (2001)
9. Training Day (2001)
10. Black Hawk Down (2001)
11. The Royal Tenenbaums (2001)
12. Amélie (2001)
13. Mulholland Drive (2001)
14. The Others (2001)
15. Zoolander (2001)


Based on the IMDB dataset, here are 15 movie and TV show recommendations for you:

1. The Lord of the Rings: The Fellowship of the Ring (2001)
2. Harry Potter and the Sorcerer's Stone (2001)
3. Shrek (2001)
4. Monsters, Inc. (2001)
5. Ocean's Eleven (2001)
6. Black Hawk Down (2001)
7. Training Day (2001)
8. The Office (2001-2003) - TV Series
9. 24 (2001-2010) - TV Series
10. Alias (2001-2006) - TV Series
11. Spirited Away (2001)
12. Donnie Darko (2001)
13. Mulholland Drive (2001)
14. The Others (2001)
15. The Royal Tenenbaums (2001)


1. The Lord of the Rings: The Fellow

In [49]:
df_recs_d = df_recs[df_recs["prompt_id"] == "p6"]
df_recs_d.head(5)

for response in df_recs_d.response:
  print(f"\n\n{response}")



The recommendations based on the IMDB dataset released prior to 2015 are:

1. The Shawshank Redemption (1994)
2. The Godfather (1972)
3. The Dark Knight (2008)
4. Schindler's List (1993)
5. The Lord of the Rings: The Return of the King (2003)
6. Pulp Fiction (1994)
7. Fight Club (1999)
8. Forrest Gump (1994)
9. Inception (2010)
10. The Matrix (1999)
11. The Silence of the Lambs (1991)
12. The Green Mile (1999)
13. The Departed (2006)
14. Gladiator (2000)
15. The Prestige (2006)


The recommendations based on the IMDB dataset released prior to 2015 are:

1. The Shawshank Redemption (1994)
2. The Godfather (1972)
3. The Dark Knight (2008)
4. Pulp Fiction (1994)
5. Schindler's List (1993)
6. The Lord of the Rings: The Return of the King (2003)
7. Fight Club (1999)
8. Forrest Gump (1994)
9. The Matrix (1999)
10. Goodfellas (1990)
11. The Silence of the Lambs (1991)
12. Se7en (1995)
13. The Usual Suspects (1995)
14. The Green Mile (1999)
15. Gladiator (2000)


The recommendations based on

## 3.4 - Eval prompt 5

In [54]:
prediction_user_map_c = dict()

for index, row in df_recs_c.iterrows():
  movies = row['response']
  movies = fix_response(movies)
  movies = in_dataset(movies, movies_names)

  prediction_user_map_c[row["userId"]] = movies

In [60]:
map_p5 = calculate_MAP(prediction_user_map_c, testset)

Progress: 100%|██████████| 15/15 [00:00<00:00, 627.02it/s]


## 3.5 - Eval prompt 6

In [57]:
prediction_user_map_d = dict()

for index, row in df_recs_d.iterrows():
  movies = row['response']
  movies = fix_response(movies)
  movies = in_dataset(movies, movies_names)

  prediction_user_map_d[row["userId"]] = movies

In [61]:
map_p6 = calculate_MAP(prediction_user_map_d, testset)

Progress: 100%|██████████| 15/15 [00:00<00:00, 572.32it/s]


## 3.6 - Results

In [63]:
print(f"TEST 2\nMAP (prompt 5) = {map_p5}\nMAP (prompt 6) = {map_p6}")

TEST 2
MAP (prompt 5) = 0.267
MAP (prompt 6) = 0.5


## 3.7 - How many recs were in the dataset?

In [69]:
test_2_total_recs_count = 0
for index, row in df_recs_c.iterrows():
  movies = row['response']
  movies = fix_response(movies)
  test_2_total_recs_count += len(movies)


test_2_total_recs_in_dataset_count = 0
for recs_in_dataset in prediction_user_map_c.values():
  test_2_total_recs_in_dataset_count += len(recs_in_dataset)

perc_2_c = round(test_2_total_recs_in_dataset_count/test_2_total_recs_count*100, 2)
print(f"PROMPT 5:\n{perc_2_c}% of the recs were in IMDB dataset")

PROMPT 5:
39.52% of the recs were in IMDB dataset


In [70]:
test_2_total_recs_count = 0
for index, row in df_recs_d.iterrows():
  movies = row['response']
  movies = fix_response(movies)
  test_2_total_recs_count += len(movies)


test_2_total_recs_in_dataset_count = 0
for recs_in_dataset in prediction_user_map_d.values():
  test_2_total_recs_in_dataset_count += len(recs_in_dataset)

perc_2_d = round(test_2_total_recs_in_dataset_count/test_2_total_recs_count*100, 2)
print(f"PROMPT 6:\n{perc_2_d}% of the recs were in IMDB dataset")

PROMPT 6:
39.27% of the recs were in IMDB dataset
