#Import

In [2]:
import pandas as pd 
import numpy as np

import tiktoken


from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# load and transform 

In [3]:
path= '/Users/mogen/AdvancedRag/Clustered_nomic/only_summaries_for_cluster.csv'
df= pd.read_csv(path)

In [4]:
df.head()

Unnamed: 0,summary,entity_description,chunk_id,entity,Nomic Topic: medium,Nomic Topic: broad,entity_type,doc_id,row_number
0,Pantites is a Spartan who survived the Battle...,['Another one of the few Spartans who survived...,"['9f93a327-a1aa-4e54-9423-36eb4f551ff4', 'c07c...",PANTITES,Ancient Wars,Miscellaneous (7),PERSON,['85815899-ca08-40eb-808d-22b703f69c2c'],11963
1,"Panorama, an Italian magazine, is known for i...","[""Italian magazine that ran an article claimin...","['3e52c447-8951-463b-892d-af5fa00ee0ba', 'f3f4...",PANORAMA,Journalism,Classic Animated Stories,ORGANIZATION,"['46b13c0c-2951-4371-98a7-18dc98a577d2', '93e4...",1663
2,"Santa Claus is commonly depicted as a jolly, ...",['Common image of Santa Claus as a jolly large...,"['37b36ae6-1867-4277-ad86-8bc6924c8ae7', 'c372...",SANTA CLAUS,Royalty,Classic Animated Stories,PERSON,"['472b5fc4-d5dd-4ffe-b763-09f155980143', '5bef...",498
3,"Marilyn Monroe, a prominent figure in America...",['Prominent figure whose death attracted consp...,"['51aff810-0628-4137-a9a5-83767e76cad8', '6f2b...",MARILYN MONROE,Ancient Egyptians,Supernatural Beliefs,PERSON,['46b13c0c-2951-4371-98a7-18dc98a577d2'],1529
4,Kongjwi is the protagonist of a Korean folkta...,['Kongjwi is the protagonist of the Korean fol...,"['f50bd7cf-b983-49a0-aa9e-cedb62fe3caa', 'bbd0...",KONGJWI,Stepsisters,Classic Animated Stories,PERSON,['a2066840-c5c9-4817-b5bd-96eb0cf42c1b'],3392


In [5]:

# Grouping by 'Nomic Topic: medium' and 'Nomic Topic: broad', and getting distribution
medium_distribution = df['Nomic Topic: medium'].value_counts()
broad_distribution = df['Nomic Topic: broad'].value_counts()


medium_distribution

Nomic Topic: medium
Chemistry Nobel Prize    188
Music                    181
Physics                  175
Miscellaneous (4)        164
Misconception            157
                        ... 
Homophobia                 5
City Council Meeting       5
Laws                       4
Cartoon Similarities       4
Butter                     4
Name: count, Length: 256, dtype: int64

In [6]:
broad_distribution

Nomic Topic: broad
Behavioral Health              3721
Miscellaneous (7)              3089
Classic Animated Stories       2046
Nobel Prize                    1761
Supernatural Beliefs           1085
Mexican Cuisine                 604
Artificial Intelligence (3)     565
Languages (2)                   559
Name: count, dtype: int64

In [7]:
# Grouping summaries by 'Nomic Topic: medium'
summaries_by_medium = df.groupby('Nomic Topic: medium')['summary'].apply(list)

# Grouping summaries by 'Nomic Topic: broad'
summaries_by_broad = df.groupby('Nomic Topic: broad')['summary'].apply(list)


In [8]:
summaries_by_medium

Nomic Topic: medium
Actor                   [ The French Legion of Honor, a prestigious or...
Africa                  [ Africa, a continent located in the Eastern H...
Airlines                [ Heathrow Airport serves as the main hub for ...
Airport                 [ Athens International Airport, located in the...
Aladdin                 [ Roulbadour is a princess, known as the Sulta...
                                              ...                        
Vampires (2)            [ SIANGUISTIC VAMPIRISM is a contemporary subc...
Vietnamese Pop Music    [ Pop music is a genre of music that is legall...
War                     [ The Gadsden Purchase refers to the acquisiti...
Water                   [ Milk is a liquid that is frequently mentione...
Weather                 [ The time period of 18301831 UTC marks an ins...
Name: summary, Length: 256, dtype: object

In [9]:
type(summaries_by_medium)

pandas.core.series.Series

In [10]:
# Grouping the medium topics within each broad topic and getting the distribution
medium_in_broad = df.groupby('Nomic Topic: broad')['Nomic Topic: medium'].apply(list)

# Showing the distribution of medium topics within each broad category
medium_in_broad_distribution = medium_in_broad.apply(lambda x: pd.Series(x).value_counts())
medium_in_broad_distribution

Unnamed: 0_level_0,Polygraph Tests,Artificial Intelligence,Psychopathy Assessment Tool,Telepathy,Miscellaneous,Scientific Publications,Artificial Intelligence (2),Ganzfeld Experiments,Psychic Experiments,Medicine,...,Hinduism,"DNA, mtDNA, Mitochondria, Evolutionary",Balancing,Carpets/Textiles,Sushi,Mythical Creatures,Ayurveda,Economic Crisis,Spyglass,Butter
Nomic Topic: broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Artificial Intelligence (3),78.0,70.0,69.0,60.0,58.0,54.0,53.0,31.0,27.0,23.0,...,,,,,,,,,,
Behavioral Health,,,,,,,,,,,...,,,,,,,,,,
Classic Animated Stories,,,,,,,,,,,...,,,,,,,,,,
Languages (2),,,,,,,,,,,...,,,,,,,,,,
Mexican Cuisine,,,,,,,,,,,...,,,,,,,,,,
Miscellaneous (7),,,,,,,,,,,...,,,,,,,,,,
Nobel Prize,,,,,,,,,,,...,,,,,,,,,,
Supernatural Beliefs,,,,,,,,,,,...,32.0,32.0,21.0,20.0,20.0,15.0,13.0,9.0,6.0,4.0


In [11]:
def get_non_nan_columns(row):
    return ", ".join(medium_in_broad_distribution.columns[~row.isna()])

# Applying the function to get non-NaN columns for each row
non_nan_columns_per_row = medium_in_broad_distribution.apply(get_non_nan_columns, axis=1)
df_inspect=pd.DataFrame(non_nan_columns_per_row)
df_inspect.to_csv('/Users/mogen/AdvancedRag/Clustered_nomic/df_inspect.csv')

In [12]:
nan_count_per_row = medium_in_broad_distribution.isna().sum(axis=1)
nan_count_per_row

Nomic Topic: broad
Artificial Intelligence (3)    243
Behavioral Health              179
Classic Animated Stories       215
Languages (2)                  247
Mexican Cuisine                242
Miscellaneous (7)              199
Nobel Prize                    237
Supernatural Beliefs           230
dtype: int64

#Summarize 


#### Prompt

In [13]:
prompt_medium= '''You are an expert summarizer tasked with analyzing a set of entity descriptions. Your goal is to generate a comprehensive and detailed summary for the entity "{Entity}" based on the following descriptions. Each description is derived from different source texts and may contain varying perspectives and details.

- Carefully merge the information from all the provided descriptions into a single cohesive summary.
- Preserve as much relevant detail as possible from each description.
- Identify and clearly state any implicit knowledge or insights that may not be explicitly mentioned but can be inferred from the context.
- Ensure that the summary is clear, factual, and focuses on the key attributes, behaviors, relationships, and characteristics of the entity.

Entity Descriptions
{Description}

Example:

{{
  "entity_name": "[ENTITY_NAME]",
  "detailed_summary": "[Generated detailed summary merging explicit and implicit details]"
}}

---
Output:

'''

In [14]:
#data 
df_sumamries_p_label= pd.DataFrame(summaries_by_medium)
df_reset = df_sumamries_p_label.reset_index()
df_reset.columns = ['Nomic Topic: medium', 'summary']
# Remove the header row
df_reset

Unnamed: 0,Nomic Topic: medium,summary
0,Actor,"[ The French Legion of Honor, a prestigious or..."
1,Africa,"[ Africa, a continent located in the Eastern H..."
2,Airlines,[ Heathrow Airport serves as the main hub for ...
3,Airport,"[ Athens International Airport, located in the..."
4,Aladdin,"[ Roulbadour is a princess, known as the Sulta..."
...,...,...
251,Vampires (2),[ SIANGUISTIC VAMPIRISM is a contemporary subc...
252,Vietnamese Pop Music,[ Pop music is a genre of music that is legall...
253,War,[ The Gadsden Purchase refers to the acquisiti...
254,Water,[ Milk is a liquid that is frequently mentione...


In [15]:
# Format the prompt with the entity name and descriptions
entity_names=df_reset['Nomic Topic: medium'].tolist()
descriptions_l=df_reset['summary'].tolist()
formatted_prompt_list=[]
prompt_lengths=[]
# Format the prompt with the entity name and descriptions
for entity_name, descriptions in zip(entity_names, descriptions_l):
    formatted_prompt = prompt_medium.format(Entity=entity_name, Description=descriptions)
    prompt_length = len(formatted_prompt)
    prompt_lengths.append(prompt_length)
    formatted_prompt_list.append(formatted_prompt)

max_length = max(prompt_lengths)
avg_length = np.mean(prompt_lengths)
median_length = np.median(prompt_lengths)

# Displaying the results
print(f'{len(formatted_prompt_list)} entries created and append to list')
print(f'max_length: {max_length} ')
print(f'avg_length: {avg_length} ')
print(f'median_length: {median_length} ')


256 entries created and append to list
max_length: 61477 
avg_length: 19320.66796875 
median_length: 17487.0 


In [16]:
# Initialize the tokenizer (e.g., for GPT-3.5, use 'gpt-3.5-turbo')
encoding = tiktoken.get_encoding("cl100k_base")
# List to store the token count for each prompt
token_counts = []
entity_names__= []
# Estimate token count for each formatted prompt
for entity_name, descriptions in zip(entity_names, descriptions_l):
    formatted_prompt = prompt_medium.format(Entity=entity_name, Description=descriptions)
    
    # Encode the prompt to estimate the token number
    token_count = len(encoding.encode(formatted_prompt))
    if token_count>6000:
        token_counts.append(token_count)    
        entity_names__.append(entity_name)


# Display the token counts
for e, t in zip(entity_names__, token_counts):
    print(e, t)




Airlines 9335
Ancient Wars 8650
Arabic Dialects 6535
Arabic Scholars 6157
Autobahn 7553
Carpets, Floors, Coverings 7895
Chemistry Nobel Prize 10706
Cinderella Film 7336
Coastal City 8193
Conspiracy Theories 7045
Crime Documentation 6369
Energy 7259
European Countries 6803
European Union 7007
Evolutionary Biology 6466
Flight Incident 6295
Flight incident 7006
Legal Proceedings 8835
Literature 6927
Literature (2) 9101
Medicine (3) 10698
Medicine, Physiology, Nobel Prize 9533
Medieval History 6022
Mental Disorders 6995
Miscellaneous (4) 13614
Misconception 11085
Missionaries 6504
Music 11707
Niger Delta Conflict 7603
Peace, rights, diplomacy, Ghana 6902
Physics 11509
Political Outburst 7404
Psychopathy Assessment Tool 6449
Robotics Laws (2) 6739
SARS 6247
Terrorist Attacks 8532
Vampire folklore 6676


# High level community Prompt

In [31]:
community_summaries.columns

Index(['index', '0',
       '{\n  "entity_name": "Actor",\n  "detailed_summary": "Tom Hanks, an American actor, producer, and filmmaker, is a renowned figure in the film industry. Born in California, Hanks has roots in the Roman Empire as his ancestors emigrated to the United States in the 18th century, settling in Kentucky and changing their last name to Mefford. He is also a distant relative of the 16th US president, Abraham Lincoln. Hanks gained fame for his roles in films such as Forrest Gump, Saving Private Ryan, and Cast Away. He has won multiple Academy Awards, including two consecutive Best Actor awards for Philadelphia (1993) and Forrest Gump (1994). Hanks also received a Tony Award nomination for Best Actor in a Play for his performance in Nora Ephron's play Lucky Guy in 2013.\n\nIn addition to acting, Hanks has produced movies such as My Big Fat Greek Wedding (2002) and received the American Film Institute's Life Achievement Award in 2002. He appeared in three films in 2004:

In [43]:
import json
from collections import defaultdict
path= '/Users/mogen/AdvancedRag/Clustered_nomic/community_summaries.csv'
community_summaries = pd.read_csv(path, header=None)
#community_summaries.reset_index(inplace=True)
# Assuming community_summaries is your dataframe and '0' is the column containing the data
raw_data = community_summaries[1].tolist()
parsed_cells=[]

In [46]:

for cell in raw_data:
    print(f"Original cell:\n{cell}")
    
    # Remove newline characters and make sure quotes are escaped properly
    cell_ = cell.replace('\n', '').replace('""', '"').replace('""', '"')
    
    try:
        # Try parsing the cleaned string using json.loads
        parsed_cell = json.loads(cell_)
        parsed_cells.append(parsed_cell)
    except json.JSONDecodeError as e:
        print(f"\nError parsing the cell: {e}")
        

def combine_dicts(dict_list):
    combined_dict = defaultdict(list)
    
    # Iterate through each dictionary in the list
    for d in dict_list:
        for key, value in d.items():
            combined_dict[key].append(value)
    
    return dict(combined_dict)


Dict_cs=combine_dicts(parsed_cells)
df_combined = pd.DataFrame(Dict_cs)
df_combined.to_csv('/Users/mogen/AdvancedRag/Clustered_nomic/_medium_level_community_summaries.csv')



Original cell:
{
  "entity_name": "Actor",
  "detailed_summary": "Tom Hanks, an American actor, producer, and filmmaker, is a renowned figure in the film industry. Born in California, Hanks has roots in the Roman Empire as his ancestors emigrated to the United States in the 18th century, settling in Kentucky and changing their last name to Mefford. He is also a distant relative of the 16th US president, Abraham Lincoln. Hanks gained fame for his roles in films such as Forrest Gump, Saving Private Ryan, and Cast Away. He has won multiple Academy Awards, including two consecutive Best Actor awards for Philadelphia (1993) and Forrest Gump (1994). Hanks also received a Tony Award nomination for Best Actor in a Play for his performance in Nora Ephron's play Lucky Guy in 2013.\n\nIn addition to acting, Hanks has produced movies such as My Big Fat Greek Wedding (2002) and received the American Film Institute's Life Achievement Award in 2002. He appeared in three films in 2004: The Ladykillers

In [20]:

df_combined = pd.DataFrame(Dict_cs)
df_combined.to_csv('/Users/mogen/AdvancedRag/Clustered_nomic/_medium_level_community_summaries.csv')

NameError: name 'Dict_cs' is not defined

In [19]:
combined_dict

NameError: name 'combined_dict' is not defined

In [27]:
path= '/Users/mogen/AdvancedRag/Clustered_nomic/df_inspect.csv'
Cluster_group= pd.read_csv(path)
Cluster_group.head()

Unnamed: 0,Nomic Topic: broad,0
0,Artificial Intelligence (3),"Polygraph Tests, Artificial Intelligence, Psyc..."
1,Behavioral Health,"Medicine (3), Legal Proceedings, Carpets, Floo..."
2,Classic Animated Stories,"Music, Literature (2), Cinderella Film, Publis..."
3,Languages (2),"Arabic Scholars, Arabic Dialects, Linguistics,..."
4,Mexican Cuisine,"Art and Literature, Biodiversity, Political Le..."


In [30]:
len(Cluster_group )

8

In [32]:
# Assuming you have two dataframes: cluster_group and df_combined

# Initialize lists
list1 = []
list2 = []

# Iterate through each row in '0' column of cluster_group
for labels in Cluster_group['0']:
    # Split comma-separated labels
    label_list = [label.strip() for label in labels.split(',')]
    
    # Process each label
    for label in label_list:
        # Look up the label in df_combined['entity_name']
        matching_row = df_combined[df_combined['entity_name'] == label]
        
        # If found, take the corresponding value from 'detailed_summary' and append to list1
        if not matching_row.empty:
            detailed_summary = matching_row['detailed_summary'].values[0]
            list1.append(detailed_summary)
    
    # Append the original list of labels to list2
    list2.append(list1)
    list1=[]

# Display the two lists
print(len(list2))
print(list2)


8
[["Polygraph Tests, also known as lie detectors, are devices used to assess truthfulness by measuring physiological responses such as heart rate, blood pressure, and sweat gland activity. Their use varies across countries, with some employing them in criminal investigations and employment vetting processes, while others have stricter regulations or outright bans due to concerns about their reliability. The technology's origins can be traced back to the work of John Augustus Larson and his wife Elizabeth Holloway Marston, who made significant contributions to its development. Leonarde Keeler later updated the device in 1939, making it portable and incorporating the galvanic skin response feature. The MACKENZIELEWIS POLYGRAPH is another multifunctional device that serves as a polygraph, measuring breathing patterns and changes in heart rate, blood pressure, and respiration. Law enforcement agencies in the United States are significant users of polygraph technology, although the extent 

In [34]:
Cluster_group.head()

Unnamed: 0,Nomic Topic: broad,0,all_summaries
0,Artificial Intelligence (3),"Polygraph Tests, Artificial Intelligence, Psyc...","[Polygraph Tests, also known as lie detectors,..."
1,Behavioral Health,"Medicine (3), Legal Proceedings, Carpets, Floo...",[Medicine (3) refers to a diverse and flexible...
2,Classic Animated Stories,"Music, Literature (2), Cinderella Film, Publis...",[Music is a diverse and influential entity enc...
3,Languages (2),"Arabic Scholars, Arabic Dialects, Linguistics,...",[Arabic Scholars are a diverse group of indivi...
4,Mexican Cuisine,"Art and Literature, Biodiversity, Political Le...",[The Political Leader entity encompasses a div...


In [39]:
def generate_prompt(row):
    entity_name = row['Nomic Topic: broad']
    descriptions = row['all_summaries']
    
    # Start building the prompt
    prompt = f"""
You are an expert summarizer tasked with analyzing a set of entity summaries. Your goal is to generate a comprehensive and detailed summary for the entity "{entity_name}" which represents a high-level community of topics. The following descriptions represent summaries of sub-entities within this broader entity.

- Carefully merge the information from all the provided sub-entity summaries into a single cohesive summary.
- Preserve as much relevant detail as possible from each sub-entity summary.
- Identify and clearly state any implicit knowledge or insights that may not be explicitly mentioned but can be inferred from the context.
- Ensure that the summary highlights the key attributes, relationships, and connections between the sub-entities, showing how they collectively contribute to the high-level community of topics.

Entity Name:
"{entity_name}"

Sub-Entity Summariesn"""z
    
    # Add each summary to the prompt
    for i, description in enumerate(descriptions, 1):
        prompt += f"{i}. {description}\n"
    
    prompt += f"\n**Output Format (Dictionary):**\n\n{{\n  'entity_name': '{entity_name}',\n  'detailed_summary': '[Generated detailed summary merging explicit and implicit details from all sub-entity descriptions, showing their connections and contributions to the high-level community of topics]'\n}}\n"
    
    return prompt

# Apply the function to each row in the dataframe
Cluster_group['formatted_prompt'] = Cluster_group.apply(generate_prompt, axis=1)
Cluster_group['formatted_prompt'][0]

'\nYou are an expert summarizer tasked with analyzing a set of entity summaries. Your goal is to generate a comprehensive and detailed summary for the entity "Artificial Intelligence (3)" which represents a high-level community of topics. The following descriptions represent summaries of sub-entities within this broader entity.\n\n- Carefully merge the information from all the provided sub-entity summaries into a single cohesive summary.\n- Preserve as much relevant detail as possible from each sub-entity summary.\n- Identify and clearly state any implicit knowledge or insights that may not be explicitly mentioned but can be inferred from the context.\n- Ensure that the summary highlights the key attributes, relationships, and connections between the sub-entities, showing how they collectively contribute to the high-level community of topics.\n\n**Entity Name:**\n"Artificial Intelligence (3)"\n\n**Sub-Entity Summaries:**\n1. Polygraph Tests, also known as lie detectors, are devices use

In [None]:
Cluster_group.to_csv('/Users/mogen/AdvancedRag/Clustered_nomic/_high_level_community_summaries.csv')

#combine final dataframe 

In [28]:
import json
from collections import defaultdict

import re

ParserError: Error tokenizing data. C error: Expected 7 fields in line 4, saw 11


#### Run

In [34]:
df = pd.DataFrame()

df['formatted_prompt_list']=formatted_prompt_list

df.to_csv('/Users/mogen/AdvancedRag/Prompts/Prompt_communities/prompts.csv')

In [17]:
import torch

if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

device


device(type='mps')

In [31]:
access_token= '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'


# specify how to quantize the model
quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", quantization_config=quantization_config, device_map="mps", token=access_token)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

prompt = "My favourite condiment is"

messages = [
    {"role": "user", "content": "WAS GEHT AB?"},
    ]

model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("mps")

generated_ids = model.generate(model_inputs, max_new_tokens=100, do_sample=True)
tokenizer.batch_decode(generated_ids)[0]

RuntimeError: No GPU found. A GPU is needed for quantization.

In [None]:

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.3",trust_remote_code=True)
model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("mps")