In [83]:
#! pip install --upgrade langchain
#! pip install pypdf
#! pip install chromadb
#! pip install tiktoken
#! pip install --upgrade openai
#! pip install transformers diffusers
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
import pandas as pd
import os
import pprint
import numpy as np
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from dotenv import load_dotenv


In [4]:
# Get the api_key from the environment variables
api_key = os.getenv("OPENAI_API_KEY")


In [5]:
print(api_key)


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


In [7]:
filepath = '../processed_data/data_cleaned_Rui/deaths_season_episode.csv'
df = pd.read_csv(filepath)


In [8]:
df.head()


Unnamed: 0,Season,Episode,Deaths
0,Season 1,Winter Is Coming,Seven Unnamed Free Folk - Ripped apart by Whit...
1,Season 1,Winter Is Coming,Unnamed Free Folk Girl - Killed and pinned to ...
2,Season 1,Winter Is Coming,Ser Waymar Royce - Slashed by a White Walker w...
3,Season 1,Winter Is Coming,Gared - Decapitated by a White Walker with an ...
4,Season 1,Winter Is Coming,Three Unspecified Animals - Killed by an unkno...


In [9]:
# FILEPATH: /Users/ruifspinto/game-of-thrones-survival/notebooks/rui_genAI.ipynb
loader = CSVLoader(file_path=filepath, csv_args={
    'delimiter': ',',
    'fieldnames': ['Season', 'Episode', 'Deaths']})

data = loader.load()

pprint.pprint(data)


[Document(page_content='Season: Season\nEpisode: Episode\nDeaths: Deaths', metadata={'source': '../processed_data/data_cleaned_Rui/deaths_season_episode.csv', 'row': 0}),
 Document(page_content='Season: Season 1\nEpisode: Winter Is Coming\nDeaths: Seven Unnamed Free Folk - Ripped apart by White Walkers off-screen, bodies shown.', metadata={'source': '../processed_data/data_cleaned_Rui/deaths_season_episode.csv', 'row': 1}),
 Document(page_content='Season: Season 1\nEpisode: Winter Is Coming\nDeaths: Unnamed Free Folk Girl - Killed and pinned to a tree by White Walkers off-screen, body shown. Reanimated as a wight.', metadata={'source': '../processed_data/data_cleaned_Rui/deaths_season_episode.csv', 'row': 2}),
 Document(page_content='Season: Season 1\nEpisode: Winter Is Coming\nDeaths: Ser Waymar Royce - Slashed by a White Walker with an ice blade.', metadata={'source': '../processed_data/data_cleaned_Rui/deaths_season_episode.csv', 'row': 3}),
 Document(page_content='Season: Season 1\

In [10]:
print (f'You have {len(data)} documents in your data')
print (f'''There are ~{np.mean([len(x.page_content) for x in data])} characters per row on average.''')


You have 1294 documents in your data
There are ~123.80757341576506 characters per row on average.


In [11]:
for document in data:
    season = document.page_content.split('\n')[0].split(': ')[1]
    episode = document.page_content.split('\n')[1].split(': ')[1]
    deaths = document.page_content.split('\n')[2].split(': ')[1]

    print(f"Season: {season}")
    print(f"Episode: {episode}")
    print(f"Deaths: {deaths}")
    print("--------------------")


Season: Season
Episode: Episode
Deaths: Deaths
--------------------
Season: Season 1
Episode: Winter Is Coming
Deaths: Seven Unnamed Free Folk - Ripped apart by White Walkers off-screen, bodies shown.
--------------------
Season: Season 1
Episode: Winter Is Coming
Deaths: Unnamed Free Folk Girl - Killed and pinned to a tree by White Walkers off-screen, body shown. Reanimated as a wight.
--------------------
Season: Season 1
Episode: Winter Is Coming
Deaths: Ser Waymar Royce - Slashed by a White Walker with an ice blade.
--------------------
Season: Season 1
Episode: Winter Is Coming
Deaths: Gared - Decapitated by a White Walker with an ice blade.
--------------------
Season: Season 1
Episode: Winter Is Coming
Deaths: Three Unspecified Animals - Killed by an unknown person off-screen, bodies shown.
--------------------
Season: Season 1
Episode: Winter Is Coming
Deaths: Will - Decapitated by Lord Eddard Stark with Ice for dissertation.
--------------------
Season: Season 1
Episode: Winte

In [12]:
embedder = OpenAIEmbeddings(openai_api_type=api_key)


In [13]:
vector_db = Chroma.from_documents(data, embedder)


In [14]:
text = "John Stark"
query_result = embedder.embed_query(text)
query_result[:5]


[-0.013127450386259742,
 -6.39016558292593e-05,
 -0.004382548705201905,
 -0.010374052098416716,
 -0.006519967003444813]

In [37]:
query = "When did Beric died"
docs = vector_db.similarity_search(query, k = 5)


In [40]:
for doc in docs:
    print(doc.page_content)


Season: Season 8
Episode: The Iron Throne
Deaths: Lord Beric Dondarrion - 22 (Deceased)
Season: Season 8
Episode: The Long Night
Deaths: Lord Beric Dondarrion - Stabbed to death by wight. Reanimated as a wight.
Season: Season 8
Episode: The Long Night
Deaths: Lord Beric Dondarrion (Wight) - Died when Arya Stark killed the Night King.
Season: Season 3
Episode: Kissed by Fire
Deaths: Lord Beric Dondarrion - Stabbed in the stomach by an unknown person off-screen, mentioned. Resurrected by Thoros via the Lord of Light.
Season: Season 3
Episode: Kissed by Fire
Deaths: Lord Beric Dondarrion - Shot in the back by an unknown person with an arrow off-screen, mentioned. Resurrected by Thoros via the Lord of Light.


In [44]:
from langchain.llms import OpenAI # Choosing which LLM
from langchain.chains.question_answering import load_qa_chain


In [45]:
llm = OpenAI(temperature=0, openai_api_key=api_key) # finetuning the LLM
chain = load_qa_chain(llm, chain_type="map_reduce") #verbose=True to check under the hood


In [69]:
query = "When most solders died"
docs = vector_db.similarity_search(query, k=1)


In [70]:
for doc in docs:
    print(doc.page_content)


Season: Season 6
Episode: Battle of the Bastards
Deaths: 5,950 Unnamed Bolton Soldiers - Killed by unknown Northern soldiers and knights of the Vale.


In [78]:
data = docs[0].page_content
season = data.split('Season: ')[1].split('\n')[0]
episode = data.split('Episode: ')[1].split('\n')[0]
deaths = data.split('Deaths: ')[1]

print(f"Season: {season}")
print(f"Episode: {episode}")
print(f"Deaths: {deaths}")


Season: Season 6
Episode: Battle of the Bastards
Deaths: 5,950 Unnamed Bolton Soldiers - Killed by unknown Northern soldiers and knights of the Vale.


In [74]:
df = pd.read_csv('../processed_data/data_cleaned_Carmen/20231129_char_preds_filtered_only_dead.csv')

list_of_names = df['name'].to_list()


In [79]:
# create a dataframe to store the results
df = pd.DataFrame(columns=['name', 'season', 'episode', 'deaths'])

for name in list_of_names:
    # build query for for when name died
    query = f"When did {name} died"
    docs = vector_db.similarity_search(query, k=1)

    # get the data from the document
    data = docs[0].page_content
    season = data.split('Season: ')[1].split('\n')[0]
    episode = data.split('Episode: ')[1].split('\n')[0]
    deaths = data.split('Deaths: ')[1]

    # add the data to the dataframe
    df = df.append({'name': name, 'season': season, 'episode': episode, 'deaths': deaths}, ignore_index=True)


  df = df.append({'name': name, 'season': season, 'episode': episode, 'deaths': deaths}, ignore_index=True)
  df = df.append({'name': name, 'season': season, 'episode': episode, 'deaths': deaths}, ignore_index=True)
  df = df.append({'name': name, 'season': season, 'episode': episode, 'deaths': deaths}, ignore_index=True)
  df = df.append({'name': name, 'season': season, 'episode': episode, 'deaths': deaths}, ignore_index=True)
  df = df.append({'name': name, 'season': season, 'episode': episode, 'deaths': deaths}, ignore_index=True)
  df = df.append({'name': name, 'season': season, 'episode': episode, 'deaths': deaths}, ignore_index=True)
  df = df.append({'name': name, 'season': season, 'episode': episode, 'deaths': deaths}, ignore_index=True)
  df = df.append({'name': name, 'season': season, 'episode': episode, 'deaths': deaths}, ignore_index=True)
  df = df.append({'name': name, 'season': season, 'episode': episode, 'deaths': deaths}, ignore_index=True)
  df = df.append({'name': na

In [85]:
df.head()


Unnamed: 0,name,season,episode,deaths
0,Viserys II Targaryen,Season 8,The Bells,Lord Varys - Burned alive by Drogon with his f...
1,Walder Frey,Season 6,The Winds of Winter,Lord Walder Frey - Throat slit by Arya Stark w...
2,Aemma Arryn,Season 5,The Gift,Maester Aemon Targaryen - Died of natural causes.
3,Tommen Baratheon,Season 2,Blackwater,Unnamed Baratheon Soldier - Stabbed in the che...
4,Valarr Targaryen,Season 2,Valar Morghulis,Doreah - Died of natural causes after being lo...


In [81]:
# save the dataframe to a csv file
df.to_csv('../processed_data/data_cleaned_rui/missing_deads_season_episode.csv', index=False)
