## Create vector store index

This notebook can be used to create and save a vector index created using text-embedding-ada-002 from OpenAI.

#### Set paths

In [1]:
# Project root; every other location below is derived from it
path_to_root = '/work/PernilleHøjlundBrams#8577/NLP_2023_P'

# File holding the API key
path_to_key = path_to_root + '/config/keys.txt'

# Folder holding the input data
path_to_data = path_to_root + '/data'

# Folder in which the vector index should be stored
path_to_vector_store = path_to_root + '/index'

#### Load data

In [2]:
import pandas as pd
# Read the comma-separated articles table from the data folder
df = pd.read_csv(
    filepath_or_buffer=f'{path_to_data}/articles.csv',
    sep=",",
)

### Create context chunks from documents
This section converts the .csv file containing news articles into smaller chunks, with the *article body* as the main text and the *author*, *URL*, *source*, *date published* and *title* stored in a metadata dictionary.

In [3]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, Document

# Build one Document per DataFrame row: the article body becomes the text,
# the remaining article fields are attached as metadata.
documents = [
    Document(
        text=record['Article Body'],
        metadata=dict(
            title=record['Article Header'],
            source=record['Source'],
            author=record['Author'],
            date=record['Published Date'],
            url=record['Url'],
        ),
    )
    for _, record in df.iterrows()  # the row label is not needed
]

### Create service context for the vector index

In [4]:
from llama_index import (
    ServiceContext,
    OpenAIEmbedding,
    PromptHelper,
)
from llama_index.text_splitter import SentenceSplitter

# --- Sentence splitter used to cut the documents into chunks
text_splitter = SentenceSplitter(
    chunk_overlap=10,
    chunk_size=512,
)


### Split documents into nodes

In [5]:
# Run the splitter over all documents to obtain the nodes
nodes = text_splitter.get_nodes_from_documents(documents=documents)

In [9]:
# Put the nodes into a DataFrame for inspection and export.
# NOTE(review): the resulting column layout depends on how pandas unpacks each node object — confirm.
nodes_test = pd.DataFrame(data=nodes)

In [None]:
# Show the entry in row 0 of column 7 (single label-based lookup)
nodes_test.loc[0, 7]

In [14]:
import os

# Write the node table to <data folder>/prelim_dataframes/nodes.csv.
# The location is built from path_to_data (i.e. f'{path_to_root}/data', the same
# place as before) instead of re-deriving the data folder from path_to_root.
prelim_dir = f"{path_to_data}/prelim_dataframes"

# Create the target folder first: to_csv raises if the parent directory is missing.
os.makedirs(prelim_dir, exist_ok=True)

nodes_test.to_csv(f"{prelim_dir}/nodes.csv")

### Create VectorStore index

In [18]:
# NOTE(review): this cell is disabled (every line is commented out).
# When enabled it adds f'{path_to_root}/src' to sys.path and imports
# read_api_key from utils.
# import sys
# sys.path.append(f'{path_to_root}/src')

# from utils import read_api_key

In [19]:
# NOTE(review): this cell is disabled (every line is commented out).
# When enabled it reads the key with read_api_key(path_to_key) and exports it
# through the OPENAI_API_KEY environment variable.
# # --- Load API key
# api_key = read_api_key(path_to_key)

# import os

# # Set the OpenAI API key in the environment variables
# os.environ["OPENAI_API_KEY"] = api_key


In [None]:
# NOTE(review): this cell is disabled (every line is commented out).
# When enabled it builds a VectorStoreIndex from `nodes` and persists it to the
# literal relative path "full_dataset_nodes_index"; path_to_vector_store from the
# path set-up cell is not used here — confirm the intended destination.
# # --- Generate vector index
# index = VectorStoreIndex(nodes,show_progress = True)

# # --- Persist index to disk
# index.storage_context.persist("full_dataset_nodes_index")