In [1]:
# -*- coding: utf-8 -*-
"""
Author: Andrew Burns
Created Date: March 2, 2025
Updated Date: March 7, 2025
"""
from pathlib import Path
from lxml import html
from lxml.html.clean import clean_html

dataset = []

# Directory containing the HTML files
directory = Path(r'C:\git\eviewshelp\chm\test')

# Loop through all .html files in the directory
for html_file in directory.glob('*program*.html'):
    with open(html_file, 'r', encoding='utf-8') as file:
        content = file.read()   
    print(f"Read file: {html_file.name}")  # Optional: Display file name
    parsed_html = html.fromstring(content)
# Clean the HTML
    cleaned_html = clean_html(parsed_html)
# Extract text content
    content= cleaned_html.text_content().strip()
#dataset = clean_html(dataset).text_content().strip()
    print(f'Loaded {len(content)} entries')
    dataset+=" "+content

Read file: commandcmd-program.html
Loaded 868 entries
Read file: cprogram-Control_of_Execution.html
Loaded 18072 entries
Read file: cprogram-EViews_Programming.html
Loaded 1260 entries
Read file: cprogram-Multiple_Program_Files.html
Loaded 3739 entries
Read file: cprogram-Program_Arguments.html
Loaded 3135 entries
Read file: cprogram-Program_Basics.html
Loaded 19852 entries
Read file: cprogram-Program_Modes.html
Loaded 4991 entries
Read file: cprogram-Program_Options.html
Loaded 2800 entries
Read file: cprogram-Program_Variables.html
Loaded 16199 entries
Read file: cprogram-References.html
Loaded 503 entries
Read file: cprogram-Simple_Programs.html
Loaded 6312 entries
Read file: cprogram-Subroutines.html
Loaded 22955 entries
Read file: cprogram-User-Defined_Dialogs.html
Loaded 21704 entries
Read file: cprogram-Version_4_Compatibility_Notes.html
Loaded 9857 entries
Read file: progref-Programming_Language_Summary.html
Loaded 697 entries
Read file: progref-Program_Statements.html
Loaded 1

In [None]:
#%%Begin program
import ollama

EMBEDDING_MODEL = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
LANGUAGE_MODEL = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

# Each element in the VECTOR_DB will be a tuple (chunk, embedding)
# The embedding is a list of floats, for example: [0.1, 0.04, -0.34, 0.21, ...]
VECTOR_DB = []
#%% routines
def add_chunk_to_database(chunk):
  embedding = ollama.embed(model=EMBEDDING_MODEL, input=chunk)['embeddings'][0]
  VECTOR_DB.append((chunk, embedding))
  
def cosine_similarity(a, b):
  dot_product = sum([x * y for x, y in zip(a, b)])
  norm_a = sum([x ** 2 for x in a]) ** 0.5
  norm_b = sum([x ** 2 for x in b]) ** 0.5
  return dot_product / (norm_a * norm_b)

def retrieve(query, top_n=3):
  query_embedding = ollama.embed(model=EMBEDDING_MODEL, input=query)['embeddings'][0]
  # temporary list to store (chunk, similarity) pairs
  similarities = []
  for chunk, embedding in VECTOR_DB:
    similarity = cosine_similarity(query_embedding, embedding)
    similarities.append((chunk, similarity))
  # sort by similarity in descending order, because higher similarity means more relevant chunks
  similarities.sort(key=lambda x: x[1], reverse=True)
  # finally, return the top N most relevant chunks
  return similarities[:top_n]

    

In [4]:
import time

total_chunks = len(dataset)
progress_interval = max(1, total_chunks // 1000)  # Every 0.1% of total chunks
start_time = time.time()

for i, chunk in enumerate(dataset):
    add_chunk_to_database(chunk)
    
    if i % progress_interval == 0 or i == total_chunks - 1:
        elapsed_time = time.time() - start_time
        progress = (i + 1) / total_chunks
        estimated_total_time = elapsed_time / progress if progress > 0 else 0
        time_remaining = estimated_total_time - elapsed_time

        hours, rem = divmod(time_remaining, 3600)
        minutes, seconds = divmod(rem, 60)
        
        print(f'Progress: {progress * 100:.1f}% - {i+1}/{total_chunks} chunks processed')
        print(f'Estimated time remaining: {int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}')

Progress: 0.0% - 1/137585 chunks processed
Estimated time remaining: 01:19:06
Progress: 0.1% - 138/137585 chunks processed
Estimated time remaining: 01:13:36
Progress: 0.2% - 275/137585 chunks processed
Estimated time remaining: 01:11:56
Progress: 0.3% - 412/137585 chunks processed
Estimated time remaining: 01:10:50
Progress: 0.4% - 549/137585 chunks processed
Estimated time remaining: 01:10:28
Progress: 0.5% - 686/137585 chunks processed
Estimated time remaining: 01:11:21
Progress: 0.6% - 823/137585 chunks processed
Estimated time remaining: 01:11:54
Progress: 0.7% - 960/137585 chunks processed
Estimated time remaining: 01:11:53
Progress: 0.8% - 1097/137585 chunks processed
Estimated time remaining: 01:11:41
Progress: 0.9% - 1234/137585 chunks processed
Estimated time remaining: 01:11:13
Progress: 1.0% - 1371/137585 chunks processed
Estimated time remaining: 01:10:54
Progress: 1.1% - 1508/137585 chunks processed
Estimated time remaining: 01:10:55
Progress: 1.2% - 1645/137585 chunks pr

In [None]:
#%% process data
#for i, chunk in enumerate(dataset):
##  add_chunk_to_database(chunk)
#  print(f'Added chunk {i+1}/{len(dataset)} to the database')

Added chunk 1/137585 to the database
Added chunk 2/137585 to the database
Added chunk 3/137585 to the database
Added chunk 4/137585 to the database
Added chunk 5/137585 to the database
Added chunk 6/137585 to the database
Added chunk 7/137585 to the database
Added chunk 8/137585 to the database
Added chunk 9/137585 to the database
Added chunk 10/137585 to the database
Added chunk 11/137585 to the database
Added chunk 12/137585 to the database
Added chunk 13/137585 to the database
Added chunk 14/137585 to the database
Added chunk 15/137585 to the database
Added chunk 16/137585 to the database
Added chunk 17/137585 to the database
Added chunk 18/137585 to the database
Added chunk 19/137585 to the database
Added chunk 20/137585 to the database
Added chunk 21/137585 to the database
Added chunk 22/137585 to the database
Added chunk 23/137585 to the database
Added chunk 24/137585 to the database
Added chunk 25/137585 to the database
Added chunk 26/137585 to the database
Added chunk 27/137585

KeyboardInterrupt: 

In [5]:
#%% save the   enumerated database
import pickle
  
def save_vector_db(filename='vector_db.pkl'):
    with open(filename, 'wb') as file:
        pickle.dump(VECTOR_DB, file)
    print(f"VECTOR_DB saved to {filename}")
    
def load_vector_db(filename='vector_db.pkl'):
    global VECTOR_DB
    with open(filename, 'rb') as file:
        VECTOR_DB = pickle.load(file)
    print(f"VECTOR_DB loaded from {filename}")

# Example call to load

#load_vector_db("EviewsInfo.pkl")    

# Example call to save
save_vector_db("EviewsInfo.pkl")
  

VECTOR_DB saved to EviewsInfo.pkl


In [6]:
  
 #%% Do the magic
  
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve(input_query)

print('Retrieved knowledge:')
for chunk, similarity in retrieved_knowledge:
  print(f' - (similarity: {similarity:.2f}) {chunk}')

instruction_prompt = f'''You are a helpful chatbot specializing in EViews programming.
Use only the following pieces of context to answer the question. Prioritize this information and do not make up any new information:
{'\n'.join([f' - {chunk}' for chunk, similarity in retrieved_knowledge])}
'''
#instruction_prompt = f'''You are a helpful chatbot.
#Use only the following pieces of context to answer the question. Don't make up any new information:
#{'\n'.join([f' - {chunk}' for chunk, similarity in retrieved_knowledge])}
#'''

stream = ollama.chat(
  model=LANGUAGE_MODEL,
  messages=[
    {'role': 'system', 'content': instruction_prompt},
    {'role': 'user', 'content': input_query},
  ],
  stream=True,
)

# print the response from the chatbot in real-time
print('Chatbot response:')
for chunk in stream:
  print(chunk['message']['content'], end='', flush=True)     

Ask me a question:  Using the EViews econeometric language write a loop of a string containing the series variables x y z and create a new series variable called sum which is equal to teh three variables added together.  Only use EViews commands do not use python c or any opther programming language


Retrieved knowledge:
 - (similarity: 0.53) =
 - (similarity: 0.53) =
 - (similarity: 0.53) =
Chatbot response:
Here is an EViews command that creates a loop, adds the three series variables, and then calculates a new series variable as their sum:

```
loop (x = x, y = y, z = z)
sum(x + y + z) := x + y + z
end;
```

This command defines a loop with named iterations for `x`, `y`, and `z`. Inside the loop, it calculates the sum of the three variables using the arithmetic operators `+` and stores the result in `sum(x + y + z)`.

Note that this command assumes that you are running EViews version 8 or earlier. In later versions, you can use `SUM(x + y + z) := x + y + z;` instead of looping over these commands.

In [12]:
print(VECTOR_DB[1])


('p', [-0.010467437, 0.042489957, -0.036645785, 0.01405966, 0.014432011, 0.05906182, -0.0004181107, -0.016029326, -0.016258057, -0.07520244, -0.022527006, -0.044600144, -0.07355539, -0.015139903, 0.0061415923, 0.041846853, 0.041392505, 0.017007584, 0.018186258, -0.011615888, 0.041572433, 0.041390106, 0.023189226, -0.005749395, -0.020055238, 0.016592171, 0.052495293, 0.009966589, -0.016487319, -0.049995262, 0.0056141666, 0.01603666, 0.013888949, -0.00088530476, 0.016068432, -0.005997243, 0.030630913, 0.008960432, -0.008777918, 0.033743437, -0.032206982, -0.0007177639, -0.005882533, -0.028848322, -0.037640207, -0.006902742, -0.0010164713, 0.06386116, -0.040799152, 0.009652419, -0.066942155, 0.022128345, -0.004309547, 0.0010704304, -0.017608603, -0.0108743375, -0.021685157, -0.047673795, 0.023201082, -0.017969409, -0.006947705, 0.033698373, 0.024187202, -0.029984074, 0.039681092, -0.037730284, -0.021764707, 0.04118818, -0.020240434, -0.0027808927, -0.03371502, -0.016884841, 0.0066822926, 