Create a folder called "classifier" and download the classifier model from https://huggingface.co/nickmuchi/finbert-tone-finetuned-finance-topic-classification/tree/main
Save the following files inside the "classifier" folder:
* config.json;
* pytorch_model.bin;
* special_tokens_map.json;
* tokenizer_config.json;
* tokenizer.json;
* vocab.txt;

# Imports and Downloads

In [None]:
!pip install requests beautifulsoup4 urllib3 selenium webdriver-manager transformers torch nltk

In [None]:
import json
import logging
import threading
import time
from collections import deque
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

import nltk
import requests
import torch
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from webdriver_manager.firefox import GeckoDriverManager

# Crawler

In [None]:
# Configure the root logger once for the notebook: INFO level, with each
# record formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class Crawler:
    """Multi-threaded, breadth-first web crawler with a Selenium fallback.

    Pages are fetched with ``requests`` first; when that yields nothing, or
    the page announces "javascript required", the page is re-fetched through
    a headless Firefox instance borrowed from a small driver pool.  Every
    successfully fetched page is reduced to a dict (see ``extract_info``) and
    accumulated in ``self.results``.

    Args:
        start_url: URL the crawl starts from (depth 0).
        max_depth: Maximum link distance from ``start_url`` that is crawled.
        max_pages: Upper bound on the number of URLs that are visited.
        max_workers: Number of crawl threads / concurrently in-flight URLs.
        selenium_workers: Number of headless browsers created by ``run``.
        save_interval: Write a checkpoint file every this many crawled pages
            (``0`` disables checkpoints).
        crawl_delay: Seconds each worker pauses after fetching a page.
    """

    def __init__(self, start_url, max_depth=3, max_pages=100, max_workers=10,
                 selenium_workers=2, save_interval=10, crawl_delay=1.0):
        self.start_url = start_url
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.visited = set()
        self.results = []
        self.robot_parsers = {}
        self.max_workers = max_workers
        self.selenium_workers = selenium_workers
        self.save_interval = save_interval
        self.crawl_delay = crawl_delay
        self.pages_crawled = 0
        self.session = requests.Session()
        retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
        self.session.mount('http://', HTTPAdapter(max_retries=retries))
        self.session.mount('https://', HTTPAdapter(max_retries=retries))
        # Idle drivers only; a driver is removed while a worker is using it.
        self.selenium_pool = []
        self._drivers_borrowed = 0
        # Guards selenium_pool / _drivers_borrowed and lets workers wait for a driver.
        self._pool_cond = threading.Condition()
        # Guards visited / results / pages_crawled, which worker threads mutate.
        self._state_lock = threading.Lock()

    def init_selenium(self):
        """Start and return a headless Firefox driver that accepts all cookies."""
        options = Options()
        options.add_argument("-headless")
        options.set_preference("network.cookie.cookieBehavior", 0)  # Accept all cookies
        driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()), options=options)
        return driver

    def accept_cookies(self, driver):
        """Click an 'Accept' / 'Aceitar' button if one becomes clickable within 10 s.

        Best effort: any Selenium failure (typically a timeout because the
        page has no such button) is logged and ignored.
        """
        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Accept') or contains(text(), 'Aceitar')]"))
            ).click()
        except Exception:
            # Not a bare ``except:`` so KeyboardInterrupt/SystemExit still propagate.
            logging.info("No cookie acceptance button found or not clickable")

    def _borrow_driver(self):
        """Take an idle driver from the pool, waiting if all are in use.

        Returns ``None`` when no driver exists at all (pool empty and nothing
        borrowed), so callers never block forever or hit ``IndexError``.
        """
        with self._pool_cond:
            while not self.selenium_pool:
                if self._drivers_borrowed == 0:
                    return None
                self._pool_cond.wait()
            self._drivers_borrowed += 1
            return self.selenium_pool.pop()

    def _return_driver(self, driver):
        """Put a borrowed driver back and wake one waiting worker."""
        with self._pool_cond:
            self.selenium_pool.append(driver)
            self._drivers_borrowed -= 1
            self._pool_cond.notify()

    def fetch_page(self, url, use_selenium=False):
        """Return the HTML of ``url`` or ``None`` when it cannot be fetched.

        Args:
            url: Page to download.
            use_selenium: Render the page in a pooled headless browser instead
                of issuing a plain HTTP GET.

        Returns:
            The page source as a string, or ``None`` on any request/browser
            error, on a non-200 HTTP status, or when no driver is available.
        """
        if use_selenium:
            driver = self._borrow_driver()
            if driver is None:
                logging.warning(f"No Selenium driver available for: {url}")
                return None
            try:
                driver.get(url)
                self.accept_cookies(driver)
                return driver.page_source
            except Exception as e:
                logging.error(f"Selenium request failed: {e}")
                return None
            finally:
                self._return_driver(driver)

        try:
            response = self.session.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        except requests.RequestException as e:
            logging.error(f"Request failed: {e}")
            return None
        return response.text if response.status_code == 200 else None

    def parse_robots(self, url):
        """Return the (cached) ``RobotFileParser`` for the site hosting ``url``.

        One parser is kept per ``scheme://netloc``.  When robots.txt cannot be
        read the error is logged and the unread parser is cached anyway.
        NOTE(review): an unread ``RobotFileParser`` appears to answer
        ``can_fetch`` with ``False`` for every URL, i.e. such sites are
        skipped entirely - confirm that this is the intended policy.
        """
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        if base_url not in self.robot_parsers:
            robots_url = urljoin(base_url, "/robots.txt")
            rp = RobotFileParser()
            rp.set_url(robots_url)
            try:
                rp.read()
            except Exception as e:
                logging.error(f"Failed to read robots.txt: {e}")
            self.robot_parsers[base_url] = rp
        return self.robot_parsers[base_url]

    def is_allowed(self, url):
        """Return whether robots.txt lets the generic ``*`` agent fetch ``url``."""
        rp = self.parse_robots(url)
        return rp.can_fetch("*", url)

    @staticmethod
    def _is_crawlable(url):
        """Return True for http(s) URLs; mailto:, javascript:, tel: etc. are skipped."""
        try:
            return urlparse(url).scheme in ('http', 'https')
        except ValueError:
            return False

    @staticmethod
    def _normalize_pub_time(raw):
        """Convert a raw ISO-8601 timestamp to ``datetime.isoformat()`` form.

        A trailing ``Z`` (UTC designator) is rewritten to ``+00:00`` because
        ``datetime.fromisoformat`` rejects it before Python 3.11.  Returns
        ``None`` for missing or unparseable values.
        """
        if not raw:
            return None
        candidate = raw.strip()
        if candidate[-1:] in ('Z', 'z'):
            candidate = candidate[:-1] + '+00:00'
        try:
            return datetime.fromisoformat(candidate).isoformat()
        except ValueError:
            logging.warning(f"Could not parse publication time: {raw}")
            return None

    def extract_info(self, url, html):
        """Parse ``html`` and return the record stored for ``url``.

        Returns:
            A dict with keys ``url``, ``title`` (``'No Title'`` if absent),
            ``text`` (all ``<p>`` and ``<h1>``-``<h6>`` text joined by
            spaces), ``links`` (de-duplicated absolute URLs with fragments
            removed, in no particular order), ``pub_time`` (ISO string or
            ``None``) and ``crawl_time`` (local ISO timestamp).
        """
        soup = BeautifulSoup(html, 'html.parser')
        title_tag = soup.find('title')
        title = title_tag.text.strip() if title_tag else 'No Title'
        text = ' '.join(p.text.strip() for p in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']))

        unique_links = set()
        for anchor in soup.find_all('a', href=True):
            try:
                absolute = urljoin(url, anchor['href'])
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip it.
                continue
            # "page#a" and "page#b" are the same document; keep one URL for both.
            unique_links.add(absolute.split('#', 1)[0])
        links = list(unique_links)

        raw_time = None
        meta_time = soup.find('meta', property='article:published_time')
        if meta_time:
            raw_time = meta_time.get('content')
        else:
            for name in ['pubdate', 'publishdate', 'timestamp', 'date']:
                meta = soup.find('meta', attrs={'name': name})
                if meta:
                    # .get(): a meta tag without a content attribute must not crash the crawl.
                    raw_time = meta.get('content')
                    break

        return {
            'url': url,
            'title': title,
            'text': text,
            'links': links,
            'pub_time': self._normalize_pub_time(raw_time),
            'crawl_time': datetime.now().isoformat()
        }

    def save_results(self, filename='results.json'):
        """Write every result collected so far to ``filename`` as JSON.

        Results are kept in memory after saving, so checkpoint files are
        cumulative snapshots and the final file contains the whole crawl
        (previously each save discarded the results, leaving the final file
        with only the pages crawled after the last checkpoint).
        """
        snapshot = list(self.results)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(snapshot, f, indent=4, ensure_ascii=False)
        logging.info(f"Saved {len(snapshot)} results to {filename}")

    def _claimable(self, url):
        """Return True while ``url`` is unvisited and the page budget is not spent."""
        return url not in self.visited and len(self.visited) < self.max_pages

    def crawl(self, url, depth=0):
        """Fetch one page, record it and return the links found on it.

        Args:
            url: Page to crawl.
            depth: Link distance of ``url`` from the start URL.

        Returns:
            The page's outgoing links, or ``[]`` when the page is skipped
            (too deep, page budget spent, already visited, disallowed by
            robots.txt) or could not be fetched.
        """
        if depth > self.max_depth:
            return []
        with self._state_lock:
            if not self._claimable(url):
                return []

        # The robots.txt lookup may hit the network, so it runs outside the lock.
        if not self.is_allowed(url):
            return []

        with self._state_lock:
            # Re-check and claim atomically so two workers never crawl the same URL.
            if not self._claimable(url):
                return []
            self.visited.add(url)

        logging.info(f"Crawling: {url}")

        html = self.fetch_page(url)
        if not html or "javascript required" in html.lower():
            html = self.fetch_page(url, use_selenium=True)

        # Respectful crawling: pause after every fetch, successful or not.
        if self.crawl_delay and self.crawl_delay > 0:
            time.sleep(self.crawl_delay)

        if not html:
            return []

        info = self.extract_info(url, html)
        checkpoint = None
        with self._state_lock:
            self.results.append(info)
            self.pages_crawled += 1
            if self.save_interval and self.pages_crawled % self.save_interval == 0:
                checkpoint = self.pages_crawled
        if checkpoint is not None:
            self.save_results(f'partial_results_{checkpoint}.json')

        return info['links']

    def run(self):
        """Crawl breadth-first from ``start_url`` and write ``final_results.json``.

        Discovered links wait in a FIFO frontier together with their depth,
        and at most ``max_workers`` URLs are in flight at any time.  Selenium
        drivers are always shut down, even when the crawl is interrupted.
        """
        try:
            # Created inside the try so a failing driver start does not leak earlier ones.
            for _ in range(self.selenium_workers):
                self.selenium_pool.append(self.init_selenium())

            frontier = deque([(self.start_url, 0)])
            queued = {self.start_url}
            in_flight = {}
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                while True:
                    while (frontier and len(in_flight) < self.max_workers
                           and len(self.visited) < self.max_pages):
                        url, depth = frontier.popleft()
                        in_flight[executor.submit(self.crawl, url, depth)] = (url, depth)
                    if not in_flight:
                        break

                    # Block until any one in-flight crawl finishes.
                    future = next(as_completed(list(in_flight)))
                    url, depth = in_flight.pop(future)
                    try:
                        links = future.result()
                    except Exception as exc:
                        logging.error(f"Exception for {url}: {exc}")
                        continue

                    if depth >= self.max_depth:
                        continue
                    for link in links:
                        if link not in queued and self._is_crawlable(link):
                            queued.add(link)
                            frontier.append((link, depth + 1))
        finally:
            for driver in self.selenium_pool:
                try:
                    driver.quit()
                except Exception as exc:
                    # Keep going so one stuck browser cannot leak the others.
                    logging.error(f"Failed to quit Selenium driver: {exc}")
            self.selenium_pool.clear()

        self.save_results('final_results.json')

if __name__ == "__main__":
    # Example crawl: start at example.com, follow links up to 10 levels deep,
    # stop after 150 pages and checkpoint every 30 crawled pages.
    start_url = "https://example.com"
    crawler = Crawler(
        start_url,
        max_depth=10,
        max_pages=150,
        max_workers=6,
        selenium_workers=2,
        save_interval=30,
    )
    crawler.run()

# Indexer

In [None]:
# Download the NLTK 'punkt' and 'stopwords' resources.
# NOTE(review): AIIndexer below does not reference nltk itself; these look
# like prerequisites for the search-engine cell - confirm they are needed here.
nltk.download('punkt')
nltk.download('stopwords')

class AIIndexer:
    """Attach topic classifications and a URL score to crawled pages.

    Reads the crawler's JSON output, classifies each page's text with the
    sequence-classification model stored under ``model_path`` and writes the
    enriched records to ``output_file``.
    """

    def __init__(self, input_file='./final_results.json', output_file='alexandrya_ai.json', model_path='./classifier'):
        self.input_file = input_file
        self.output_file = output_file
        self.model_path = model_path
        self.indexed_data = []

        # Tokenizer and model are both loaded from the local model directory.
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        # Label names, paired positionally with the model's output scores.
        self.labels = [
            "Analyst Update",
            "Fed | Central Banks",
            "Company | Product News",
            "Treasuries | Corporate Debt",
            "Dividend",
            "Earnings",
            "Energy | Oil",
            "Financials",
            "Currencies",
            "General News | Opinion",
            "Gold | Metals | Materials",
            "IPO",
            "Legal | Regulation",
            "M&A | Investments",
            "Macro",
            "Markets",
            "Politics",
            "Personnel Change",
            "Stock Commentary",
            "Stock Movement",
        ]

    def classify_text(self, text):
        """Return ``{"label", "score"}`` dicts for ``text``, highest score first.

        The input is truncated to 512 tokens and the scores are the softmax
        of the model's logits.
        """
        encoded = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            logits = self.model(**encoded).logits

        probabilities = torch.softmax(logits, dim=1).squeeze().tolist()

        ranked = [
            {"label": name, "score": probability}
            for name, probability in zip(self.labels, probabilities)
        ]
        ranked.sort(key=lambda entry: entry["score"], reverse=True)
        return ranked

    def score_url(self, url):
        """Return a heuristic integer score for ``url``.

        +5 for HTTPS, +1 per 10 characters of URL length (capped at 5) and a
        bonus for the first matching domain suffix: ``.com`` 3, ``.br`` 4,
        ``.gov`` 5, ``.edu`` 5.
        """
        suffix_bonus = (('.com', 3), ('.br', 4), ('.gov', 5), ('.edu', 5))
        parts = urlparse(url)

        # HTTPS
        points = 5 if parts.scheme == 'https' else 0

        # URL length, at most 5 points
        points += min(len(url) // 10, 5)

        # Domain suffix: only the first match counts
        host = parts.netloc
        for suffix, bonus in suffix_bonus:
            if host.endswith(suffix):
                points += bonus
                break

        return points

    def index(self):
        """Classify and score every page in ``input_file``; write ``output_file``."""
        with open(self.input_file, 'r', encoding='utf-8') as source:
            pages = json.load(source)

        for page in pages:
            topics = self.classify_text(page['text'])
            points = self.score_url(page['url'])
            self.indexed_data.append({
                'url': page['url'],
                'title': page['title'],
                'text': page['text'],
                'links': page['links'],  # keep the outgoing links in the index
                'classification_results': topics,
                'url_score': points,
            })

        with open(self.output_file, 'w', encoding='utf-8') as target:
            json.dump(self.indexed_data, target, ensure_ascii=False, indent=4)

        print(f"Indexing complete. Results saved to {self.output_file}")

    def search(self, query, top_n=5):
        """Return up to ``top_n`` indexed pages whose top label matches the query's.

        Matches are ordered by their score for that label, highest first.
        """
        wanted = self.classify_text(query)[0]['label']

        def wanted_score(page):
            # Score the page received for the query's top label.
            return next(
                r['score'] for r in page['classification_results'] if r['label'] == wanted
            )

        matches = [
            page for page in self.indexed_data
            if page['classification_results'][0]['label'] == wanted
        ]
        matches.sort(key=wanted_score, reverse=True)
        return matches[:top_n]

if __name__ == "__main__":
    # Build the index with the default paths: reads ./final_results.json and
    # writes alexandrya_ai.json, using the model stored in ./classifier.
    indexer = AIIndexer()
    indexer.index()

# PageRank

Note: this step is currently somewhat slow; performance improvements are planned.

In [None]:
import json
import math
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK resources requested here: 'punkt' and 'stopwords'
# (this cell imports word_tokenize and the stopwords corpus above).
nltk.download('punkt')
nltk.download('stopwords')

class SearchEngine:
    """Rank indexed pages by BM25, URL score, PageRank and classifier confidence.

    The index file is the JSON list written by the indexer; each entry needs
    ``url``, ``title`` and ``text`` and may carry ``links``, ``url_score`` and
    ``classification_results``.  Term statistics and PageRank are computed
    once at construction so that queries do not re-tokenise the corpus.
    """

    def __init__(self, indexed_file='alexandrya_ai.json'):
        with open(indexed_file, 'r', encoding='utf-8') as f:
            self.indexed_data = json.load(f)

        self.stop_words = set(stopwords.words('english'))
        # text -> (term frequency Counter, token count) for every indexed text.
        self._doc_stats = {}
        # term -> number of indexed entries whose text contains the term.
        self._doc_freqs = Counter()
        self._build_term_index()
        self.avg_doc_length = self.calculate_avg_doc_length()
        self.pagerank_scores = self.calculate_pagerank()

    def _build_term_index(self):
        """Tokenise each indexed text once and count document frequencies."""
        for item in self.indexed_data:
            text = item['text']
            stats = self._doc_stats.get(text)
            if stats is None:
                terms = self.tokenize(text)
                stats = (Counter(terms), len(terms))
                self._doc_stats[text] = stats
            # One count per entry, even when two entries share the same text.
            self._doc_freqs.update(stats[0].keys())

    def _term_stats(self, text):
        """Return ``(term_freqs, length)`` for ``text``, cached for indexed texts."""
        stats = self._doc_stats.get(text)
        if stats is None:
            # Text from outside the index: compute without growing the cache.
            terms = self.tokenize(text)
            stats = (Counter(terms), len(terms))
        return stats

    def calculate_pagerank(self, damping_factor=0.85, num_iterations=100):
        """Return ``{url: rank}`` after ``num_iterations`` PageRank iterations.

        Only links between indexed pages pass rank on.  A page's rank is
        divided by its total number of links, so rank pointing outside the
        index is dropped and the ranks need not sum to 1.  An empty index
        yields ``{}``.
        """
        num_pages = len(self.indexed_data)
        if num_pages == 0:
            return {}

        initial_value = 1.0 / num_pages
        pagerank = {item['url']: initial_value for item in self.indexed_data}

        # Build target -> [(source, out-degree)] once, not per iteration.
        inbound = {}
        for item in self.indexed_data:
            links = item.get('links')
            if not links:
                continue
            out_degree = len(links)
            for target in set(links):
                if target in pagerank:
                    inbound.setdefault(target, []).append((item['url'], out_degree))

        base = (1 - damping_factor) / num_pages
        for _ in range(num_iterations):
            new_pagerank = {}
            for url in pagerank:
                incoming_pr = 0
                for source, out_degree in inbound.get(url, ()):
                    incoming_pr += pagerank[source] / out_degree
                new_pagerank[url] = base + damping_factor * incoming_pr
            pagerank = new_pagerank

        return pagerank

    def calculate_avg_doc_length(self):
        """Return the mean token count of the indexed texts (0.0 if empty)."""
        if not self.indexed_data:
            return 0.0
        total_length = sum(self._term_stats(item['text'])[1] for item in self.indexed_data)
        return total_length / len(self.indexed_data)

    def tokenize(self, text):
        """Lower-case ``text`` and return its alphanumeric, non-stop-word tokens."""
        tokens = word_tokenize(text.lower())
        return [token for token in tokens if token.isalnum() and token not in self.stop_words]

    def compute_bm25_score(self, query, document, k1=1.5, b=0.75):
        """Return the BM25 score of ``document`` (raw text) for ``query``.

        IDF is ``log(N - n + 0.5) - log(n + 0.5)``, with ``N`` the number of
        indexed entries and ``n`` the number that contain the term; it is
        negative for terms found in more than half of the entries.
        """
        query_terms = self.tokenize(query)
        term_freqs, doc_length = self._term_stats(document)
        num_docs = len(self.indexed_data)

        if self.avg_doc_length:
            length_norm = k1 * (1 - b + b * doc_length / self.avg_doc_length)
        else:
            # No tokens anywhere in the index: skip length normalisation.
            length_norm = k1 * (1 - b)

        score = 0
        for term in query_terms:
            tf = term_freqs.get(term, 0)
            if not tf:
                continue
            containing = self._doc_freqs[term]
            idf = math.log(num_docs - containing + 0.5) - math.log(containing + 0.5)
            numerator = tf * (k1 + 1)
            denominator = tf + length_norm
            score += idf * numerator / denominator

        return score

    def get_snippet(self, text, query, snippet_length=200):
        """Return the ``snippet_length``-word window with the most distinct query terms.

        The earliest window wins ties and ``'...'`` is always appended.  Words
        are tokenised one at a time and the window is slid across them, so the
        cost is linear in the text length; the final window is now included.
        """
        words = text.split()
        query_terms = set(self.tokenize(query))
        best_start = 0

        if query_terms and len(words) > snippet_length > 0:
            # Query terms contributed by each word of the text.
            hits = [query_terms.intersection(self.tokenize(word)) for word in words]

            active = Counter()
            for terms in hits[:snippet_length]:
                active.update(terms)
            matched = len(active)
            best_matches = matched

            for start in range(1, len(words) - snippet_length + 1):
                for term in hits[start - 1]:  # word leaving the window
                    active[term] -= 1
                    if active[term] == 0:
                        matched -= 1
                for term in hits[start + snippet_length - 1]:  # word entering
                    if active[term] == 0:
                        matched += 1
                    active[term] += 1
                if matched > best_matches:
                    best_matches = matched
                    best_start = start

        return ' '.join(words[best_start:best_start + snippet_length]) + '...'

    def search(self, query, top_n=5):
        """Return the ``top_n`` best entries as ``url``/``title``/``score``/``snippet`` dicts.

        The score is ``0.4 * BM25 + 0.1 * url_score + 0.3 * PageRank +
        0.2 * top classification score``.
        """
        scored = []
        for item in self.indexed_data:
            bm25_score = self.compute_bm25_score(query, item['text'])
            url_score = item.get('url_score', 0)
            pagerank_score = self.pagerank_scores.get(item['url'], 0)

            if 'classification_results' in item and item['classification_results']:
                classification_score = item['classification_results'][0]['score']
            else:
                classification_score = 0

            total_score = (bm25_score * 0.4 + url_score * 0.1 + pagerank_score * 0.3 + classification_score * 0.2)
            scored.append((total_score, item))

        # Stable sort: entries with equal scores keep their index order.
        scored.sort(key=lambda pair: pair[0], reverse=True)

        # Snippets are costly, so build them only for the entries returned.
        return [
            {
                'url': item['url'],
                'title': item['title'],
                'score': total_score,
                'snippet': self.get_snippet(item['text'], query)
            }
            for total_score, item in scored[:top_n]
        ]

if __name__ == "__main__":
    # Demo: load the default index, run one query and print each hit.
    search_engine = SearchEngine()
    query = ""
    search_results = search_engine.search(query)
    for result in search_results:
        print(
            f"URL: {result['url']}",
            f"Title: {result['title']}",
            f"Score: {result['score']}",
            f"Snippet: {result['snippet']}",
            "---",
            sep="\n",
        )