<a href="https://colab.research.google.com/github/GPapadakis77/ai_tax_opportunity_identifier/blob/main/ai_tax_opportunity_identifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Σύνδεση με Google Drive & Δημιουργία Βασικής Δομής Project

In [1]:
import os
from google.colab import drive

# Mount Google Drive so the project can be created under "My Drive".
print("Connecting to Google Drive...")
drive.mount('/content/drive')
print("Connection to Google Drive completed.")

# Root folder of the project on the mounted drive.
base_project_path = '/content/drive/My Drive/AI_Tax_Opportunity_Identifier'
try:
    os.makedirs(base_project_path, exist_ok=True)
    print(f"Base project folder created (or already exists) at: {base_project_path}")
except Exception as e:
    print(f"Error while creating the project folder: {e}")
    print("Please make sure Google Drive is properly mounted.")

try:
    # Use os.chdir rather than the %cd magic: %cd only prints the error and
    # continues, so the except branch below would never see a failed directory
    # change and the scaffold would be created in the wrong folder.
    os.chdir(base_project_path)
    print(f"Current working directory: {os.getcwd()}")

    # Package folders; each one gets an (empty) __init__.py.
    folders = ['data_ingestion', 'nlp_processing', 'opportunity_identification', 'database', 'frontend', 'utils']
    for folder in folders:
        os.makedirs(folder, exist_ok=True)
        init_file_path = os.path.join(folder, '__init__.py')
        if not os.path.exists(init_file_path):
            # Append mode creates the file without truncating an existing one.
            with open(init_file_path, 'a'):
                pass
            print(f"Created folder: {folder} and __init__.py")
        else:
            print(f"Folder: {folder} and __init__.py already exist.")

    # Empty placeholder files at the project root.
    root_files = ['main.py', 'config.py', 'requirements.txt', 'README.md']
    for f_name in root_files:
        if not os.path.exists(f_name):
            with open(f_name, 'a'):
                pass
            print(f"Created file: {f_name}")
        else:
            print(f"File: {f_name} already exists.")

    print("\nProject base structure successfully created/verified!")
except Exception as e:
    print(f"Failed to change to project folder or create project structure: {e}")
    print("Please double-check your Google Drive connection and path.")


Connecting to Google Drive...
Mounted at /content/drive
Connection to Google Drive completed.
Base project folder created (or already exists) at: /content/drive/My Drive/AI_Tax_Opportunity_Identifier
/content/drive/My Drive/AI_Tax_Opportunity_Identifier
Current working directory: /content/drive/My Drive/AI_Tax_Opportunity_Identifier
Folder: data_ingestion and __init__.py already exist.
Folder: nlp_processing and __init__.py already exist.
Folder: opportunity_identification and __init__.py already exist.
Folder: database and __init__.py already exist.
Folder: frontend and __init__.py already exist.
Folder: utils and __init__.py already exist.
File: main.py already exists.
File: config.py already exists.
File: requirements.txt already exists.
File: README.md already exists.

Project base structure successfully created/verified!


Εγκατάσταση Βασικών Βιβλιοθηκών & Ρύθμιση config.py

In [2]:
import os
import sys  # Added sys for using sys.path.insert

# Make sure you are in the project folder for proper installation and logging
base_project_path = '/content/drive/My Drive/AI_Tax_Opportunity_Identifier'
%cd {base_project_path}

# Installing core libraries
print("Installing core libraries...")
!pip install requests beautifulsoup4 lxml pandas spacy streamlit pyngrok

# Downloading the Greek Spacy model
print("Downloading the Greek Spacy model...")
try:
    !python -m spacy download el_core_news_sm
except Exception as e:
    print(f"Issue while downloading the el_core_news_sm model (it might already exist): {e}")

print("\nCore libraries installed and the model was downloaded successfully (or already existed)!")

# Create content for a minimal requirements.txt
lite_requirements_content = """
requests
beautifulsoup4
pandas
spacy
streamlit
pyngrok
"""
requirements_file_path = os.path.join(base_project_path, 'requirements.txt')
with open(requirements_file_path, 'w') as f:
    f.write(lite_requirements_content.strip())
print(f"The file {requirements_file_path} has been updated with minimal requirements.")

# Create content for config.py (no comments and includes CAPITAL_NEWS_URL)
config_content = """
GOV_GAZETTE_BASE_URL = "https://www.et.gr"
GOV_GAZETTE_SEARCH_URL = "https://search.et.gr/el/"
MINISTRY_FINANCE_NEWS_URL = "https://www.minfin.gr/news"
AADE_NEWS_URL = "https://www.aade.gr/deltia-typoy-anakoinoseis"
NAFTEMPORIKI_TAX_URL = "https://www.naftemporiki.gr/finance/tax/"
KATHIMERINI_ECONOMY_URL = "https://www.kathimerini.gr/economy/"
CAPITAL_NEWS_URL = "https://www.capital.gr/epikairotita"
TAX_KEYWORDS = [
    "φορολογία", "φορολογικές αλλαγές", "φορολογικός νόμος", "φορολογικές διατάξεις",
    "ΦΠΑ", "εισόδημα", "ακίνητα", "κεφάλαιο", "ΑΑΔΕ", "φορολογικός έλεγχος",
    "παράταση", "τροποποίηση", "νέο νομοσχέδιο", "κίνητρα", "επιδοτήσεις",
    "φορολογικές δηλώσεις", "ηλεκτρονικά βιβλία", "mydata",
    "διπλογραφικά", "λογιστικά", "κώδικας φορολογίας"
]
DATABASE_NAME = "tax_opportunities.db"
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
HEADERS = {'User-Agent': USER_AGENT}
"""

# Write the content to config.py
config_file_path = os.path.join(base_project_path, 'config.py')
with open(config_file_path, 'w') as f:
    f.write(config_content.strip())
print(f"The file {config_file_path} has been updated with configuration settings.")


/content/drive/My Drive/AI_Tax_Opportunity_Identifier
Installing core libraries...
Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.12-py3-none-any.whl.metadata (9.4 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.12-py3-none-any.whl (26 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m67.1 MB/s[0m eta

SCRAPER CALL

In [3]:
import os
import sys

# Enable auto-reloading of modules (crucial for changes to .py files)
%load_ext autoreload
%autoreload 2

# Ensure you are in the project's root folder
base_project_path = '/content/drive/My Drive/AI_Tax_Opportunity_Identifier'
%cd {base_project_path}

# Add the project root to the PATH for module discovery
if base_project_path not in sys.path:
    sys.path.insert(0, base_project_path)

# Import and execute the scraper
# Due to %autoreload, the latest changes from legislative_scraper.py and config.py will be loaded
from data_ingestion import legislative_scraper

print("\n--- Running the Legislative News Scraper from MinFin and AADE ---")
latest_legislative_news_df = legislative_scraper.get_latest_legislative_news()

if not latest_legislative_news_df.empty:
    print("\nLegislative News Results:")
    display(latest_legislative_news_df.head(10)) # Display the first 10 results
    print(f"\nTotal {len(latest_legislative_news_df)} entries found.")
else:
    print("\nScraper found no data or failed to parse it.")
    print("Possible causes: Changes in website structure (selectors need updating) or temporary network issues.")
    print("Note: The search.et.gr/el/ website loads content dynamically with JavaScript. This scraper does not fully support it.")

/content/drive/My Drive/AI_Tax_Opportunity_Identifier

--- Running the Legislative News Scraper from MinFin and AADE ---
Προσπαθώ να ανακτήσω περιεχόμενο από: https://www.minfin.gr/news
Επιτυχής ανάκτηση σελίδας https://www.minfin.gr/news
Βρέθηκαν 21 νέα από Υπουργείο Οικονομικών.
Προσπαθώ να ανακτήσω περιεχόμενο από: https://www.capital.gr/epikairotita
Επιτυχής ανάκτηση σελίδας https://www.capital.gr/epikairotita
Βρέθηκαν 17 νέα από Capital.gr.
Συνολικά βρέθηκαν 21 νέα/ανακοινώσεις από τις πηγές (με έγκυρη ημερομηνία).

Legislative News Results:


Unnamed: 0,title,url,date,source,id
0,Νέο Διοικητικό Συμβούλιο στην Ελληνική Αναπτυξ...,https://minfin.gov.gr/neo-dioikitiko-symvoulio...,2025-07-16,Υπουργείο Οικονομικών,https://minfin.gov.gr/neo-dioikitiko-symvoulio...
1,Γιώργος Κώτσηρας: Μειώνουμε τη γραφειοκρατία κ...,https://minfin.gov.gr/giorgos-kotsiras-meionou...,2025-07-15,Υπουργείο Οικονομικών,https://minfin.gov.gr/giorgos-kotsiras-meionou...
2,Διακήρυξη μειοδοτικής δημοπρασίας για τη μίσθω...,https://minfin.gov.gr/diakiryxi-meiodotikis-di...,2025-07-15,Υπουργείο Οικονομικών,https://minfin.gov.gr/diakiryxi-meiodotikis-di...
3,Κυριάκος Πιερρακάκης: «Η δημόσια περιουσία ανή...,https://minfin.gov.gr/kyriakos-pierrakakis-i-d...,2025-07-15,Υπουργείο Οικονομικών,https://minfin.gov.gr/kyriakos-pierrakakis-i-d...
4,Καθορισμός οριογραμμής παραλίας στην Τ.Κ. Σκου...,https://minfin.gov.gr/kathorismos-oriogrammis-...,2025-07-10,Υπουργείο Οικονομικών,https://minfin.gov.gr/kathorismos-oriogrammis-...
5,Περίληψη διακήρυξης Δ’ επαναληπτικής μειοδοτικ...,https://minfin.gov.gr/perilipsi-diakiryxis-d-e...,2025-07-10,Υπουργείο Οικονομικών,https://minfin.gov.gr/perilipsi-diakiryxis-d-e...
6,Διακήρυξη Μειοδοτικής Δημοπρασίας Μίσθωσης Ακι...,https://minfin.gov.gr/diakiryxi-meiodotikis-di...,2025-07-09,Υπουργείο Οικονομικών,https://minfin.gov.gr/diakiryxi-meiodotikis-di...
7,Καθορισμός οριογραμμής παραλίας στη θέση Παγαν...,https://minfin.gov.gr/kathorismos-oriogrammis-...,2025-07-09,Υπουργείο Οικονομικών,https://minfin.gov.gr/kathorismos-oriogrammis-...
8,Ειδοποίηση για δικάσιμο κατά την 10η Οκτωβρίου...,https://minfin.gov.gr/eidopoiisi-gia-dikasimo-...,2025-07-08,Υπουργείο Οικονομικών,https://minfin.gov.gr/eidopoiisi-gia-dikasimo-...
9,Απόφαση καθορισμού οριογραμμών παραλίας και επ...,https://minfin.gov.gr/apofasi-kathorismou-orio...,2025-07-03,Υπουργείο Οικονομικών,https://minfin.gov.gr/apofasi-kathorismou-orio...



Total 21 entries found.


DBManager CALL

In [4]:
import os
import sys

# Enable auto-reloading of modules (crucial for changes in .py files)
%load_ext autoreload
%autoreload 2

# Ensure you are in the project's root folder
base_project_path = '/content/drive/My Drive/AI_Tax_Opportunity_Identifier'
%cd {base_project_path}

# Add the root folder to the PATH so modules can be discovered
if base_project_path not in sys.path:
    sys.path.insert(0, base_project_path)

# Import Scraper and DBManager
from data_ingestion import legislative_scraper
from database.db_manager import DBManager  # <-- New import!

print("\n--- Running Scraper & Storing Data to the Database ---")

# 1. Run the Scraper
latest_legislative_news_df = legislative_scraper.get_latest_legislative_news()

# 2. Store data in the database
if not latest_legislative_news_df.empty:
    db_manager = DBManager()
    db_manager.connect()
    db_manager.create_table()  # Create table if it doesn't exist

    print("\nAttempting to insert new data into the database...")
    db_manager.insert_opportunities(latest_legislative_news_df)

    # 3. Retrieve and display all data from the database
    print("\nAll current entries in the database:")
    all_stored_data_df = db_manager.fetch_all_opportunities()
    if not all_stored_data_df.empty:
        display(all_stored_data_df.head(10))  # Display the first 10 entries
        print(f"\nTotal of {len(all_stored_data_df)} entries found in the database.")
    else:
        print("No entries found in the database.")

    db_manager.close()

else:
    print("\nNo new data found by the scrapers to store.")

print("\nProcess completed.")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/content/drive/My Drive/AI_Tax_Opportunity_Identifier

--- Running Scraper & Storing Data to the Database ---
Προσπαθώ να ανακτήσω περιεχόμενο από: https://www.minfin.gr/news
Επιτυχής ανάκτηση σελίδας https://www.minfin.gr/news
Βρέθηκαν 21 νέα από Υπουργείο Οικονομικών.
Προσπαθώ να ανακτήσω περιεχόμενο από: https://www.capital.gr/epikairotita
Επιτυχής ανάκτηση σελίδας https://www.capital.gr/epikairotita
Βρέθηκαν 17 νέα από Capital.gr.
Συνολικά βρέθηκαν 21 νέα/ανακοινώσεις από τις πηγές (με έγκυρη ημερομηνία).
Successfully connected to the database: /content/drive/My Drive/AI_Tax_Opportunity_Identifier/data/tax_opportunities.db
Existing 'opportunities' table dropped (if it existed).
The 'opportunities' table has been created (or re-created) with the latest schema.

Attempting to insert new data into the database...
Insertion/update of 21 opportunities completed in the database.

All current entries i

Unnamed: 0,id,title,url,date,source,full_text,keywords,entities,main_topic,sentiment,opportunity_score,opportunity_type,added_date
0,https://minfin.gov.gr/neo-dioikitiko-symvoulio...,Νέο Διοικητικό Συμβούλιο στην Ελληνική Αναπτυξ...,https://minfin.gov.gr/neo-dioikitiko-symvoulio...,2025-07-16,Υπουργείο Οικονομικών,,,,,,,,2025-07-16 11:48:55
1,https://minfin.gov.gr/giorgos-kotsiras-meionou...,Γιώργος Κώτσηρας: Μειώνουμε τη γραφειοκρατία κ...,https://minfin.gov.gr/giorgos-kotsiras-meionou...,2025-07-15,Υπουργείο Οικονομικών,,,,,,,,2025-07-16 11:48:55
2,https://minfin.gov.gr/diakiryxi-meiodotikis-di...,Διακήρυξη μειοδοτικής δημοπρασίας για τη μίσθω...,https://minfin.gov.gr/diakiryxi-meiodotikis-di...,2025-07-15,Υπουργείο Οικονομικών,,,,,,,,2025-07-16 11:48:55
3,https://minfin.gov.gr/kyriakos-pierrakakis-i-d...,Κυριάκος Πιερρακάκης: «Η δημόσια περιουσία ανή...,https://minfin.gov.gr/kyriakos-pierrakakis-i-d...,2025-07-15,Υπουργείο Οικονομικών,,,,,,,,2025-07-16 11:48:55
4,https://minfin.gov.gr/kathorismos-oriogrammis-...,Καθορισμός οριογραμμής παραλίας στην Τ.Κ. Σκου...,https://minfin.gov.gr/kathorismos-oriogrammis-...,2025-07-10,Υπουργείο Οικονομικών,,,,,,,,2025-07-16 11:48:55
5,https://minfin.gov.gr/perilipsi-diakiryxis-d-e...,Περίληψη διακήρυξης Δ’ επαναληπτικής μειοδοτικ...,https://minfin.gov.gr/perilipsi-diakiryxis-d-e...,2025-07-10,Υπουργείο Οικονομικών,,,,,,,,2025-07-16 11:48:55
6,https://minfin.gov.gr/diakiryxi-meiodotikis-di...,Διακήρυξη Μειοδοτικής Δημοπρασίας Μίσθωσης Ακι...,https://minfin.gov.gr/diakiryxi-meiodotikis-di...,2025-07-09,Υπουργείο Οικονομικών,,,,,,,,2025-07-16 11:48:55
7,https://minfin.gov.gr/kathorismos-oriogrammis-...,Καθορισμός οριογραμμής παραλίας στη θέση Παγαν...,https://minfin.gov.gr/kathorismos-oriogrammis-...,2025-07-09,Υπουργείο Οικονομικών,,,,,,,,2025-07-16 11:48:55
8,https://minfin.gov.gr/eidopoiisi-gia-dikasimo-...,Ειδοποίηση για δικάσιμο κατά την 10η Οκτωβρίου...,https://minfin.gov.gr/eidopoiisi-gia-dikasimo-...,2025-07-08,Υπουργείο Οικονομικών,,,,,,,,2025-07-16 11:48:55
9,https://minfin.gov.gr/apofasi-kathorismou-orio...,Απόφαση καθορισμού οριογραμμών παραλίας και επ...,https://minfin.gov.gr/apofasi-kathorismou-orio...,2025-07-03,Υπουργείο Οικονομικών,,,,,,,,2025-07-16 11:48:55



Total of 19 entries found in the database.
Database connection closed.

Process completed.


nlp_processor CALL

In [5]:
import os
import sys

# Enable automatic module reloading (critical for changes in .py files)
%load_ext autoreload
%autoreload 2

# Ensure you are in the project's root directory
base_project_path = '/content/drive/My Drive/AI_Tax_Opportunity_Identifier'
%cd {base_project_path}

# Add the project root to the PATH to allow module discovery
if base_project_path not in sys.path:
    sys.path.insert(0, base_project_path)

# Import Scraper, DBManager, and NLPProcessor
from data_ingestion import legislative_scraper
from database.db_manager import DBManager
from nlp_processing.nlp_processor import NLPProcessor  # <-- New import!

print("\n--- Running Scraper, NLP Processor & Storing Data to the Database ---")

# 1. Run the Scraper
latest_legislative_news_df = legislative_scraper.get_latest_legislative_news()

# 2. Process data using NLP
if not latest_legislative_news_df.empty:
    try:
        nlp_processor = NLPProcessor()
        print("\nProcessing collected news with NLP...")
        processed_df = nlp_processor.process_dataframe(latest_legislative_news_df)
        print("NLP processing completed.")

        # 3. Store NLP results in the database
        db_manager = DBManager()
        db_manager.connect()
        db_manager.create_table()  # Create table if it doesn't exist

        # Insert data with new NLP fields.
        # The insert_opportunities method currently accepts only initial fields.
        # For now, insert only the columns that exist and are supported.

        # Note: insert_opportunities has been modified to insert only the initial fields.
        # It will need to be upgraded later to handle NLP fields.
        # For now, we will display the NLP results directly.

        db_manager.insert_opportunities(processed_df)  # Will insert only recognized columns

        # Note: We should create a new method to update the NLP fields
        # or insert everything from the start if the table supports nulls.
        # For simplicity, we’re just inserting data as-is.

        # Retrieve and display all data from the database, including NLP fields
        print("\nAll current entries in the database (including NLP fields):")
        all_stored_data_df = db_manager.fetch_all_opportunities()
        if not all_stored_data_df.empty:
            # Display NLP-related fields if they exist
            display_columns = ['title', 'date', 'source', 'keywords', 'entities', 'main_topic', 'url']
            display(all_stored_data_df[[col for col in display_columns if col in all_stored_data_df.columns]].head(10))
            print(f"\nTotal of {len(all_stored_data_df)} entries found in the database.")
        else:
            print("No entries found in the database.")

        db_manager.close()

    except RuntimeError as e:
        print(f"Could not execute NLPProcessor: {e}")
        print("Make sure the Greek spaCy model is properly downloaded and loaded.")
else:
    print("\nNo new data found by the scrapers to store.")

print("\nProcess completed.")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/content/drive/My Drive/AI_Tax_Opportunity_Identifier
Το ελληνικό μοντέλο spaCy φορτώθηκε επιτυχώς.

--- Running Scraper, NLP Processor & Storing Data to the Database ---
Προσπαθώ να ανακτήσω περιεχόμενο από: https://www.minfin.gr/news
Επιτυχής ανάκτηση σελίδας https://www.minfin.gr/news
Βρέθηκαν 21 νέα από Υπουργείο Οικονομικών.
Προσπαθώ να ανακτήσω περιεχόμενο από: https://www.capital.gr/epikairotita
Επιτυχής ανάκτηση σελίδας https://www.capital.gr/epikairotita
Βρέθηκαν 17 νέα από Capital.gr.
Συνολικά βρέθηκαν 21 νέα/ανακοινώσεις από τις πηγές (με έγκυρη ημερομηνία).

Processing collected news with NLP...
NLP processing completed.
Successfully connected to the database: /content/drive/My Drive/AI_Tax_Opportunity_Identifier/data/tax_opportunities.db
Existing 'opportunities' table dropped (if it existed).
The 'opportunities' table has been created (or re-created) with the latest schema.
Insertion/up

Unnamed: 0,title,date,source,keywords,entities,main_topic,url
0,Νέο Διοικητικό Συμβούλιο στην Ελληνική Αναπτυξ...,2025-07-16,Υπουργείο Οικονομικών,"αναπτυξιακή, διοικητικός, ελληνικός, επενδύσεω...","[('Ελληνική Αναπτυξιακή Τράπεζα Επενδύσεων', '...",,https://minfin.gov.gr/neo-dioikitiko-symvoulio...
1,Γιώργος Κώτσηρας: Μειώνουμε τη γραφειοκρατία κ...,2025-07-15,Υπουργείο Οικονομικών,"βελτιώνουμε, γραφειοκρατία, επιχείρηση, καθημε...","[('Γιώργος Κώτσηρας', 'PERSON')]",,https://minfin.gov.gr/giorgos-kotsiras-meionou...
2,Διακήρυξη μειοδοτικής δημοπρασίας για τη μίσθω...,2025-07-15,Υπουργείο Οικονομικών,"ακινήτου, αποκεντρωμένης, δημοπρασία, διακήρυξ...",[('Αποκεντρωμένης Διοίκησης Μακεδονίας – Θράκη...,,https://minfin.gov.gr/diakiryxi-meiodotikis-di...
3,Κυριάκος Πιερρακάκης: «Η δημόσια περιουσία ανή...,2025-07-15,Υπουργείο Οικονομικών,"ανήκω, αξία, δημόσιος, κυριάκος, παράγω, περιο...","[('Κυριάκος Πιερρακάκης', 'PERSON')]",,https://minfin.gov.gr/kyriakos-pierrakakis-i-d...
4,Καθορισμός οριογραμμής παραλίας στην Τ.Κ. Σκου...,2025-07-10,Υπουργείο Οικονομικών,"ανατολικής, γυθείου, δ.ε, δήμου, καθορισμός, λ...","[('Τ.Κ', 'GPE'), ('Λακωνίας', 'GPE')]",,https://minfin.gov.gr/kathorismos-oriogrammis-...
5,Περίληψη διακήρυξης Δ’ επαναληπτικής μειοδοτικ...,2025-07-10,Υπουργείο Οικονομικών,"ακινήτου, αρκαδία, δ, δασών, δημοπρασία, διακή...",[],,https://minfin.gov.gr/perilipsi-diakiryxis-d-e...
6,Διακήρυξη Μειοδοτικής Δημοπρασίας Μίσθωσης Ακι...,2025-07-09,Υπουργείο Οικονομικών,"ακινήτου, δημοπρασία, διακήρυξη, διοίκηση, κατ...",[],,https://minfin.gov.gr/diakiryxi-meiodotikis-di...
7,Καθορισμός οριογραμμής παραλίας στη θέση Παγαν...,2025-07-09,Υπουργείο Οικονομικών,"ανατολικής, γυθείου, δ.ε, δήμου, θέση, καθορισ...","[('Παγανέα Τ.Κ', 'EVENT'), ('Λακωνίας', 'GPE')]",,https://minfin.gov.gr/kathorismos-oriogrammis-...
8,Ειδοποίηση για δικάσιμο κατά την 10η Οκτωβρίου...,2025-07-08,Υπουργείο Οικονομικών,"10/04, 15/00, 2025, έκταση, έξοδος, αίτησή, αί...","[('Ελληνικού Δημοσίου', 'ORG'), ('ΚΥΑ', 'ORG')...",,https://minfin.gov.gr/eidopoiisi-gia-dikasimo-...
9,Απόφαση καθορισμού οριογραμμών παραλίας και επ...,2025-07-03,Υπουργείο Οικονομικών,"αιγιαλού, ακτογραμμή, ανατολικής, απόφαση, δ.ε...","[('Κότρωνα', 'PERSON'), ('Χαλικιά Βάττα', 'PER...",,https://minfin.gov.gr/apofasi-kathorismou-orio...



Total of 19 entries found in the database.
Database connection closed.

Process completed.


opportunity_identifier.py CALL

In [6]:
import os
import sys
import pandas as pd  # <-- Add this line!

# Enable automatic module reloading (critical for changes in .py files)
%load_ext autoreload
%autoreload 2

# Ensure you are in the project's root directory
base_project_path = '/content/drive/My Drive/AI_Tax_Opportunity_Identifier'
%cd {base_project_path}

# Add the root folder to PATH so modules can be found
if base_project_path not in sys.path:
    sys.path.insert(0, base_project_path)

# Import Scraper, DBManager, NLPProcessor, and OpportunityIdentifier
from data_ingestion import legislative_scraper
from database.db_manager import DBManager
from nlp_processing.nlp_processor import NLPProcessor
from opportunity_identification.opportunity_identifier import OpportunityIdentifier

print("\n--- Running Scraper, NLP Processor, Opportunity Identifier & Storing Data to Database ---")

# 1. Run the Scraper
latest_legislative_news_df = legislative_scraper.get_latest_legislative_news()

# 2. Process data with NLP
processed_df = pd.DataFrame()  # Initialize in case no news is found
if not latest_legislative_news_df.empty:
    try:
        nlp_processor = NLPProcessor()
        print("\nProcessing collected news with NLP...")
        processed_df = nlp_processor.process_dataframe(latest_legislative_news_df)
        print("NLP processing completed.")
    except RuntimeError as e:
        print(f"Could not run NLPProcessor: {e}")
        print("Make sure the Greek spaCy model is properly downloaded and loaded.")
else:
    print("\nNo news found by the scrapers for processing.")

# 3. Identify and Score Opportunities
identified_opportunities_df = pd.DataFrame()  # Initialize
if not processed_df.empty:
    try:
        opportunity_identifier = OpportunityIdentifier()
        print("\nIdentifying and scoring opportunities...")
        identified_opportunities_df = opportunity_identifier.identify_and_score_opportunities(processed_df)
        print(f"Opportunity identification completed. Found {len(identified_opportunities_df)} opportunities.")
    except Exception as e:
        print(f"Error while running OpportunityIdentifier: {e}")
else:
    print("\nNo processed data available for opportunity identification.")

# 4. Store NLP and Opportunity data to the database
db_manager = DBManager()
db_manager.connect()
db_manager.create_table()  # Create table if it doesn't exist

if not processed_df.empty:
    # Insert/update the data (with NLP and Opportunity fields)
    for col in ['opportunity_score', 'opportunity_type']:
        if col not in processed_df.columns:
            processed_df[col] = None  # Or some default value

    db_manager.insert_opportunities(processed_df)
else:
    print("\nNo data available for insertion/update in the database.")

# 5. Retrieve and display all data from the database, including NLP and Opportunity fields
print("\nAll current data in the database (including NLP & Opportunity fields):")
all_stored_data_df = db_manager.fetch_all_opportunities()
if not all_stored_data_df.empty:
    # Show the most relevant columns and top 10 opportunities
    display_columns = ['title', 'date', 'source', 'opportunity_score', 'opportunity_type',
                       'keywords', 'entities', 'main_topic', 'url']

    # Filter to show only real opportunities (score > 0)
    # and sort by score
    display_opportunities = all_stored_data_df[all_stored_data_df['opportunity_score'] > 0].copy()
    display_opportunities = display_opportunities.sort_values(by='opportunity_score', ascending=False)

    print(f"\nTop {min(10, len(display_opportunities))} identified opportunities:")
    display(display_opportunities[[col for col in display_columns if col in display_opportunities.columns]].head(10))

    print(f"\nA total of {len(all_stored_data_df)} entries are stored in the database.")
else:
    print("No entries found in the database.")

db_manager.close()

print("\nProcess completed.")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/content/drive/My Drive/AI_Tax_Opportunity_Identifier

--- Running Scraper, NLP Processor, Opportunity Identifier & Storing Data to Database ---
Προσπαθώ να ανακτήσω περιεχόμενο από: https://www.minfin.gr/news
Επιτυχής ανάκτηση σελίδας https://www.minfin.gr/news
Βρέθηκαν 21 νέα από Υπουργείο Οικονομικών.
Προσπαθώ να ανακτήσω περιεχόμενο από: https://www.capital.gr/epikairotita
Επιτυχής ανάκτηση σελίδας https://www.capital.gr/epikairotita
Βρέθηκαν 17 νέα από Capital.gr.
Συνολικά βρέθηκαν 21 νέα/ανακοινώσεις από τις πηγές (με έγκυρη ημερομηνία).

Processing collected news with NLP...
NLP processing completed.

Identifying and scoring opportunities...
Opportunity identification completed. Found 1 opportunities.
Successfully connected to the database: /content/drive/My Drive/AI_Tax_Opportunity_Identifier/data/tax_opportunities.db
Existing 'opportunities' table dropped (if it existed).
The 'opportunities

Unnamed: 0,title,date,source,opportunity_score,opportunity_type,keywords,entities,main_topic,url
10,Ειδοποίηση για δικάσιμο κατά την 23η Σεπτεμβρί...,2025-06-30,Υπουργείο Οικονομικών,2.0,Γενική Φορολογική Είδηση,"2882/2001, άμεσος, άρθρο, έλκω, έργα, έργο, αί...","[('Ελληνικού Δημοσίου', 'ORG'), ('άρθρου 7Α', ...",,https://minfin.gov.gr/eidopoiisi-gia-dikasimo-...



A total of 19 entries are stored in the database.
Database connection closed.

Process completed.


GUI INTERFACE

In [9]:
import os
import sys
import pandas as pd
from pyngrok import ngrok
import subprocess
import time
import importlib
from datetime import datetime, date # Needed for scraper functions
import requests # Needed for scraper functions
from bs4 import BeautifulSoup # Needed for scraper functions
import socket # Needed for port checking
import re # Added: Import the 're' module for regular expressions
from google.colab import data_table # Added: Import data_table

# Kill any existing ngrok tunnels to ensure a clean start.
# NOTE: the message below is printed unconditionally, whether or not a tunnel was running.
ngrok.kill()
print("Existing ngrok tunnels killed.")

# Enable auto-reloading of modules (crucial for changes to .py files)
%load_ext autoreload
%autoreload 2

# Ensure you are in the project's root folder
base_project_path = '/content/drive/My Drive/AI_Tax_Opportunity_Identifier'
%cd {base_project_path}
print(f"Current working directory: {os.getcwd()}")

# Add the project root to the front of sys.path (only once) for module discovery
if base_project_path not in sys.path:
    sys.path.insert(0, base_project_path)
    print("Project root added to sys.path.")

# --- Reload config and import other modules ---
# Each project module is imported and then explicitly reloaded, so this cell works
# with the current on-disk version of the file. The module objects are bound under
# the names config, nlp_processor, db_manager and opportunity_identifier.
# NOTE(review): presumably later code in this cell (e.g. the Streamlit pipeline)
# accesses the modules through these names — confirm before renaming any of them.
import config
importlib.reload(config)
print("Config module reloaded.")

from nlp_processing import nlp_processor
importlib.reload(nlp_processor)
print("NLP Processor module reloaded.")

from database import db_manager
importlib.reload(db_manager)
print("DB Manager module reloaded.")

from opportunity_identification import opportunity_identifier
importlib.reload(opportunity_identifier)
print("Opportunity Identifier module reloaded.")

# --- SCRAPER FUNCTIONS (MOVED HERE TO GLOBAL SCOPE) ---
# These functions are defined here so they are directly available to the Streamlit app's run_pipeline function
# without complex import issues.
def fetch_page_content(url, headers=None):
    """
    Download a web page and return its HTML as text.

    Args:
        url: Address of the page to fetch.
        headers: Optional HTTP headers; when None, config.HEADERS is used.

    Returns:
        The response body as a string, or None when the request fails
        (any requests.exceptions.RequestException, including HTTP error
        statuses raised by raise_for_status()).
    """
    request_headers = config.HEADERS if headers is None else headers
    print(f"Attempting to retrieve content from: {url}")
    try:
        # A 15-second timeout keeps a stalled server from blocking the pipeline.
        page = requests.get(url, headers=request_headers, timeout=15)
        page.raise_for_status()
        print(f"Successfully retrieved page {url}")
        return page.text
    except requests.exceptions.RequestException as request_error:
        print(f"Error retrieving page {url}: {request_error}")
        return None

def parse_minfin_news(html_content):
    """
    Extract news entries from the Ministry of Finance news page.

    Args:
        html_content: HTML of the news listing page (may be empty or None).

    Returns:
        A list of dicts with the keys 'title', 'url', 'date' and 'source'.
        The list is empty when there is no HTML or no matching article.
    """
    if not html_content:
        return []

    def _is_grid_post(css_classes):
        # Match elements that carry both Elementor classes.
        return css_classes and 'elementor-post' in css_classes.split() and 'elementor-grid-item' in css_classes.split()

    soup = BeautifulSoup(html_content, 'lxml')
    articles = soup.find_all('article', class_=_is_grid_post)
    if not articles:
        print("Note: No 'article.elementor-post.elementor-grid-item' found on Ministry of Finance page.")
        return []

    news_entries = []
    for article in articles:
        title_tag = article.find('h3', class_='elementor-post__title')
        # The link lives inside the title heading, so it can only exist with a title.
        link_tag = title_tag.find('a', href=True) if title_tag else None
        date_tag = article.find('span', class_='elementor-post-date')
        if not (link_tag and title_tag and date_tag):
            # Skip incomplete articles.
            continue
        news_entries.append({
            'title': title_tag.get_text(strip=True),
            'url': link_tag['href'],
            'date': date_tag.get_text(strip=True),
            'source': 'Ministry of Finance',
        })
    return news_entries

def parse_aade_news(html_content):
    """
    Parses the AADE press releases page and extracts information.

    Args:
        html_content: HTML of the press-release listing page (may be empty or None).

    Returns:
        A list of dicts with the keys 'title', 'url', 'date' and 'source'.
        The list is empty when there is no HTML or no 'div.views-row' item.
        Items without a date span or a title link are skipped.
    """
    if not html_content:
        return []
    soup = BeautifulSoup(html_content, 'lxml')
    news_entries = []
    items = soup.find_all('div', class_='views-row')
    if not items:
        print("Note: No 'div.views-row' found on AADE page.")
        return []
    for item in items:
        date_span = item.find('span', class_='field-content')
        link_tag = item.find('a', class_='category-item-title', href=True)
        title_text_element = item.find('p')
        if date_span and link_tag:
            date_str = date_span.get_text(strip=True)
            # link_tag is guaranteed here, so the fallback must depend on the link
            # *text* being empty. The previous `if link_tag else ...` form made the
            # <p> text and "N/A Title" fallbacks unreachable.
            title = link_tag.get_text(strip=True)
            if not title and title_text_element:
                title = title_text_element.get_text(strip=True)
            if not title:
                title = "N/A Title"
            # Resolve relative hrefs against the AADE listing URL.
            link = requests.compat.urljoin(config.AADE_NEWS_URL, link_tag['href'])
            news_entries.append({
                'title': title, 'url': link, 'date': date_str, 'source': 'AADE'
            })
    return news_entries

def parse_capital_news(html_content):
    """
    Extracts headline entries from the HTML of the Capital.gr news listing.

    Every 'div' carrying both the 'article' and 'snip' classes is inspected.
    A snippet contributes an entry only when it has a linked 'h2.bold'
    headline and a 'span.date'; a missing 'span.time' defaults to "00:00".

    Returns a list of dicts with the keys 'title', 'url' (resolved against
    config.CAPITAL_NEWS_URL), 'date' ("<date text> <time text>") and 'source'
    ('Capital.gr'). Falsy input, or a page without matching snippets, gives [].
    """
    if not html_content:
        return []

    def _is_article_snip(css_class):
        # Same predicate the listing has always been filtered with: the class
        # string must contain both tokens.
        if not css_class:
            return False
        tokens = css_class.split()
        return 'article' in tokens and 'snip' in tokens

    document = BeautifulSoup(html_content, 'lxml')
    snippets = document.find_all('div', class_=_is_article_snip)
    if not snippets:
        print("Note: No 'div.article.snip' found on Capital.gr page.")
        return []

    collected = []
    for snippet in snippets:
        heading = snippet.find('h2', class_='bold')
        anchor = heading.find('a', href=True) if heading else None
        date_node = snippet.find('span', class_='date')
        time_node = snippet.find('span', class_='time')

        # Headline link and date are required; the time of day is optional.
        if not (anchor and date_node):
            continue

        headline = anchor.get_text(strip=True)
        absolute_url = requests.compat.urljoin(config.CAPITAL_NEWS_URL, anchor['href'])
        day_text = date_node.get_text(strip=True)
        if time_node:
            clock_text = time_node.get_text(strip=True)
        else:
            clock_text = "00:00"

        collected.append({
            'title': headline,
            'url': absolute_url,
            'date': f"{day_text} {clock_text}",
            'source': 'Capital.gr',
        })
    return collected

# Greek month names (genitive forms first, then abbreviations) mapped to the
# English names understood by the %B / %b strptime directives. Full names come
# first so an abbreviation never rewrites part of a full name.
_GREEK_MONTH_MAP = {
    'Ιανουαρίου': 'January', 'Φεβρουαρίου': 'February', 'Μαρτίου': 'March',
    'Απριλίου': 'April', 'Μαΐου': 'May', 'Ιουνίου': 'June',
    'Ιουλίου': 'July', 'Αυγούστου': 'August', 'Σεπτεμβρίου': 'September',
    'Οκτωβρίου': 'October', 'Νοεμβρίου': 'November', 'Δεκεμβρίου': 'December',
    'Ιαν': 'Jan', 'Φεβ': 'Feb', 'Μαρ': 'Mar', 'Απρ': 'Apr', 'Μαϊ': 'May',
    'Ιουν': 'Jun', 'Ιουλ': 'Jul', 'Αυγ': 'Aug', 'Σεπ': 'Sep', 'Οκτ': 'Oct',
    'Νοε': 'Nov', 'Δεκ': 'Dec'
}

# Formats tried in order. '%d %b %Y' is required for the abbreviated month
# names produced by the map above. A bare '%d/%m' format is intentionally
# absent: year-less dates get the current year added before parsing, and
# '%d/%m' on its own would silently produce dates in the year 1900.
_NEWS_DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%d/%m/%Y %H:%M',
    '%d/%m/%Y',
    '%d.%m.%Y',
    '%Y-%m-%d',
    '%Y/%m/%d',
)

# "dd/mm" optionally followed by "HH:MM" and carrying no year.
_YEARLESS_DATE_RE = re.compile(r'(\d{1,2}/\d{1,2})(?:\s+(\d{1,2}:\d{2}))?')


def _parse_news_date(raw_date, current_year):
    """Converts one scraped date string to a pandas Timestamp, or pd.NaT if unparseable."""
    if pd.isna(raw_date) or not raw_date:
        return pd.NaT
    text = str(raw_date).strip()
    for greek, english in _GREEK_MONTH_MAP.items():
        text = text.replace(greek, english)

    yearless = _YEARLESS_DATE_RE.fullmatch(text)
    if yearless:
        day_month, clock = yearless.groups()
        # The year belongs right after the month ("dd/mm/YYYY HH:MM"). Appending
        # it after the time yields a string that matches no known format.
        # NOTE: assumes the item was published in the current year, so dates
        # scraped right after New Year may be assigned to the wrong year.
        text = f"{day_month}/{current_year}"
        if clock:
            text = f"{text} {clock}"

    for fmt in _NEWS_DATE_FORMATS:
        try:
            return pd.to_datetime(text, format=fmt, errors='raise')
        except (ValueError, TypeError):
            continue
    return pd.NaT


def _collect_source_news(source_name, url, parser):
    """Fetches one source page, parses it with `parser`, reports the outcome and returns the entries."""
    html = fetch_page_content(url, headers=config.HEADERS)
    if not html:
        return []
    entries = parser(html)
    if entries:
        print(f"Found {len(entries)} news items from {source_name}.")
        return list(entries)
    parser_name = getattr(parser, '__name__', 'parser')
    print(f"No news found from {source_name} with current analysis. Check selectors in {parser_name}.")
    return []


def get_latest_legislative_news(filter_by_current_date=False):
    """
    Collects the latest legislative news and announcements from all sources.

    Args:
        filter_by_current_date: If True, only items dated today (local date)
            are returned.

    Returns:
        A pandas DataFrame with the columns 'title', 'url', 'date'
        (datetime.date objects), 'source' and 'id' (a copy of 'url'), sorted by
        date in descending order. Rows whose date cannot be parsed are dropped.
        An empty DataFrame is returned when no source yields items or no date
        can be parsed.
    """
    today = date.today()

    sources = [
        ('Ministry of Finance', config.MINISTRY_FINANCE_NEWS_URL, parse_minfin_news),
        # AADE stays disabled because its page answers with 403 Forbidden.
        # Re-enable by uncommenting the next line once that is resolved.
        # ('AADE', config.AADE_NEWS_URL, parse_aade_news),
        ('Capital.gr', config.CAPITAL_NEWS_URL, parse_capital_news),
    ]

    all_news_data = []
    for source_name, url, parser in sources:
        all_news_data.extend(_collect_source_news(source_name, url, parser))

    if not all_news_data:
        print("No news/announcements found from any source.")
        return pd.DataFrame()

    df = pd.DataFrame(all_news_data)
    df['id'] = df['url']

    current_year = today.year
    df['date'] = df['date'].astype(str).apply(lambda raw: _parse_news_date(raw, current_year))
    # .copy() so the column assignment below never targets a view of the original frame.
    df = df.dropna(subset=['date']).copy()

    if df.empty:
        print("No valid date data found after parsing.")
        return pd.DataFrame()

    df['date'] = df['date'].dt.date
    if filter_by_current_date:
        df = df[df['date'] == today].copy()
        if df.empty:
            print(f"No news found for the current date ({today}).")
        else:
            print(f"Found {len(df)} news items for the current date ({today}).")

    df = df.sort_values(by='date', ascending=False).reset_index(drop=True)
    print(f"Total {len(df)} news/announcements found from sources (with valid date).")
    return df

# --- END SCRAPER FUNCTIONS ---

# Initialize the pipeline components once. They are bound to *_instance names so
# the imported modules (db_manager, nlp_processor, opportunity_identifier) are
# never shadowed; rebinding a module name to an instance makes the next run of
# this cell fail on `<instance>.NLPProcessor` / `<instance>.OpportunityIdentifier`.
db_manager_instance = db_manager.DBManager()
nlp_processor_instance = nlp_processor.NLPProcessor()
opportunity_identifier_instance = opportunity_identifier.OpportunityIdentifier()

print("\n--- Executing Scraper, NLP Processor, Opportunity Identifier & Saving Data to Database ---")

# 1. Run the scraper, keeping only the news dated today.
latest_legislative_news_df = get_latest_legislative_news(filter_by_current_date=True)

# 2. Process data with NLP
processed_df = pd.DataFrame()
if not latest_legislative_news_df.empty:
    try:
        print("\nProcessing collected news with NLP...")
        processed_df = nlp_processor_instance.process_dataframe(latest_legislative_news_df)
        print("NLP processing completed.")
    except RuntimeError as e:
        print(f"Could not execute NLPProcessor: {e}")
        print("Ensure the Greek spaCy model is downloaded and loaded correctly.")
else:
    print("\nNo news found from scrapers for processing.")

# 3. Identify and Score Opportunities
identified_opportunities_df = pd.DataFrame()
if not processed_df.empty:
    try:
        print("\nIdentifying and scoring opportunities...")
        identified_opportunities_df = opportunity_identifier_instance.identify_and_score_opportunities(processed_df)
        print(f"Opportunity identification completed. Found {len(identified_opportunities_df)} opportunities.")
    except Exception as e:
        print(f"Error executing OpportunityIdentifier: {e}")
else:
    print("\nNo processed data for opportunity identification.")


# 4. Save NLP and Opportunities to the database
db_manager_instance.connect()
try:
    # Per the original note, create_table() drops and recreates the table —
    # TODO confirm against DBManager.
    db_manager_instance.create_table()

    if not processed_df.empty:
        # NOTE(review): processed_df (not identified_opportunities_df) is what gets
        # stored. That is only correct if identify_and_score_opportunities() adds
        # its score/type columns to processed_df in place — confirm. Otherwise
        # the scores are lost and the columns below are filled with None.
        for col in ['opportunity_score', 'opportunity_type']:
            if col not in processed_df.columns:
                processed_df[col] = None

        db_manager_instance.insert_opportunities(processed_df)
    else:
        print("\nNo data to insert/update in the database.")


    # 5. Retrieve and display all data from the database, including NLP and Opportunity fields
    print("\nAll data currently in the database (with NLP & Opportunity fields):")
    all_stored_data_df = db_manager_instance.fetch_all_opportunities()
    if not all_stored_data_df.empty:
        display_columns = ['title', 'date', 'source', 'opportunity_score', 'opportunity_type', 'keywords', 'entities', 'main_topic', 'url']

        # Scores may come back as None/NULL (see step 4); coerce them so the
        # comparison cannot raise on non-numeric values.
        numeric_scores = pd.to_numeric(all_stored_data_df['opportunity_score'], errors='coerce')
        display_opportunities = all_stored_data_df[numeric_scores > 0].copy()
        display_opportunities = display_opportunities.sort_values(by='opportunity_score', ascending=False)

        print(f"\nTop {min(10, len(display_opportunities))} identified opportunities:")
        # A bare expression inside an `if` block is never rendered by the notebook,
        # so the interactive table has to be passed to display() explicitly
        # (display is provided as a builtin by the IPython kernel).
        visible_columns = [col for col in display_columns if col in display_opportunities.columns]
        display(data_table.DataTable(display_opportunities[visible_columns].head(10), include_index=False))

        print(f"\nTotal {len(all_stored_data_df)} entries in the database.")
    else:
        print("No entries found in the database.")
finally:
    # Always release the connection, even if a step above raised.
    db_manager_instance.close()

print("\nProcess completed.")

Existing ngrok tunnels killed.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/content/drive/My Drive/AI_Tax_Opportunity_Identifier
Current working directory: /content/drive/My Drive/AI_Tax_Opportunity_Identifier
Config module reloaded.
Το ελληνικό μοντέλο spaCy φορτώθηκε επιτυχώς.
NLP Processor module reloaded.
DB Manager module reloaded.
Opportunity Identifier module reloaded.

--- Executing Scraper, NLP Processor, Opportunity Identifier & Saving Data to Database ---
Attempting to retrieve content from: https://www.minfin.gr/news
Successfully retrieved page https://www.minfin.gr/news
Found 21 news items from Ministry of Finance.
Attempting to retrieve content from: https://www.capital.gr/epikairotita
Successfully retrieved page https://www.capital.gr/epikairotita
Found 17 news items from Capital.gr.
Found 1 news items for the current date (2025-07-16).
Total 1 news/announcements found from sources (with valid date).

Processing collected news 