In [1]:
import json
import os
import sys

import pandas as pd
from rich.console import Console
from rich.text import Text
from rich.theme import Theme
# Rich console with two named styles ("success" and "error") that can be
# passed as `style=` to console.print().
custom_theme = Theme(
    {
        "success": "bold green",
        "error": "bold red",
    }
)
console = Console(theme=custom_theme)

#------------------------LOGGER SETUP!---------------------
from utils.logger_utils import *

# Initialise logging through the project helpers, then request a logger,
# passing this module's name. Both helpers are expected to come from the
# star import of `utils.logger_utils` just above (no other star import has
# run at this point).
setup_logging()
logger = get_logger(__name__)

# Emit a start-of-run message through the logger obtained above.
logger.info("Starting pipeline")
#=============================================================


# Append the parent of the current working directory to sys.path (only if it
# is not already listed) so that the project packages can be imported.
# NOTE(review): `utils.logger_utils` is already imported earlier in this
# file, before this path tweak runs - confirm whether this step is still
# required and whether the parent directory is really the intended root.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

if sys.path.count(project_root) == 0:
    sys.path.append(project_root)


from preprocessing.preprocessing_pipeline import *
from utils.cfd_utils import *
from CFDs.CFDs_pipeline import *
from FAIR_DB.FAIR_DB import *


def get_project_root():
    """Return the absolute directory to use as the project root.

    When ``__file__`` is defined (the code runs from a file), the directory
    containing that file is returned. When ``__file__`` is not defined
    (a ``NameError`` is raised on lookup, e.g. in an interactive session),
    the current working directory is returned instead.
    """
    try:
        script_path = __file__
    except NameError:
        base_dir = os.getcwd()
    else:
        base_dir = os.path.dirname(script_path)
    return os.path.abspath(base_dir)

# ------------------------------- Paths ---------------------------------
# Every location below is anchored at the directory returned by
# get_project_root(), so all pipeline steps address the same files.
project_root = get_project_root()
prep_config_path = os.path.join(project_root, 'configs', 'prep_config.json')
cfd_config_path = os.path.join(project_root, 'configs', 'CFD_config.json')
config_FAIR_DB_path = os.path.join(project_root, 'configs', 'FAIR_DB.json')
input_csv = os.path.join(project_root, 'outputs', 'cleaned_data_for_cfd.csv')
output_txt = os.path.join(project_root, 'outputs', 'raw_cfd.txt')

# Used for saving plots.
output_dir = os.path.join(project_root, "plots")

# ---------------------------- Configuration ----------------------------
# JSON is read explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(prep_config_path, 'r', encoding='utf-8') as f:
    prep_config = json.load(f)

with open(cfd_config_path, 'r', encoding='utf-8') as f:
    config_CFD = json.load(f)

with open(config_FAIR_DB_path, 'r', encoding='utf-8') as f:
    config_FAIR_DB = json.load(f)

# Settings consumed by the CFD filtering and FAIR_DB steps below.
protected_attributes = config_CFD["CFDParsing"]["protected_attributes"]
target_attribute = config_CFD["CFDParsing"]["target_attribute"]
condlhs = config_CFD["CFDParsing"]["conditionslhs"]
condrhs = config_CFD["CFDParsing"]["conditionsrhs"]

# minDiff threshold from the FAIR_DB config; falls back to 0.07 when the
# "FAIR_DB" section or its "minDiff" key is missing.
min_diff = config_FAIR_DB.get("FAIR_DB", {}).get("minDiff", 0.07)

# Show full cell contents and all rows when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)


console.print("1- Process started...", style="success")

# ============================ Pipeline steps ============================
console.print("2- Preprocessing started...", style="success")
df_cleaned = run_preprocessing(prep_config)

# Load the cleaned dataset only after preprocessing has run, and from the
# same path that CFD discovery receives, so FAIR_DB never works on a stale
# copy left over from a previous run or on a different file.
# NOTE(review): this assumes run_preprocessing() is what produces
# `input_csv` - confirm against preprocessing_pipeline.
cleaned_data_for_cfd = pd.read_csv(input_csv)

console.print("3- CFDDiscovery started...", style="success")
raw_cfds = run_full_cfd_discovery(cfd_config_path, input_csv, output_txt)
dict_cfds = filter_cfds(raw_cfds, config_CFD, condlhs, condrhs)

console.print("4- FAIR_DB started...", style="success")
# NOTE(review): the result name matches the FAIR_DB package imported above;
# it is kept unchanged because later code may refer to it.
FAIR_DB = run_FAIR_DB(dict_cfds, cleaned_data_for_cfd, protected_attributes, target_attribute, min_diff)
# The value of this expression is discarded unless it is the last statement
# of a notebook cell; wrap it in print()/display() to see the rows here.
FAIR_DB.head(13)

# ========================================================================

console.print("Pipeline completed successfully! :thumbs_up:", style="success")



Unnamed: 0,workclass,race,sex,native-country,income,age_range,hours-per-week_range,education-degree
0,state-gov,white,male,united-states,<=50k,30-45,21-40,Bach
1,self-emp-not-inc,white,male,united-states,<=50k,45-60,0-20,Bach
2,private,white,male,united-states,<=50k,30-45,21-40,HS-College
3,private,black,male,united-states,<=50k,45-60,21-40,MiddleSchool
4,private,black,female,cuba,<=50k,15-30,21-40,Bach
5,private,white,female,united-states,<=50k,30-45,21-40,Mast
6,private,black,female,jamaica,<=50k,45-60,0-20,MiddleSchool
7,self-emp-not-inc,white,male,united-states,>50k,45-60,41-60,HS-College
8,private,white,female,united-states,>50k,30-45,41-60,Mast
9,private,white,male,united-states,>50k,30-45,21-40,Bach


CFD discovery algorithm completed successfully! Output saved to c:\Users\deart\Desktop\Tesi\My_Work\outputs\raw_cfd.txt
Total number of dependencies in the dictionary: 96
The first 50 dependencies:

1) {'lhs': {'education-degree': 'Elementary'}, 'rhs': {'income': '<=50k'}}
2) {'lhs': {'education-degree': 'MiddleSchool'}, 'rhs': {'income': '<=50k'}}
3) {'lhs': {'age_range': '15-30'}, 'rhs': {'income': '<=50k'}}
4) {'lhs': {'age_range': '15-30', 'education-degree': 'Assoc'}, 'rhs': {'income': '<=50k'}}
5) {'lhs': {'education-degree': 'MiddleSchool', 'age_range': '15-30'}, 'rhs': {'income': '<=50k'}}
6) {'lhs': {'education-degree': 'HS-College', 'age_range': '15-30'}, 'rhs': {'income': '<=50k'}}
7) {'lhs': {'hours-per-week_range': '0-20'}, 'rhs': {'income': '<=50k'}}
8) {'lhs': {'hours-per-week_range': '0-20', 'education-degree': 'Assoc'}, 'rhs': {'income': '<=50k'}}
9) {'lhs': {'hours-per-week_range': '21-40', 'education-degree': 'MiddleSchool'}, 'rhs': {'income': '<=50k'}}
10) {'lhs': {

Unnamed: 0,Rule,Support,Confidence,Diff,sexDiff,raceDiff
0,"{'lhs': {'sex': 'female'}, 'rhs': {'income': '<=50k'}}",0.294526,0.890448,0.131374,0.131374,
1,"{'lhs': {'native-country': 'united-states', 'sex': 'female'}, 'rhs': {'income': '<=50k'}}",0.264437,0.88921,0.13512,0.13512,
6,"{'lhs': {'sex': 'female', 'education-degree': 'Assoc'}, 'rhs': {'income': '<=50k'}}",0.093371,0.920048,0.121687,0.121687,
7,"{'lhs': {'education-degree': 'HS-College', 'sex': 'female'}, 'rhs': {'income': '<=50k'}}",0.101484,0.934881,0.0911,0.0911,
11,"{'lhs': {'hours-per-week_range': '21-40', 'sex': 'female'}, 'rhs': {'income': '<=50k'}}",0.207641,0.903934,0.093034,0.093034,
17,"{'lhs': {'race': 'black'}, 'rhs': {'income': '<=50k'}}",0.084058,0.876041,0.116967,,0.116967
19,"{'lhs': {'native-country': 'united-states', 'race': 'black'}, 'rhs': {'income': '<=50k'}}",0.076221,0.876016,0.121925,,0.121925
24,"{'lhs': {'education-degree': 'HS-College', 'race': 'black'}, 'rhs': {'income': '<=50k'}}",0.035406,0.92679,0.083009,,0.083009
32,"{'lhs': {'hours-per-week_range': '21-40', 'race': 'black'}, 'rhs': {'income': '<=50k'}}",0.066079,0.895833,0.084933,,0.084933
42,"{'lhs': {'race': 'black', 'sex': 'female'}, 'rhs': {'income': '<=50k'}}",0.045026,0.942122,0.183048,0.066081,0.051674
