In [4]:
import pandas as pd
import random
import sys
# Fallback for running the demo without installing the package: extend the
# module search path with the package folders. The first two entries apply when
# launching from the repository root, the last two when launching from Demos.
sys.path.extend((
    './ex_fuzzy/',
    './ex_fuzzy/ex_fuzzy/',
    '../ex_fuzzy/',
    '../ex_fuzzy/ex_fuzzy/',
))

In [5]:
import ex_fuzzy.fuzzy_sets as fs
import ex_fuzzy.evolutionary_fit as GA
import ex_fuzzy.utils as  utils
import ex_fuzzy.eval_tools as eval_tools
import ex_fuzzy.pattern_stability as pattern_stability

from sklearn import datasets
from sklearn.model_selection import train_test_split

In [6]:
# Display the kernel's current working directory. The dataset is read through a
# relative path further down, so this is a quick way to confirm the notebook was
# launched from the folder that path is relative to.
# NOTE(review): the saved output of this cell exposes an absolute local path;
# consider clearing it before sharing the notebook.
import os
os.getcwd()

'c:\\Users\\javi-\\OneDrive\\Documentos\\GitHub\\ex-fuzzy\\Demos'

In [None]:
# Dataset source:
# https://www.kaggle.com/datasets/nilimajauhari/glassdoor-analyze-gender-pay-gap
# The path is relative, so it is resolved against the current working directory.
paygap_csv = './paygap data/pay_gap.csv'
df = pd.read_csv(paygap_csv)
df.head()  # preview the first rows

Unnamed: 0,JobTitle,Gender,Age,PerfEval,Education,Dept,Seniority,BasePay,Bonus
0,Graphic Designer,Female,18,5,College,Operations,2,42363,9938
1,Software Engineer,Male,21,5,College,Management,5,108476,11128
2,Warehouse Associate,Female,19,4,PhD,Administration,5,90208,9268
3,Software Engineer,Male,20,5,Masters,Sales,4,108080,10154
4,Graphic Designer,Male,26,5,Masters,Engineering,5,99464,9319


In [8]:
# 'Gender' is the label to predict; every other column is kept as a feature.
y = df['Gender']
X = df.drop(columns=['Gender'])

In [9]:
# Build the categorical mask: one entry per feature column, left at 0 for
# non-text columns and set to the number of distinct classes for text columns.
# The columns of X themselves are NOT re-encoded here; pd.factorize is only used
# to enumerate the classes of each text column.
import numpy as np

categorical_mask = np.zeros(X.shape[1], dtype=int)
for i, column in enumerate(X.columns):
    feature = X[column]
    # Text columns may be stored either with the legacy 'object' dtype or with
    # pandas' dedicated string dtype (the default for text in pandas 3), so a
    # plain `dtype == 'object'` comparison would silently skip the latter.
    if pd.api.types.is_object_dtype(feature) or pd.api.types.is_string_dtype(feature):
        # factorize returns (codes, uniques); only the uniques are needed.
        # Indexing instead of unpacking into `_` avoids clobbering IPython's
        # last-output variable.
        unique_classes = pd.factorize(feature)[1]
        categorical_mask[i] = len(unique_classes)
        print(f"Column '{column}' unique classes: {unique_classes.tolist()}")

# Seeds Python's built-in `random` module only; the train/test split below is
# made deterministic by its own `random_state` argument.
random.seed(2024)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

Column 'JobTitle' unique classes: ['Graphic Designer', 'Software Engineer', 'Warehouse Associate', 'IT', 'Sales Associate', 'Driver', 'Financial Analyst', 'Marketing Associate', 'Data Scientist', 'Manager']
Column 'Education' unique classes: ['College', 'PhD', 'Masters', 'High School']
Column 'Dept' unique classes: ['Operations', 'Management', 'Administration', 'Sales', 'Engineering']


In [10]:
# Settings for the genetic search. They are consumed by the training cell:
# n_gen and n_pop are passed to fit() (as n_gen and pop_size), n_rules to the
# classifier constructor (as nRules).
n_gen = 5
n_pop = 30
n_rules = 20

# Fuzzy set family and number of linguistic variables requested per feature.
fz_type_studied = fs.FUZZY_SETS.t1  # T1 fuzzy sets
n_linguistic_variables = 3

# Precompute the partitions for every feature. Note that they are built from
# the full feature table X (train and test rows together), using the
# categorical mask to tell text columns apart.
precomputed_partitions = utils.construct_partitions(
    X,
    fz_type_studied,
    n_partitions=n_linguistic_variables,
    categorical_mask=categorical_mask,
)

In [11]:
# Configure the fuzzy rules classifier with the precomputed partitions.
# Alternatives the author left disabled: linguistic_variables=None (instead of
# the precomputed partitions) and allow_unknown=True.
classifier_options = dict(
    nRules=n_rules,
    linguistic_variables=precomputed_partitions,
    nAnts=3,
    n_linguistic_variables=n_linguistic_variables,
    fuzzy_type=fz_type_studied,
    verbose=True,
    tolerance=0.01,
    runner=1,
    ds_mode=1,
    fuzzy_modifiers=False,
)
fl_classifier = GA.BaseFuzzyRulesClassifier(**classifier_options)

# Fit on the training split only.
fl_classifier.fit(X_train, y_train, n_gen=n_gen, pop_size=n_pop)

# Keep a handle on the learned rule base.
rule_base = fl_classifier.get_rulebase()

# Report train/test metrics and print the rules; plots are switched off and the
# rules are not requested as a return value (return_rules=False).
fl_evaluator = eval_tools.FuzzyEvaluator(fl_classifier)
str_rules = fl_evaluator.eval_fuzzy_model(
    X_train, y_train, X_test, y_test,
    plot_rules=False,
    print_rules=True,
    plot_partitions=False,
    return_rules=False,
)

n_gen  |  n_eval  |     f_avg     |     f_min    
     1 |       30 |  0.9972697353 |  0.9015408163
     2 |       60 |  0.9468143667 |  0.8726202543
     3 |       90 |  0.9096668080 |  0.8726202543
     4 |      120 |  0.8874699274 |  0.8726202543
     5 |      150 |  0.8725060685 |  0.8145481170
------------
ACCURACY
Train performance: 0.5074626865671642
Test performance: 0.47575757575757577
------------
MATTHEW CORRCOEF
Train performance: 0.18545188302395502
Test performance: 0.0675264356106885
------------
Rules for consequent: Female
----------------
IF Age IS Medium AND Dept IS Engineering WITH DS 0.02070060147026064, ACC 0.47755834829443444, WGHT 1.0
IF Age IS Medium AND Education IS High School WITH DS 0.030314101136110494, ACC 0.5568181818181818, WGHT 1.0

Rules for consequent: Male
----------------
IF JobTitle IS Software Engineer AND Age IS Medium WITH DS 0.017951659612385828, ACC 1.0, WGHT 1.0


