## Importing Data and Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyfume.Clustering import Clusterer
from pyfume.EstimateAntecendentSet import AntecedentEstimator
from pyfume.EstimateConsequentParameters import ConsequentEstimator
from pyfume.SimpfulModelBuilder import SugenoFISBuilder
from pyfume.Tester import SugenoFISTester
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score
from numpy import clip, column_stack, argmax
from scipy.spatial import distance
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the train/test splits; the 'output' column is the target.
Train = pd.read_csv('dataTrain.csv')
Test = pd.read_csv('dataTest.csv')

# Per-feature statistics table; its first CSV column is discarded.
stats = pd.read_csv('stats.csv').iloc[:, 1:]

# Separate inputs from the target for both splits.
X_train, y_train = Train.drop(columns='output'), Train['output']
X_test, y_test = Test.drop(columns='output'), Test['output']

# Feature names, in column order.
var_names = X_train.columns.to_list()

# Per-feature extremes of the training inputs (used further down as the
# universe of discourse of each linguistic variable).
mins = X_train.min().tolist()
maxs = X_train.max().tolist()

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Inspect the per-feature statistics table loaded from stats.csv.
stats

In [None]:
# Keep the feature column labels before X_train is converted to a NumPy array.
savecols = X_train.columns

In [None]:
# Convert the feature frames to plain NumPy arrays for clustering and pyFUME.
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
# NOTE(review): `.array` yields a pandas ExtensionArray rather than an ndarray;
# `.to_numpy()` may be the intended call here — confirm.
y_train = y_train.array
y_test = y_test.array

## Building the model

### Clustering

In this section, we try different numbers of clusters and keep the one that minimizes the separation metric, i.e. the membership-weighted sum of the distances between every point and every cluster center.

#### Functions

In [None]:
def SeparationMetric(X_train, clust_centers, part_matrix):
    """Return the membership-weighted sum of point-to-center distances.

    Computes ``sum_i sum_c part_matrix[i, c] * ||X_train[i] - clust_centers[c]||``
    using the Euclidean norm. Lower values mean that points sit closer to the
    centers they belong to.

    Parameters
    ----------
    X_train : 2-D array, shape (n_points, n_features)
        Data points.
    clust_centers : 2-D array, shape (n_clusters, n_features)
        Cluster centers.
    part_matrix : 2-D array, shape (n_points, n_clusters)
        Membership of each point in each cluster.

    Returns
    -------
    float
        The weighted distance sum.
    """
    # A single (n_points, n_clusters) distance matrix replaces the original
    # Python-level loop over every point/cluster pair.
    dists = cdist(X_train, clust_centers, metric='euclidean')
    n_points, n_clusters = dists.shape
    # Only the entries that pair up with a point and a center are used.
    weights = np.asarray(part_matrix)[:n_points, :n_clusters]
    return float(np.sum(weights * dists))

#def SeparationMetric(X_train, clust_centers, part_matrix):
#    silhouette_score(X_train, )
#    return metric

In [None]:
# Since pyfume's clustering algorithm was using the target, I made some modifications to their fcm
# code, as seen below

def fcm(data, n_clusters, m=2, max_iter=1000, error=0.005, seed=1231241421):
    """Fuzzy c-means clustering of ``data`` (input features only).

    Parameters
    ----------
    data : 2-D array, shape (N, S)
        N instances described by S variables.
    n_clusters : int
        Number of clusters (at least 1).
    m : float, default 2
        Fuzzy clustering coefficient; must be greater than 1.
    max_iter : int, default 1000
        Maximum number of iterations (at least 1).
    error : float, default 0.005
        Stopping criterion: iteration ends once the norm of the change in
        the membership matrix falls below this value.
    seed : int or None, default 1231241421
        Seed for the random initialization of the membership matrix. The
        default reproduces the value that used to be hard-coded. ``None``
        gives a non-reproducible initialization.

    Returns
    -------
    centers : ndarray, shape (n_clusters, S)
        Cluster centers.
    partition_matrix : ndarray, shape (N, n_clusters)
        Membership of every instance in every cluster; each row sums to 1.

    Raises
    ------
    ValueError
        If ``m <= 1``, ``n_clusters < 1`` or ``max_iter < 1``.
    """
    if m <= 1:
        # The membership update uses the exponent -2 / (m - 1).
        raise ValueError("m must be greater than 1")
    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1")
    if max_iter < 1:
        # With no iteration the cluster centers would never be computed.
        raise ValueError("max_iter must be at least 1")

    eps = np.finfo(np.float64).eps
    n_instances = data.shape[0]

    # Randomly initialize u with a private generator: it draws the same
    # values as seeding the global NumPy RNG would, without resetting the
    # global random state for the rest of the program.
    rng = np.random.RandomState(seed)
    u = np.fmax(rng.rand(n_instances, n_clusters), eps)
    ut = u.T

    for _ in range(max_iter):
        # Copy the old u matrix, normalize the memberships of every instance
        # to sum to 1, and keep them strictly positive.
        u_old = ut.copy()
        u_old /= u_old.sum(axis=0, keepdims=True)
        u_old = np.fmax(u_old, eps)

        # Elevate to m
        um = u_old ** m

        # Cluster centers: membership-weighted means of the data
        centers = um.dot(data) / um.sum(axis=1, keepdims=True)

        # Distances from every center to every instance (floored to avoid
        # raising zero to a negative power below)
        dist = np.fmax(cdist(centers, data, metric='euclidean'), eps)

        # New u matrix
        ut = dist ** (-2. / (m - 1))
        ut /= ut.sum(axis=0, keepdims=True)

        # Stopping criterion
        if np.linalg.norm(ut - u_old) < error:
            break

    partition_matrix = ut.T
    return centers, partition_matrix

In [None]:
def Cluster(nclusters):
    """Run fuzzy c-means on the module-level ``X_train`` with ``nclusters`` clusters.

    Returns the ``(cluster centers, partition matrix)`` pair produced by ``fcm``.
    """
    return fcm(X_train, n_clusters=nclusters)

In [None]:
def gaussian(x, mu, sig):
    """Evaluate an unnormalized Gaussian (peak value 1 at ``x == mu``).

    ``x`` may be a scalar or a NumPy array; ``mu`` is the center and ``sig``
    the width. The ``1 / (sqrt(2*pi) * sig)`` density factor is deliberately
    left out.
    """
    z = (x - mu) / sig
    return np.exp(-np.power(z, 2.0) / 2)

#### Code

In [None]:
# Try every cluster count from 1 to max_clusters and remember the one whose
# separation metric is lowest.
max_clusters = 2
min_metric = np.inf

for i in range(1, max_clusters + 1):
    clust_centers, part_matrix = Cluster(i)
    metric = SeparationMetric(X_train, clust_centers, part_matrix)
    if metric < min_metric:
        min_metric, best_number = metric, i

print(f'Best number of clusters: {best_number}')

In [None]:
# Refit with the selected number of clusters so that `clust_centers` and
# `part_matrix` hold the chosen clustering, then show their shapes.
clust_centers, part_matrix = Cluster(best_number)
clust_centers.shape, part_matrix.shape

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Repeat the cluster-count search (Cluster and SeparationMetric are defined in
# earlier cells), this time recording every metric so its evolution can be plotted.

max_clusters = 2
min_metric = np.inf
metrics = []  # Separation metric for each candidate number of clusters

# Loop through different numbers of clusters
for i in range(1, max_clusters + 1):
    # Candidates get their own names: `clust_centers` / `part_matrix` are
    # consumed by the model-building cell and must end up holding the best
    # clustering, not merely the last candidate tried.
    cand_centers, cand_partition = Cluster(i)
    metric = SeparationMetric(X_train, cand_centers, cand_partition)

    metrics.append(metric)  # Store the metric for the current number of clusters

    if metric < min_metric:
        min_metric = metric
        best_number = i
        clust_centers, part_matrix = cand_centers, cand_partition

# Print the best number of clusters
print(f'Best number of clusters: {best_number}')

# Plotting the evolution of metrics
plt.figure(figsize=(10, 6))
plt.plot(range(1, max_clusters + 1), metrics, marker='o', linestyle='-', color='b')
plt.title('Evolution of Separation Metric Over Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Separation Metric')
plt.grid()
plt.axvline(best_number, color='r', linestyle='--', label=f'Best Number of Clusters: {best_number}')
plt.legend()
plt.show()


### Building the model

In [None]:
# Estimate the antecedent (membership-function) parameters from the training
# inputs and the partition matrix currently in scope.
ae = AntecedentEstimator(X_train, part_matrix)
antecedent_params = ae.determineMF()

# Estimate the consequent parameters from inputs, targets and the same partition matrix.
ce = ConsequentEstimator(X_train, y_train, part_matrix)
conseq_params = ce.suglms()

# Assemble the Sugeno fuzzy inference system from both parameter sets.
modbuilder = SugenoFISBuilder(antecedent_params, conseq_params, var_names, save_simpful_code=False)
model = modbuilder.get_model()

### Inference and Metrics

In [None]:
# Run the model on the test inputs; the first element returned by predict()
# is clipped to [0, 1] so it can be treated as the positive-class score p.
modtester = SugenoFISTester(model, X_test, var_names)
y_pred_probs = clip(modtester.predict()[0], 0, 1)
# Two columns per sample: [1 - p, p].
y_pred_probs = column_stack((1 - y_pred_probs, y_pred_probs))
# The predicted label is the index of the larger of the two columns.
y_pred = argmax(y_pred_probs,axis=1)

In [None]:
# Score the predicted labels against the test targets and print each metric
# with three decimals.
# NOTE(review): recall/precision/F1 are called with scikit-learn's defaults,
# which presumably assume a binary target with positive label 1 — confirm.
acc_score = accuracy_score(y_test, y_pred)
print("Accuracy: {:.3f}".format(acc_score))
rec_score = recall_score(y_test, y_pred)
print("Recall: {:.3f}".format(rec_score))
prec_score = precision_score(y_test, y_pred)
print("Precision Score: {:.3f}".format(prec_score))
F1_score = f1_score(y_test, y_pred)
print("F1-Score: {:.3f}".format(F1_score))
kappa = cohen_kappa_score(y_test, y_pred)
print("Kappa Score: {:.3f}".format(kappa))

## Model Visualization/Interpretation

In [None]:
model._lvs['chol'] # example of how pyFUME defines a linguistic variable in simpful

In [None]:
model.get_rules() # show the rules associated with the model

In [None]:
# The pyFUME library doesn't attribute a universe of discourse to the linguistic variables
# it creates in simpful. To fix this, each universe is set to the [min, max] range held by
# the corresponding training column. This has to be done since, otherwise, the plotting
# functions won't work.
# The loop names deliberately avoid `min` / `max` so the builtins are not shadowed
# for the cells that follow.
for name, lower, upper in zip(var_names, mins, maxs):
    model._lvs[name]._universe_of_discourse = [lower, upper]

In [None]:
# Plot the model's linguistic variables with simpful.
# NOTE(review): the arguments are presumably the output file ("" = none) and
# the maximum number of figures per row — confirm against the simpful docs.
model.produce_figure("",2)

## Converting the model to X

In [None]:
def FunctionstoX(model, stats):
    """Rewrite the model's linear output functions in terms of the raw features.

    Every entry of ``model._outputfunctions`` is expected to be a string of
    the form ``"c1*feat1+c2*feat2+...+c0"`` whose features are standardized,
    ``z = (x - mean) / std`` (the same convention ``UODtoX`` and ``XMF`` use).
    Substituting gives the equivalent function of the raw feature ``x``::

        sum_k c_k * z_k + c0
            = sum_k (c_k / std_k) * x_k + (c0 - sum_k c_k * mean_k / std_k)

    Parameters
    ----------
    model
        Object exposing ``_outputfunctions``, a mapping from output-function
        name to its expression string.
    stats : pandas.DataFrame
        One row per feature with a ``'Feature'`` column; the columns at
        positions 1 and 2 are read as the mean and the variance
        (assumed from the square root taken here — TODO confirm).

    Returns
    -------
    pandas.DataFrame
        One row per output function, with one column per name in the
        module-level ``var_names`` plus ``'constant'``.

    Notes
    -----
    Expressions are split on ``'+'``, so a coefficient printed with a positive
    exponent (e.g. ``1e+16``) would be mis-parsed.
    """
    columns = var_names + ['constant']
    rows = []

    for f in model._outputfunctions:
        expression = model._outputfunctions[f]
        # Every item but the last is "coefficient*feature"; the last is the intercept.
        *terms, constant = expression.split('+')
        intercept = float(constant)
        row = {}

        for term in terms:
            parts = term.split('*')
            feature = parts[1].strip()
            values = stats[stats['Feature'] == feature]
            mean = values.iloc[0, 1]
            std = values.iloc[0, 2] ** 0.5

            # Slope with respect to the raw feature, and the matching shift
            # of the intercept.
            slope = float(parts[0]) / std
            intercept -= slope * mean
            # Store by feature name so the column does not depend on term order.
            row[feature] = slope

        row['constant'] = intercept
        rows.append(row)

    # Build the frame once instead of concatenating onto an empty frame per row.
    return pd.DataFrame(rows, columns=columns)

In [None]:
def UODtoX(model, stats):
    """Map every linguistic variable's universe of discourse to raw feature units.

    Each of the two bounds is transformed as ``bound * std + mean``, the
    inverse of the standardization ``(x - mean) / std``.

    Parameters
    ----------
    model
        Object exposing ``_lvs``, a mapping from variable name to an object
        with a ``_universe_of_discourse`` pair ``[lower, upper]``.
    stats : pandas.DataFrame
        One row per feature with a ``'Feature'`` column; the columns at
        positions 1 and 2 are read as the mean and the variance
        (assumed from the square root taken here — TODO confirm).

    Returns
    -------
    dict
        ``{variable name: [lower, upper]}`` in raw units. The model itself is
        left untouched, so the function can be called repeatedly.
    """
    uods = {}
    for lv in model._lvs:
        data = stats[stats['Feature'] == lv]
        mean = data.iloc[0, 1]
        std = data.iloc[0, 2] ** 0.5
        uod = model._lvs[lv]._universe_of_discourse
        # Build a fresh list: writing into `uod` would rescale the model's own
        # universe and compound the transform on every further call.
        uods[lv] = [bound * std + mean for bound in uod[:2]]
    return uods

In [None]:
class XMF:
    """Gaussian membership function that accepts inputs in raw feature units.

    The raw value is standardized with the feature's mean and standard
    deviation and then passed to ``gaussian`` with the stored ``mu`` and
    ``sigma``.
    """

    def __init__(self, uod, data, mu, sigma):
        # `data` is the statistics row of the feature: position 1 holds the
        # mean and position 2 the value whose square root is the std.
        row_mean, row_spread = data.iloc[0,1], data.iloc[0,2]
        self.uod = uod
        self.mean = row_mean
        self.std = row_spread ** 0.5
        self.mu = mu
        self.sigma = sigma

    def __call__(self, x):
        """Return the membership degree of the raw value ``x``."""
        standardized = (x - self.mean) / self.std
        return gaussian(standardized, self.mu, self.sigma)

In [None]:
def newMFS(model, stats, uods):
    """Build raw-unit membership functions for every linguistic variable.

    For each variable in ``model._lvs`` one ``XMF`` is created per fuzzy set
    in its ``_FSlist``, using the set's ``_funpointer._mu`` / ``_sigma``, the
    variable's row of ``stats`` and its entry in ``uods``.

    Returns a dict mapping each variable name to its list of ``XMF`` objects,
    in fuzzy-set order.
    """
    MFS = {}
    for lv in model._lvs:
        feature_stats = stats[stats['Feature'] == lv]
        feature_uod = uods[lv]
        MFS[lv] = [
            XMF(feature_uod, feature_stats, fuzzy_set._funpointer._mu, fuzzy_set._funpointer._sigma)
            for fuzzy_set in model._lvs[lv]._FSlist
        ]
    return MFS

In [None]:
def plotMFs(mfs, var):
    """Plot every membership function of variable ``var`` on the current axes.

    ``mfs`` maps variable names to lists of callables carrying a ``uod``
    attribute. The curves are sampled at 100 evenly spaced points spanning
    the universe of discourse of the first membership function.
    """
    var_mfs = mfs[var]
    bounds = var_mfs[0].uod
    grid = np.linspace(bounds[0], bounds[1], 100)
    for mf in var_mfs:
        plt.plot(grid, [mf(point) for point in grid])

In [None]:
# Express the model in raw feature units: universes of discourse first, then
# the membership functions (which take those universes), then the consequents.
uods = UODtoX(model, stats)
mfs = newMFS(model, stats, uods)
consequents = FunctionstoX(model, stats)

### Current State

In [None]:
# Inspect the converted universes of discourse.
uods

In [None]:
# Plot the converted membership functions of the 'thalachh' feature.
plotMFs(mfs, 'thalachh')

In [None]:
# Inspect the converted consequent coefficients (one row per output function).
consequents

In [None]:
# Open questions

# Is the sum of the values produced by the consequents equal to a probability?