# Estratégia de Desenvolvimento de Modelos

### Nesta etapa procuramos construir, otimizar e avaliar modelos de aprendizado de máquina para uma tarefa de classificação multiclasse do conjunto de dados examinado na estapa anterior. A construção do modelo envolve o carregamento de dados, seleção de modelos, treinamento de modelos, validação cruzada e avaliação no conjunto de testes.

#### Nós usamos o "k-fold cross validation" para avaliar os modelos, de forma a avaliar a capacidade de generalização domodelo e reduzir o risco de "overfitting". Na creação dos modelos utilizamos k=4 por termos apenas 2000 dados. Para comparar tambem fizemos modelos sem validação cruzada.


In [2]:
from sklearn.model_selection import KFold
from pycm import ConfusionMatrix
from sklearn.metrics import accuracy_score

class KFoldCrossValidator:
    """
    Performs k-fold cross-validation for model evaluation.
    """
    def __init__(self, k=5):
        """
        Initializes the KFoldCrossValidator object.

        Parameters:
            k (int): Number of folds for cross-validation. Default is 5.
        """
        self.k = k
        self.kf = KFold(n_splits=k, shuffle=True)
        self.cm = None
        self.accuracy_scores = []
        self.sensitivity_scores = []
        self.specificity_scores = []

    def cross_validate(self, model, X, y):
        """
        Performs k-fold cross-validation on the given model using the provided features and labels.

        Parameters:
            model: Machine learning model to be evaluated.
            X (array-like): Features.
            y (array-like): Labels.

        Returns:
            tuple: Average accuracy, sensitivity, and specificity scores.
        """
        first = True
        avg_sensitivity = 0
        avg_specificity = 0
        for train_index, val_index in self.kf.split(X):
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            self.cm = ConfusionMatrix(actual_vector=list(y_val), predict_vector=list(y_pred))
            
            if (first):
                print("Fisrt Confusion Matrix:\n",self.cm)
                first = False

            self.accuracy_scores.append(accuracy_score(y_val, y_pred))
            if self.cm.TPR_Macro != 'None':
                self.sensitivity_scores.append(float(self.cm.TPR_Macro))
            if self.cm.TNR_Macro != 'None':
                self.specificity_scores.append(float(self.cm.TNR_Macro))

        avg_accuracy = sum(self.accuracy_scores) / len(self.accuracy_scores)
        if len(self.sensitivity_scores) != 0:
            avg_sensitivity = sum(self.sensitivity_scores) / len(self.sensitivity_scores)
        if len(self.specificity_scores) != 0:
            avg_specificity = sum(self.specificity_scores) / len(self.specificity_scores)
            
        print("Last Confusion Matrix:\n",self.cm)

        return (( "avg_accuracy", avg_accuracy, self.k ) ,
                ( "avg_sensitivity" , avg_sensitivity , len(self.sensitivity_scores) ),
                ( "avg_specificity" ,avg_specificity ,len(self.specificity_scores) ))

    def evaluate_on_test_set(self, model, X_test, y_test):
        """
        Evaluates the trained model on the test set.

        Parameters:
            model: Trained machine learning model.
            X_test (array-like): Test features.
            y_test (array-like): Test labels.

        Returns:
            tuple: Accuracy, sensitivity, and specificity scores on the test set.
        """
        y_pred = model.predict(X_test)
        cm = ConfusionMatrix(actual_vector=list(y_test), predict_vector=list(y_pred))
        accuracy = cm.Overall_ACC
        sensitivity = cm.TPR_Macro
        specificity = cm.TNR_Macro
        return accuracy, sensitivity, specificity

In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd
import joblib
# Loading Data
Complete_data = pd.read_csv("Complete_treated_dataset")

# Separating the labels from de data
data_targets_y = Complete_data["fetal_health"].astype(int)
Complete_data_X = Complete_data.drop(["fetal_health"], axis=1)

# Spliting the dataset in tain and 
Complete_data_X_train, Complete_data_X_test,  Complete_data_y_train, Complete_data_y_test \
    = train_test_split(Complete_data_X, data_targets_y, test_size=0.2, random_state=50)

def creat_model_with_cross_validation(name, model, x, y, CrossValidations=4):
    """
    Creates and fits a model with x, y data, trains and tests.
    :param name: fileName
    :param model: Model
    :param x: x_data pd
    :param y: label data pd
    :param CrossValidations: number of k-folds cross-validations
    :return: None
    """
    kFoldCrossValidator = KFoldCrossValidator(3)
    accuracy, sensitivity, specificity = kFoldCrossValidator.cross_validate(model, np.array(x), np.array(y))
    print("Accuracy:", accuracy[1])
    print("Sensitivity:", sensitivity[1])
    print("Specificity:", specificity[1])
    joblib.dump(model, "Models\\"+name)
    
def creat_model_without_cross_validation(name, model,
                                         X_Train=Complete_data_X_train,
                                         x_Test=Complete_data_X_test,
                                         Y_Train=Complete_data_y_train,
                                         y_Test=Complete_data_y_test):
    """
    Creates and fits a model with Complete_data_X_train, Complete_data_y_train data, trains and tests.
    :param name: fileName
    :param model: Model
    :return: None
    """
    model.fit(X_Train, Y_Train)
    y_pred = model.predict(x_Test)
    conf_matrix = ConfusionMatrix(actual_vector=np.array(y_Test), predict_vector=np.array(y_pred))
    print(conf_matrix)
    joblib.dump(model, "Models\\"+name)

    

## Supervised learning

### 1 - KNN
#### Implementamos o KNN utilizando somente o numpy e creamos o modelo. O melhor k encontrado foi 5, para k maiores e menores as metricas de avaliação começaram a apresentar valores piores.

In [4]:
import numpy as np

class KNN:
    """
    K-Nearest Neighbors (kNN) classifier implemented using numpy arrays.

    Parameters:
    - k: int, optional (default=5)
        Number of nearest neighbors to consider during classification.
    """

    def __init__(self, k=5):
        """
        Initialize the KNN classifier.

        Parameters:
        - k: int, optional (default=5)
            Number of nearest neighbors to consider during classification.
        """
        self.k = k

    def fit(self, X_train, y_train):
        """
        Train the KNN classifier.

        Parameters:
        - X_train: numpy array, shape (n_samples, n_features)
            Training data.
        - y_train: numpy array, shape (n_samples,)
            Labels corresponding to the training data.
        """
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, x_test):
        """
        Predict the class labels for test data.

        Parameters:
        - X_test: numpy array, shape (n_samples, n_features)
            Test data.

        Returns:
        - y_pred: numpy array, shape (n_samples,)
            Predicted class labels for the test data.
        """
        y_pred = np.empty(x_test.shape[0], dtype=self.y_train.dtype)

        # Iterate over each test sample
        for i, sample in enumerate(x_test):

            # Calculate Euclidean distances between the test sample and all training samples
            distances = np.linalg.norm(self.X_train - sample, axis=1)

            # Find the indices of the k nearest neighbors
            nearest_indices = np.argsort(distances)[:self.k]

            # Get the labels of the k nearest neighbors
            nearest_labels = self.y_train[nearest_indices]

            # Predict the class label based on majority vote
            y_pred[i] = np.bincount(nearest_labels).argmax()

        return y_pred

In [4]:
creat_model_with_cross_validation("KNN_Model",KNN(5),Complete_data_X,data_targets_y,3)

Fisrt Confusion Matrix:
 Predict   1         2         3         
Actual
1         519       21        4         

2         42        50        3         

3         12        15        38        





Overall Statistics : 

95% CI                                                            (0.83675,0.88768)
ACC Macro                                                         0.90814
ARI                                                               0.57789
AUNP                                                              0.79622
AUNU                                                              0.77623
Bangdiwala B                                                      0.84665
Bennett S                                                         0.79332
CBA                                                               0.67223
CSI                                                               0.46552
Chi-Squared                                                       529.7702
Chi-Squared DF         

In [5]:
knn = KNN(5)
knn.fit(Complete_data_X_train.values, Complete_data_y_train.values)
y_pred = knn.predict(Complete_data_X_test.values)
conf_matrix = ConfusionMatrix(actual_vector=np.array(Complete_data_y_test), predict_vector=np.array(y_pred))
print(conf_matrix)

Predict   1         2         3         
Actual
1         317       17        2         

2         14        32        3         

3         6         4         28        





Overall Statistics : 

95% CI                                                            (0.86158,0.92092)
ACC Macro                                                         0.9275
ARI                                                               0.64722
AUNP                                                              0.85049
AUNU                                                              0.83906
Bangdiwala B                                                      0.87371
Bennett S                                                         0.83688
CBA                                                               0.76042
CSI                                                               0.57542
Chi-Squared                                                       402.58682
Chi-Squared DF                                  

### 2 - Gaussian NB

In [6]:
from sklearn.naive_bayes import GaussianNB
creat_model_with_cross_validation("GaussianNB_CrossModel",GaussianNB(),Complete_data_X,data_targets_y,3)

Fisrt Confusion Matrix:
 Predict   1         2         3         
Actual
1         374       158       21        

2         0         85        6         

3         1         22        37        





Overall Statistics : 

95% CI                                                            (0.67084,0.73825)
ACC Macro                                                         0.80303
ARI                                                               0.28406
AUNP                                                              0.82891
AUNU                                                              0.81414
Bangdiwala B                                                      0.6309
Bennett S                                                         0.55682
CBA                                                               0.52506
CSI                                                               0.37442
Chi-Squared                                                       387.27101
Chi-Squared DF         

In [7]:
creat_model_without_cross_validation("GaussianNB_Model", GaussianNB() )

Predict   1         2         3         
Actual
1         271       56        9         

2         2         45        2         

3         1         14        23        





Overall Statistics : 

95% CI                                                            (0.7634,0.83944)
ACC Macro                                                         0.86761
ARI                                                               0.46972
AUNP                                                              0.87489
AUNU                                                              0.84666
Bangdiwala B                                                      0.7677
Bennett S                                                         0.70213
CBA                                                               0.60104
CSI                                                               0.46233
Chi-Squared                                                       297.12223
Chi-Squared DF                                   

### 3 - Decision Tree Classifier

In [8]:
from sklearn.tree import DecisionTreeClassifier
Dtree = DecisionTreeClassifier()
creat_model_with_cross_validation("Dtree_CrossModel",Dtree,Complete_data_X,data_targets_y)

Fisrt Confusion Matrix:
 Predict   1         2         3         
Actual
1         529       18        3         

2         20        79        4         

3         3         0         48        





Overall Statistics : 

95% CI                                                            (0.9132,0.95044)
ACC Macro                                                         0.95455
ARI                                                               0.76136
AUNP                                                              0.90499
AUNU                                                              0.91333
Bangdiwala B                                                      0.91147
Bennett S                                                         0.89773
CBA                                                               0.86602
CSI                                                               0.77183
Chi-Squared                                                       974.57887
Chi-Squared DF         

In [9]:
creat_model_without_cross_validation("Dtree_Model", DecisionTreeClassifier() )

Predict   1         2         3         
Actual
1         317       17        2         

2         9         38        2         

3         1         0         37        





Overall Statistics : 

95% CI                                                            (0.90188,0.95155)
ACC Macro                                                         0.95114
ARI                                                               0.74558
AUNP                                                              0.91461
AUNU                                                              0.92031
Bangdiwala B                                                      0.90517
Bennett S                                                         0.89007
CBA                                                               0.8456
CSI                                                               0.7518
Chi-Squared                                                       571.96112
Chi-Squared DF                                   

### 4 - Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
creat_model_with_cross_validation("Lregression_CrossModel",LogisticRegression(),Complete_data_X,data_targets_y)

Fisrt Confusion Matrix:
 Predict   1         2         3         
Actual
1         543       15        2         

2         37        37        10        

3         4         10        46        





Overall Statistics : 

95% CI                                                            (0.86602,0.91239)
ACC Macro                                                         0.92614
ARI                                                               0.66667
AUNP                                                              0.82816
AUNU                                                              0.80552
Bangdiwala B                                                      0.88862
Bennett S                                                         0.83381
CBA                                                               0.71231
CSI                                                               0.49882
Chi-Squared                                                       601.81698
Chi-Squared DF        

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Last Confusion Matrix:
 Predict   1         2         3         
Actual
1         530       7         1         

2         33        66        3         

3         3         10        50        





Overall Statistics : 

95% CI                                                            (0.89874,0.9391)
ACC Macro                                                         0.94595
ARI                                                               0.75445
AUNP                                                              0.87364
AUNU                                                              0.86219
Bangdiwala B                                                      0.90954
Bennett S                                                         0.87838
CBA                                                               0.79237
CSI                                                               0.69445
Chi-Squared                                                       852.64584
Chi-Squared DF          

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
creat_model_without_cross_validation("Lregression_Model",LogisticRegression())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Predict   1         2         3         
Actual
1         326       8         2         

2         11        37        1         

3         1         11        26        





Overall Statistics : 

95% CI                                                            (0.89371,0.94553)
ACC Macro                                                         0.94641
ARI                                                               0.78044
AUNP                                                              0.90174
AUNU                                                              0.86884
Bangdiwala B                                                      0.92256
Bennett S                                                         0.87943
CBA                                                               0.76981
CSI                                                               0.64377
Chi-Squared                                                       472.44398
Chi-Squared DF                                 

### 2 - Ensemble models Bagging 
### 2.1 - Extra Trees Classifier


In [12]:
from sklearn.ensemble import ExtraTreesClassifier
creat_model_with_cross_validation("ExtraTreesClassifier_CrossModel",ExtraTreesClassifier(),Complete_data_X,data_targets_y)

Fisrt Confusion Matrix:
 Predict   1         2         3         
Actual
1         544       9         0         

2         25        65        3         

3         3         6         49        





Overall Statistics : 

95% CI                                                            (0.9164,0.95291)
ACC Macro                                                         0.95644
ARI                                                               0.78681
AUNP                                                              0.89269
AUNU                                                              0.88548
Bangdiwala B                                                      0.92591
Bennett S                                                         0.90199
CBA                                                               0.8316
CSI                                                               0.74444
Chi-Squared                                                       931.39588
Chi-Squared DF          

In [13]:
creat_model_without_cross_validation("ExtraTreesClassifier_Model",ExtraTreesClassifier())


Predict   1         2         3         
Actual
1         332       3         1         

2         6         42        1         

3         1         6         31        





Overall Statistics : 

95% CI                                                            (0.93821,0.97668)
ACC Macro                                                         0.97163
ARI                                                               0.88462
AUNP                                                              0.94514
AUNU                                                              0.92522
Bangdiwala B                                                      0.95999
Bennett S                                                         0.93617
CBA                                                               0.87289
CSI                                                               0.8011
Chi-Squared                                                       618.41613
Chi-Squared DF                                  

### 2.2 Bagging Classifier


In [14]:
from sklearn.ensemble import BaggingClassifier
creat_model_with_cross_validation("BaggingClassifier_CrossModel",BaggingClassifier(),Complete_data_X,data_targets_y)

Fisrt Confusion Matrix:
 Predict   1         2         3         
Actual
1         534       10        2         

2         32        66        1         

3         3         2         54        





Overall Statistics : 

95% CI                                                            (0.91,0.94795)
ACC Macro                                                         0.95265
ARI                                                               0.74595
AUNP                                                              0.877
AUNU                                                              0.88566
Bangdiwala B                                                      0.90884
Bennett S                                                         0.89347
CBA                                                               0.84014
CSI                                                               0.76398
Chi-Squared                                                       965.19621
Chi-Squared DF             

In [15]:
creat_model_without_cross_validation("BaggingClassifier_Model",BaggingClassifier())

Predict   1         2         3         
Actual
1         329       4         3         

2         12        35        2         

3         0         0         38        





Overall Statistics : 

95% CI                                                            (0.92965,0.97105)
ACC Macro                                                         0.9669
ARI                                                               0.82254
AUNP                                                              0.91919
AUNU                                                              0.92197
Bangdiwala B                                                      0.93895
Bennett S                                                         0.92553
CBA                                                               0.85427
CSI                                                               0.81314
Chi-Squared                                                       626.80826
Chi-Squared DF                                  

### 2.3 Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier
creat_model_with_cross_validation("RandomForestClassifier_CrossModel",RandomForestClassifier(),Complete_data_X,data_targets_y)

Fisrt Confusion Matrix:
 Predict   1         2         3         
Actual
1         536       5         0         

2         31        79        2         

3         4         6         41        





Overall Statistics : 

95% CI                                                            (0.9132,0.95044)
ACC Macro                                                         0.95455
ARI                                                               0.77331
AUNP                                                              0.88182
AUNU                                                              0.87728
Bangdiwala B                                                      0.91916
Bennett S                                                         0.89773
CBA                                                               0.81599
CSI                                                               0.75667
Chi-Squared                                                       943.68987
Chi-Squared DF         

In [17]:
creat_model_without_cross_validation("RandomForestClassifier_Model",RandomForestClassifier())

Predict   1         2         3         
Actual
1         332       3         1         

2         8         40        1         

3         1         4         33        





Overall Statistics : 

95% CI                                                            (0.93821,0.97668)
ACC Macro                                                         0.97163
ARI                                                               0.86966
AUNP                                                              0.93632
AUNU                                                              0.92425
Bangdiwala B                                                      0.9552
Bennett S                                                         0.93617
CBA                                                               0.88612
CSI                                                               0.81346
Chi-Squared                                                       630.12766
Chi-Squared DF                                  

### 3 - Ensemble models boosting
### 3.1 - Gradient Boosting Classifier

In [18]:
from sklearn.ensemble import GradientBoostingClassifier
creat_model_with_cross_validation("GradientBoostingClassifier_CrossModel",GradientBoostingClassifier(),Complete_data_X,data_targets_y)

Fisrt Confusion Matrix:
 Predict   1         2         3         
Actual
1         534       9         0         

2         18        76        2         

3         4         0         61        





Overall Statistics : 

95% CI                                                            (0.93751,0.96874)
ACC Macro                                                         0.96875
ARI                                                               0.83271
AUNP                                                              0.92271
AUNU                                                              0.9265
Bangdiwala B                                                      0.9379
Bennett S                                                         0.92969
CBA                                                               0.89685
CSI                                                               0.84545
Chi-Squared                                                       1105.24786
Chi-Squared DF         

In [19]:
creat_model_without_cross_validation("GradientBoostingClassifier_Model",GradientBoostingClassifier())


Predict   1         2         3         
Actual
1         332       1         3         

2         8         39        2         

3         0         0         38        





Overall Statistics : 

95% CI                                                            (0.94986,0.98395)
ACC Macro                                                         0.97794
ARI                                                               0.8848
AUNP                                                              0.94619
AUNU                                                              0.94607
Bangdiwala B                                                      0.96058
Bennett S                                                         0.95035
CBA                                                               0.88537
CSI                                                               0.87307
Chi-Squared                                                       689.9234
Chi-Squared DF                                   

### 3.2  AdaBoost Classifier


In [20]:
from sklearn.ensemble import AdaBoostClassifier
creat_model_with_cross_validation("GradientBoostingClassifier_CrossModel",AdaBoostClassifier(),Complete_data_X,data_targets_y)

Fisrt Confusion Matrix:
 Predict   1         2         3         
Actual
1         511       43        0         

2         26        68        0         

3         16        3         37        





Overall Statistics : 

95% CI                                                            (0.85057,0.89943)
ACC Macro                                                         0.91667
ARI                                                               0.56251
AUNP                                                              0.8223
AUNU                                                              0.82518
Bangdiwala B                                                      0.83695
Bennett S                                                         0.8125
CBA                                                               0.72653
CSI                                                               0.60901
Chi-Squared                                                       701.53914
Chi-Squared DF          

In [21]:
creat_model_without_cross_validation("GradientBoostingClassifier_Model",AdaBoostClassifier())


Predict   1         2         3         
Actual
1         310       23        3         

2         9         39        1         

3         3         0         35        





Overall Statistics : 

95% CI                                                            (0.88023,0.93537)
ACC Macro                                                         0.93853
ARI                                                               0.67809
AUNP                                                              0.89509
AUNU                                                              0.90496
Bangdiwala B                                                      0.87698
Bennett S                                                         0.8617
CBA                                                               0.81636
CSI                                                               0.7096
Chi-Squared                                                       527.88401
Chi-Squared DF                                   

### 3.3 Histogram Gradient Boosting Classifier

In [22]:
from sklearn.ensemble import HistGradientBoostingClassifier
creat_model_with_cross_validation("HistGradientBoostingClassifier_CrossModel",HistGradientBoostingClassifier(),Complete_data_X,data_targets_y)


Fisrt Confusion Matrix:
 Predict   1         2         3         
Actual
1         551       7         1         

2         12        73        3         

3         1         3         53        





Overall Statistics : 

95% CI                                                            (0.94746,0.97583)
ACC Macro                                                         0.97443
ARI                                                               0.87537
AUNP                                                              0.94396
AUNU                                                              0.93883
Bangdiwala B                                                      0.95676
Bennett S                                                         0.94247
CBA                                                               0.91211
CSI                                                               0.84378
Chi-Squared                                                       1101.66105
Chi-Squared DF       

In [23]:
creat_model_without_cross_validation("GradientBoostingClassifier_Model",HistGradientBoostingClassifier())

Predict   1         2         3         
Actual
1         329       5         2         

2         8         40        1         

3         0         0         38        





Overall Statistics : 

95% CI                                                            (0.94399,0.98036)
ACC Macro                                                         0.97478
ARI                                                               0.86081
AUNP                                                              0.94344
AUNU                                                              0.94706
Bangdiwala B                                                      0.95119
Bennett S                                                         0.94326
CBA                                                               0.90647
CSI                                                               0.86249
Chi-Squared                                                       683.01493
Chi-Squared DF                                 

### 4 - Clustering
### 4.1 - Kmeans
#### 4.1.1 - 3 classes

In [24]:
from sklearn.cluster import KMeans
data_targets_y_m = data_targets_y -1
creat_model_with_cross_validation("KMeans3c_CrossModel",KMeans(n_clusters=2, random_state=20, n_init="auto"),Complete_data_X,data_targets_y_m)

Fisrt Confusion Matrix:
 Predict   0         1         2         
Actual
0         247       295       0         

1         84        18        0         

2         32        28        0         





Overall Statistics : 

95% CI                                                            (0.34063,0.41221)
ACC Macro                                                         0.58428
ARI                                                               0.02468
AUNP                                                              0.3737
AUNU                                                              0.3966
Bangdiwala B                                                      0.26491
Bennett S                                                         0.06463
CBA                                                               0.1695
CSI                                                               None
Chi-Squared                                                       None
Chi-Squared DF                   

In [25]:
Complete_data_y_test_m = Complete_data_y_test - 1
Complete_data_y_train_m = Complete_data_y_train - 1
creat_model_without_cross_validation("KMeans3c_Model", KMeans(n_clusters=2, random_state=0, n_init="auto"),y_Test=Complete_data_y_test_m,Y_Train=Complete_data_y_train_m)


Predict   0         1         2         
Actual
0         146       190       0         

1         39        10        0         

2         15        23        0         





Overall Statistics : 

95% CI                                                            (0.32281,0.41477)
ACC Macro                                                         0.5792
ARI                                                               0.02632
AUNP                                                              0.4049
AUNU                                                              0.40807
Bangdiwala B                                                      0.27412
Bennett S                                                         0.05319
CBA                                                               0.15979
CSI                                                               None
Chi-Squared                                                       None
Chi-Squared DF                                           

#### 4.1.2 - 4 classes

In [26]:
creat_model_with_cross_validation("KMeans4c_CrossModel",KMeans(n_clusters=4, random_state=20, n_init="auto"),Complete_data_X,data_targets_y_m)

Fisrt Confusion Matrix:
 Predict   0         1         2         3         
Actual
0         166       147       114       134       

1         9         2         48        25        

2         1         33        12        13        

3         0         0         0         0         





Overall Statistics : 

95% CI                                                            (0.22346,0.28791)
ACC Macro                                                         0.62784
ARI                                                               0.02098
AUNP                                                              None
AUNU                                                              None
Bangdiwala B                                                      0.2229
Bennett S                                                         0.00758
CBA                                                               0.09396
CSI                                                               None
Chi-Squared     

In [27]:
creat_model_without_cross_validation("KMeans4c_Model", KMeans(n_clusters=4, random_state=0, n_init="auto"),y_Test=Complete_data_y_test_m,Y_Train=Complete_data_y_train_m)


Predict   0         1         2         3         
Actual
0         80        92        64        100       

1         9         1         30        9         

2         2         22        13        1         

3         0         0         0         0         





Overall Statistics : 

95% CI                                                            (0.1826,0.26184)
ACC Macro                                                         0.61111
ARI                                                               0.02286
AUNP                                                              None
AUNU                                                              None
Bangdiwala B                                                      0.16312
Bennett S                                                         -0.03704
CBA                                                               0.09207
CSI                                                               None
Chi-Squared                             

### 4.2 gaussian Mixture
#### 4.2.1 gaussian Mixture 3 classes

In [28]:
from sklearn.mixture import GaussianMixture
creat_model_with_cross_validation("GaussianMixture3c_CrossModel", GaussianMixture(n_components=3),Complete_data_X,data_targets_y_m)


Fisrt Confusion Matrix:
 Predict   0         1         2         
Actual
0         387       42        117       

1         76        1         25        

2         17        29        10        





Overall Statistics : 

95% CI                                                            (0.52872,0.60196)
ACC Macro                                                         0.71023
ARI                                                               0.0822
AUNP                                                              0.53716
AUNU                                                              0.49525
Bangdiwala B                                                      0.53922
Bennett S                                                         0.34801
CBA                                                               0.26146
CSI                                                               -0.40564
Chi-Squared                                                       120.23016
Chi-Squared DF        

In [29]:
creat_model_without_cross_validation("GaussianMixture3c_Model", GaussianMixture(n_components=3),y_Test=Complete_data_y_test_m,Y_Train=Complete_data_y_train_m)


Predict   0         1         2         
Actual
0         243       24        69        

1         42        1         6         

2         13        22        3         





Overall Statistics : 

95% CI                                                            (0.53695,0.6309)
ACC Macro                                                         0.72262
ARI                                                               0.07888
AUNP                                                              0.52501
AUNU                                                              0.47876
Bangdiwala B                                                      0.56036
Bennett S                                                         0.37589
CBA                                                               0.26069
CSI                                                               -0.43409
Chi-Squared                                                       96.22711
Chi-Squared DF                                  

#### 4.2.2 gaussian Mixture 4 classes

In [30]:
creat_model_with_cross_validation("GaussianMixture3c_CrossModel", GaussianMixture(n_components=4),Complete_data_X,data_targets_y_m)


Fisrt Confusion Matrix:
 Predict   0         1         2         3         
Actual
0         49        83        334       77        

1         8         1         39        54        

2         8         32        8         11        

3         0         0         0         0         





Overall Statistics : 

95% CI                                                            (0.06208,0.1027)
ACC Macro                                                         0.54119
ARI                                                               0.13958
AUNP                                                              None
AUNU                                                              None
Bangdiwala B                                                      0.03543
Bennett S                                                         -0.22348
CBA                                                               0.02996
CSI                                                               None
Chi-Squared    

In [31]:
creat_model_without_cross_validation("GaussianMixture3c_Model", GaussianMixture(n_components=4),y_Test=Complete_data_y_test_m,Y_Train=Complete_data_y_train_m)


Predict   0         1         2         3         
Actual
0         189       33        60        54        

1         26        1         1         21        

2         5         1         22        10        

3         0         0         0         0         





Overall Statistics : 

95% CI                                                            (0.45353,0.54883)
ACC Macro                                                         0.75059
ARI                                                               0.06948
AUNP                                                              None
AUNU                                                              None
Bangdiwala B                                                      0.45953
Bennett S                                                         0.33491
CBA                                                               0.21199
CSI                                                               None
Chi-Squared                             

### 4.3 DBSCAN 

In [6]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=29, min_samples=2)# 29,2
dbscan.fit(Complete_data_X_train)
y_pred = dbscan.fit_predict(Complete_data_X_test)
print( ConfusionMatrix( actual_vector=np.array(Complete_data_y_test), predict_vector=np.array(y_pred) ) )
joblib.dump(dbscan, "Models\\"+"DBSCAN")

Predict   -1        0         1         2         3         
Actual
-1        0         0         0         0         0         

0         0         0         0         0         0         

1         2         269       65        0         0         

2         0         48        1         0         0         

3         0         18        20        0         0         





Overall Statistics : 

95% CI                                                            (0.1193,0.18803)
ACC Macro                                                         0.66147
ARI                                                               0.03796
AUNP                                                              None
AUNU                                                              None
Bangdiwala B                                                      0.14621
Bennett S                                                         -0.05792
CBA                                                               0.03869

['Models\\DBSCAN']

### 5 Neural Network

In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense

# Load your dataset (assuming it's in CSV format)
complete_data = pd.read_csv("Complete_treated_dataset")
y = data_targets_y - 1

# Split the dataset into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Complete_data, y, test_size=0.2, random_state=42)

# Build the model
model = Sequential([
    Flatten(input_shape=Complete_data.shape[1:]),     # Flatten the input shape
    # Dense(128, activation='relu'),
    # Dense(3, activation='softmax')
    Dense(2005, activation="selu"),
    Dense(2005, activation='relu'),
    Dense(3, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',                   # Optimizer: Adam
              loss='sparse_categorical_crossentropy',  # Loss function: Sparse categorical crossentropy
              metrics=['accuracy'])               # Metrics to monitor: Accuracy

# Train the model
model.fit(X_train, y_train, epochs=50, validation_split=0.2)  # Assuming you want to use 20% of data for validation

y_pred = model.predict(X_test)
classes_previstas = np.argmax(y_pred, axis=1).astype(int)

conf_matrix = ConfusionMatrix(actual_vector=np.array(y_test.astype(int)), predict_vector=np.array(classes_previstas))
print(f'Test accuracy: {conf_matrix}')
joblib.dump(model, "Models\\"+"NeuralNetwork")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: Predict   0         1         2         
Actual
0         328       0         0         

1         7         63        0         

2         2         2         21        





Overall Statistics : 

95% CI                                                            (0.95883,0.98916)
ACC Macro                                                         0.98266
ARI                                

['Models\\NeuralNetwork']