# <font color = darkblue> Bem-vindo/a!
##### Iremos guiá-lo por este notebook relativo a uma data analysis sobre dados de pacientes de carcinoma hepatocelular.

### Índice:

    1. Introdução à análise:
        1.1 Contexto do problema
        1.2 Expectativas e objetivos
        
    2. Inicialização do dataset
    
    3. Estatísticas descritivas báscias:
        3.1 Média
        3.2 Mediana
        3.3 Desvio
        3.4 Assimetria
        3.5 Curtose
        
    4. Relotório de Análise - DataPrep
    
    5. Inputação dos missing values:
        5.1 Identificação visual dos missing values
        5.2 Heterogeneous Euclidean-Overlap Metric para medir distâncias entre pacientes
        5.3 Inputação de missing values por HEOM
    
    6. Ajuste de Outliers
        6.1 Identificação visual dos outliers
        6.2 K Nearest Neighbours para enquadramento dos ouliers 
        6.2 Expansão visual dos outliers



# <font color = darkblue> 2. Inicialização do dataset

In [2]:
import pandas as pd
import numpy as np
import heapq
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
class Dataset:
    def __init__(self, df, missing_values):
        self.df = df
        self.missing_values = missing_values


    def pintarMissingValues(self):#pintar a tabela de missing values
        if self.missing_values is not None:#se existirem missing values
            self.df.replace(self.missing_values, "NaN", inplace=True)#substituir missing values por string "NaN" devido a limitação do site 
            return self.df.style.applymap(lambda valor: "color: red;" if valor=="NaN" else "")#pintar missing values a vermelho
        else: return self.df #se não existirem missing values


    def missing_values_percentagem(self):#Percentagem de missing values
        self.df.replace(self.missing_values, np.nan, inplace=True)#substituir missing values por NaN e nao string "NaN"
        missing_values_percentages = self.df.isnull().mean() * 100#calcular a percentagem de missing values
        return missing_values_percentages.tolist()#retornar a percentagem de missing values
    
    
    def remove_int_columns(self):
        df_copy = self.df.copy()  # create a copy of the dataframe
        numerical=self.df_num()
        common_columns = set(self.df.columns) & set(numerical.columns)
        df_copy = df_copy.drop(common_columns, axis=1)
        
        return df_copy
    

    def df_num(self):
        # Replace missing values with None
        dataframe= self.replace_nan_with_none()

        # Convert all columns to numeric, replacing non-numeric values with NaN
        for col in dataframe.columns:
            dataframe[col] = pd.to_numeric(dataframe[col], errors='coerce')

        # Remove columns that only contain None values
        self.df = dataframe.dropna(axis=1, how='all')

        return self.df


    def replace_nan_with_none(self):
        self.df.replace(self.missing_values, None, inplace=True)
        return self.df


    def pintarOutliers(self, df, outliers):
        def highlight_value(series, column):#Pintar as células que são outliers de azul
            return ['background-color: blue' if (column, index) in outliers else '' for index in series.index]
        return df.style.apply(lambda x: highlight_value(x, x.name), axis=0)#Aplicar a função a cada coluna
    

    def tabelaHEOM(self):
        self.df = self.replace_nan_with_none()#Trocar missing values para none
        tabela = pd.DataFrame()
        for i in range(len(self.df)):
            lista = []
            for j in range(len(self.df)):#Não interessa comparar pares de pacientes duas vezes
                if i >= j:
                    lista.append("X")# colocar x por motivos estéticos
                else:
                    lista.append(self.HEOM(i, j))# lista de um paciente em calculo HEOM

            tabela = pd.concat([tabela, pd.DataFrame({i: lista})], axis=1)#adicionar a lista à tabela
        return tabela
    

    def HEOM(self, paciente_1, paciente_2): #Heterogeneous Euclidean-Overlap Metric
        soma = 0
        for feature in self.df.columns:# iterar sobre as V
            distancia = self.distanciaGeral(feature, paciente_1, paciente_2)# calcular a sua "distancia"
            soma += distancia**2
        soma= soma**(1/2)
        return soma
    

    def distanciaGeral(self, feature:str, paciente_1:int, paciente_2:int)->int:
        try :#Se a variavel for numerica vem para aqui
            #distancia normalizada
            valorPaciente_1 = float(self.df.loc[paciente_1, feature])
            valorPaciente_2 = float(self.df.loc[paciente_2, feature])
            numeric_feature = pd.to_numeric(self.df[feature], errors='coerce')
            return abs(valorPaciente_1 - valorPaciente_2) / (numeric_feature.max() - numeric_feature.min())# retornar a range 
        except :#Se a variavel for categorica vem para aqui
            valorPaciente_1 = self.df.loc[paciente_1, feature]
            valorPaciente_2 = self.df.loc[paciente_2, feature]
            if valorPaciente_1 == valorPaciente_2 and  not pd.isna(valorPaciente_1):#Se forem iguais e não forem missing values
                return 0
            else: 
                return 1
    

    def outliers(self,info:str,vizinhos=None)->pd.DataFrame:
        # Selecionar apenas as colunas numéricas
        categorical_features = self.remove_int_columns() 
        numeric_df = self.df_num()

        colunas_numericas = numeric_df.columns
        if info == 'style':
            outliers = set()
        for coluna in colunas_numericas:#calcular os outliers usando o IQR
            if info == 'tratamento':
                outliers = []
            q1 = numeric_df[coluna].quantile(0.25)
            q3 = numeric_df[coluna].quantile(0.75)
            iqr = q3 - q1
            limite_inferior = q1 - 1.5 * iqr
            limite_superior = q3 + 1.5 * iqr
            for index, value in numeric_df[coluna].items():#adicionar outliers ao set
                if value < limite_inferior or value > limite_superior:
                    if info == 'tratamento' and coluna not in ["Iron", "Sat", "Ferritin"]:
                        if self.df.loc[index, coluna] > limite_superior * 5 or self.df.loc[index, coluna] < limite_inferior * 5:
                            outliers.append((index, coluna))
                    elif info == 'style':
                        outliers.add((coluna, index))
            if info == 'tratamento':
                self.df= self.tratamentoOutliers(outliers, coluna,vizinhos)
        if info == 'style':
            # Apply styling to outliers
            styled_df = self.pintarOutliers(numeric_df, outliers)
            return styled_df
        if info == 'tratamento': 
            self.df = (pd.concat([categorical_features,self.df ], axis=1))
            return self.df
    
    
    def tratamentoOutliers(self, outliers, coluna,vizinhos):
        
        lista_valores = self.df[coluna].tolist()#todos os valores da coluna 
        contador = -1
        valores_out = [self.df.loc[index,coluna] for index,coluna in outliers]#valores dos outliers
        for valor_outlier in valores_out:# iterar por todos os outliers
            contador+=1
            outlier = valor_outlier
            dicionario_distancias = []
            for valor in lista_valores:

                if outlier != valor and valor not in valores_out and not pd.isna(valor):

                    distancia = self.HEOM(lista_valores.index(valor), lista_valores.index(outlier))#calcular a distancia entre o outlier e os outros valores
                    if len(dicionario_distancias) < vizinhos:
                        heapq.heappush(dicionario_distancias, (-distancia, valor))
                    else:
                        if -distancia > dicionario_distancias[0][0]:
                            heapq.heapreplace(dicionario_distancias, (-distancia, valor))

            k_proximos = [abs(item[1]) for item in dicionario_distancias]# selecionar os k vizinhos mais proximos
            
            media = sum(k_proximos)/len(k_proximos)
            self.df.loc[outliers[contador][0], coluna] = media
        return self.df


    def fill_missing_values(self, nr_vizinhos:int) -> pd.DataFrame:

        self.df = self.replace_nan_with_none() # Replace missing values with None 

        self.df = self.df.drop(['Iron', 'Sat', 'Ferritin'], axis=1)# Drop unnecessary columns
        df_copiada = self.df.copy()# Create a copy of the DataFrame

        for i in range(len(self.df)): # Iterate over each row
            row = self.df.iloc[i]
            
            if row.isnull().any():# Check if the row has any missing values
                closest_rows = self.linhas_mais_proximas(nr_vizinhos, i)# Get the indices of the closest rows
    
                for col in self.df.columns:# Iterate over each column
                    if pd.isnull(row[col]): # If the value is missing, replace it with the most common value or mean from the closest rows
                        df_copiada.loc[i, col] = self.subs_na_tabela(closest_rows, col,nr_vizinhos,i)
        return df_copiada
    

    def subs_na_tabela(self, closest_rows:list, col:int,vizinhos,i)->float | str :
        # Initialize values
        column_values = []

        for row_index in closest_rows:
            try:
                # Check the type of values
                value = float(self.df.loc[row_index, col])
            except:
                value = self.df.loc[row_index, col]

            if value is not None and not pd.isna(value):
                column_values.append(value)
        if len(column_values) == 0:

            return self.subs_na_tabela(self.linhas_mais_proximas(vizinhos+1,i), col,vizinhos+1,i)
        # Calculate the result based on the type of values
        if isinstance(column_values[0], str):

            # If values are strings, return the most frequent value
            return max(set(column_values), key=column_values.count)
        elif isinstance(column_values[0], (int, float)):

            # If values are numeric, return the mean
            return np.mean(column_values)
        

    def linhas_mais_proximas(self, vizinhos:int,i:int)->list: # Calculate the HEOM distance for each other row
       
        heom_values = []

        for j in range(len(self.df)):

            if j != i:
                heom_distance = self.HEOM(i, j)# Calculate the HEOM distance
                if len(heom_values) < vizinhos: # If we have less than 'vizinhos' distances, we add it to the heap

                    heapq.heappush(heom_values, (-heom_distance, j))
                else:
                    if -heom_distance > heom_values[0][0]: # If the current distance is smaller than the largest distance in the heap, we replace it
                        heapq.heapreplace(heom_values, (-heom_distance, j))
    
        # Get the rows with the smallest HEOM distance
        closest_rows = [item[1] for item in heom_values]

        return closest_rows
    

    def categorical_to_numerical(self):
        """
        _summary_: converts all categorical features to numerical values

        _conversion_dictionary_:
            Male -> 0
            Female -> 1
            No -> 0
            Yes -> 1
            Disabled -> 0
            Ambulatory -> 1
            Restricted -> 2
            Selfcare -> 3
            Active -> 4
            None -> 0
            Grade I/II -> 1
            Grade III/IV -> 2
            Mild -> 1
            Moderate/Severe -> 2
            Dies -> 0
            Lives -> 1
            
        """
        words = ("Male","Female","No","Yes","Disabled","Ambulatory",
                 "Restricted","Selfcare","Active","None","Grade I/II",
                 "Grade III/IV","Mild","Moderate/Severe","Dies","Lives")
        values = (0,1,0,1,0,1,2,3,4,0,1,2,1,2,0,1)
        self.df.replace(words, values, inplace=True)
        return self.df

    def polirTabela(self):
        self.outilers


    @classmethod #este classmethod funciona como um construtor alternativo e construir um dataframe a partir de um arquivo cs

    def builderData(cls, df, missing_values): 
        try:
            if not isinstance(df, pd.DataFrame):# Handle DataFrame input directly
                df = pd.read_csv(df)
            df = df.copy()# Avoid modifying the original DataFrame
            return cls(df, missing_values)
        except (FileNotFoundError, pd.errors.ParserError):
            # Handle potential errors: file not found or parsing errors
            print(f"Erro: Não conseguiu ler a data de {df}.")
            raise

# <font color = darkblue> 3. Estatísticas descritivas básicas

### <font color = blue> 3.1. Média

In [177]:
data = Dataset.builderData("hcc_dataset.csv", "?")
data = data.df
data.to_csv("hcc_dataset.csv", index=False)
tabela = data.mean(numeric_only=True).to_frame("Média")
display(tabela)

Unnamed: 0,Média
Age,64.690909
Grams_day,71.008547
Packs_year,20.464286
INR,1.421851
AFP,19299.951146
Hemoglobin,12.879012
MCV,95.119753
Leucocytes,1473.961549
Platelets,113206.442654
Albumin,3.445535


### <font color = blue> 3.2. Mediana

In [176]:
data = Dataset.builderData("hcc_dataset.csv", "?")
data = data.df
data.to_csv("hcc_dataset.csv", index=False)
tabela = data.median(numeric_only=True).to_frame("Mediana")
display(tabela)

Unnamed: 0,Mediana
Age,66.0
Grams_day,75.0
Packs_year,0.0
INR,1.3
AFP,33.0
Hemoglobin,13.05
MCV,94.95
Leucocytes,7.2
Platelets,93000.0
Albumin,3.4


### <font color = blue> 3.3. Desvio Padrão

In [26]:
data = Dataset.builderData("hcc_dataset.csv", "?")
data = data.df
data.to_csv("hcc_dataset.csv", index=False)
tabela = data.std(numeric_only=True).to_frame("Desvio Padrão")
display(tabela)

Unnamed: 0,Desvio Padrão
Age,13.319534
Grams_day,76.27768
Packs_year,51.56513
INR,0.477816
AFP,149098.335581
Hemoglobin,2.145237
MCV,8.405846
Leucocytes,2909.106006
Platelets,107118.632481
Albumin,0.685132


### <font color = blue> 3.4. Assimetria

In [28]:
data = Dataset.builderData("hcc_dataset.csv", "?")
data = data.df
data.to_csv("hcc_dataset.csv", index=False)
tabela = data.skew(numeric_only=True).to_frame("Assimetria")
display(tabela)

Unnamed: 0,Assimetria
Age,-0.779988
Grams_day,1.986666
Packs_year,7.886234
INR,3.587974
AFP,11.392308
Hemoglobin,-0.44171
MCV,-0.087315
Leucocytes,1.865177
Platelets,0.95093
Albumin,-0.102843


### <font color = blue> 3.5. Curtose

In [178]:
data = Dataset.builderData("hcc_dataset.csv", "?")
data = data.df
data.to_csv("hcc_dataset.csv", index=False)
tabela = data.kurtosis(numeric_only=True).to_frame("Curtose")
display(tabela)

Unnamed: 0,Curtose
Age,0.84619
Grams_day,7.931784
Packs_year,74.241726
INR,18.965445
AFP,136.01856
Hemoglobin,0.402549
MCV,0.971063
Leucocytes,2.43264
Platelets,0.504059
Albumin,-0.699851


# <font color = darkblue> 4. Relatório DataPrep

In [192]:
def criar_e_exibir_relatorio(ficheiro):
    data_file = pd.read_csv(ficheiro, na_values='?')
    data_file.to_csv(ficheiro, index=False)
    return create_report(data_file).show_browser()
criar_e_exibir_relatorio("hcc_dataset.csv")

  0%|          | 0/7332 [00:00<?, ?it/s]

  return func(*(_execute_task(a, cache) for a in args))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({c

Este bloco de código destina-se à criação de um relatório com base na bibloteca ``dataprep``.
É expectável encontrar informações sobre:

* Percentagem de ``missing values``
* As mais ``importantes estatísticas descritivas`` de todas as variáveis
* Distribuições de dados em ``Q-Q Plot's`` e em ``BoxPlot's``
* ``Interações`` entre 2 variáveis
* ``Correlações`` entre variáveis
* entre outros

# <font color = darkblue> 5. Inputação dos missing values

### <font color = blue> 5.1. Identificação visual dos missing values 

In [22]:
data = Dataset.builderData("hcc_dataset.csv", "?")
data.pintarMissingValues()

Unnamed: 0,Gender,Symptoms,Alcohol,HBsAg,HBeAg,HBcAb,HCVAb,Cirrhosis,Endemic,Smoking,Diabetes,Obesity,Hemochro,AHT,CRI,HIV,NASH,Varices,Spleno,PHT,PVT,Metastasis,Hallmark,Age,Grams_day,Packs_year,PS,Encephalopathy,Ascites,INR,AFP,Hemoglobin,MCV,Leucocytes,Platelets,Albumin,Total_Bil,ALT,AST,GGT,ALP,TP,Creatinine,Nodules,Major_Dim,Dir_Bil,Iron,Sat,Ferritin,Class
0,Male,No,Yes,No,No,No,No,Yes,No,Yes,Yes,No,Yes,No,No,No,No,Yes,No,No,No,No,Yes,67,137.0,15.0,Active,,,1.53,95.0,13.7,106.6,4.9,99.0,3.4,2.1,34.0,41.0,183.0,150.0,7.1,0.7,1.0,3.5,0.5,52.5,37.0,856.0,Lives
1,Female,No,No,No,No,No,Yes,Yes,No,No,Yes,No,No,Yes,No,No,No,Yes,No,No,No,No,Yes,62,0.0,,Active,,,,,,,,,,,,,,,,,1.0,1.8,,,,,Lives
2,Male,No,Yes,Yes,No,Yes,No,Yes,No,Yes,No,No,No,Yes,Yes,No,No,No,No,Yes,No,Yes,Yes,78,50.0,50.0,Ambulatory,,Mild,0.96,5.8,8.9,79.8,8.4,472.0,3.3,0.4,58.0,68.0,202.0,109.0,7.0,2.1,5.0,13.0,0.1,28.0,6.0,16.0,Lives
3,Male,Yes,Yes,No,No,No,No,Yes,No,Yes,Yes,No,No,Yes,No,No,No,No,No,No,No,Yes,Yes,77,40.0,30.0,Active,,,0.95,2440.0,13.4,97.1,9.0,279.0,3.7,0.4,16.0,64.0,94.0,174.0,8.1,1.11,2.0,15.7,0.2,,,,Dies
4,Male,Yes,Yes,Yes,No,Yes,No,Yes,No,Yes,No,No,No,Yes,Yes,No,No,No,No,No,No,No,Yes,76,100.0,30.0,Active,,,0.94,49.0,14.3,95.1,6.4,199.0,4.1,0.7,147.0,306.0,173.0,109.0,6.9,1.8,1.0,9.0,,59.0,15.0,22.0,Lives
5,Male,No,Yes,No,,No,No,Yes,No,,No,Yes,No,No,No,No,No,Yes,Yes,Yes,No,No,Yes,75,,,Restricted,,Mild,1.58,110.0,13.4,91.5,5.4,85.0,3.4,3.5,91.0,122.0,242.0,396.0,5.6,0.9,1.0,10.0,1.4,53.0,22.0,111.0,Dies
6,Male,No,No,No,,Yes,Yes,Yes,No,No,Yes,No,,No,No,No,No,No,No,No,No,No,Yes,49,0.0,0.0,Active,,,1.4,138.9,10.4,102.0,3.2,42000.0,2.35,2.72,119.0,183.0,143.0,211.0,7.3,0.8,5.0,2.6,2.19,171.0,126.0,1452.0,Dies
7,Male,Yes,Yes,No,,No,No,Yes,No,Yes,Yes,,No,No,No,No,No,No,Yes,Yes,Yes,No,Yes,61,,20.0,Selfcare,,,1.46,9860.0,10.8,92.0,3.0,58.0,3.1,3.2,79.0,108.0,184.0,300.0,7.1,0.52,2.0,9.0,1.3,42.0,25.0,706.0,Dies
8,Male,Yes,Yes,No,No,No,No,Yes,No,Yes,Yes,No,No,Yes,No,No,No,,Yes,Yes,No,No,Yes,50,100.0,32.0,Restricted,,Mild,3.14,8.8,11.9,107.5,4.9,70.0,1.9,3.3,26.0,59.0,115.0,63.0,6.1,0.59,1.0,6.4,1.2,85.0,73.0,982.0,Lives
9,Male,Yes,Yes,No,No,No,No,Yes,No,No,No,No,Yes,No,No,No,No,No,Yes,No,No,No,No,43,100.0,0.0,Active,,,1.12,1.8,11.8,87.8,5100.0,193000.0,4.2,0.5,71.0,45.0,256.0,303.0,7.1,0.59,1.0,9.3,0.7,,,,Lives


### <font color = blue> 5.2. Heterogeneous Euclidean-Overlap Metric 

In [9]:
data = Dataset.builderData("hcc_dataset.csv", "?")
data.tabelaHEOM()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,155,156,157,158,159,160,161,162,163,164
0,X,X,X,X,X,X,X,X,X,X,...,X,X,X,X,X,X,X,X,X,X
1,5.299683,X,X,X,X,X,X,X,X,X,...,X,X,X,X,X,X,X,X,X,X
2,4.048287,5.915786,X,X,X,X,X,X,X,X,...,X,X,X,X,X,X,X,X,X,X
3,3.245281,5.343068,3.558409,X,X,X,X,X,X,X,...,X,X,X,X,X,X,X,X,X,X
4,3.683806,5.495465,2.698323,3.241517,X,X,X,X,X,X,...,X,X,X,X,X,X,X,X,X,X
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,4.379363,5.721385,4.636164,4.516116,4.314898,4.862136,4.360479,4.826378,4.910478,4.269307,...,5.068029,4.228015,4.727487,5.03952,5.009368,X,X,X,X,X
161,4.502504,5.657485,4.37538,4.19943,4.315775,4.268535,4.232149,4.26435,4.084648,4.218532,...,4.422596,4.287435,4.180138,4.412553,4.134199,4.853423,X,X,X,X
162,3.888074,5.771767,4.132832,3.896931,3.986662,4.441792,4.566333,4.305891,4.099607,4.280652,...,4.667334,4.785305,4.131191,4.281219,4.880023,4.808943,5.022344,X,X,X
163,4.773466,6.190742,4.322748,4.499781,4.572141,4.776376,4.668574,4.648151,4.933664,4.837719,...,4.583438,4.330263,4.759901,4.999361,4.448485,4.568358,4.938729,4.901204,X,X


### <font color = blue> 5.3 Inputação dos missing values por HEOM

In [25]:
data = Dataset.builderData("hcc_dataset.csv", "?")
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
data.tratamentoMissingValues()

Unnamed: 0,Gender,Symptoms,Alcohol,HBsAg,HBeAg,HBcAb,HCVAb,Cirrhosis,Endemic,Smoking,Diabetes,Obesity,Hemochro,AHT,CRI,HIV,NASH,Varices,Spleno,PHT,PVT,Metastasis,Hallmark,Age,Grams_day,Packs_year,PS,Encephalopathy,Ascites,INR,AFP,Hemoglobin,MCV,Leucocytes,Platelets,Albumin,Total_Bil,ALT,AST,GGT,ALP,TP,Creatinine,Nodules,Major_Dim,Dir_Bil,Class
0,Male,No,Yes,No,No,No,No,Yes,No,Yes,Yes,No,Yes,No,No,No,No,Yes,No,No,No,No,Yes,67,137.0,15.0,Active,,,1.53,95.0,13.7,106.6,4.9,99.0,3.4,2.1,34.0,41.0,183.0,150.0,7.1,0.7,1.0,3.5,0.5,Lives
1,Female,No,No,No,No,No,Yes,Yes,No,No,Yes,No,No,Yes,No,No,No,Yes,No,No,No,No,Yes,62,0.0,0.0,Active,,,1.566667,206.9,10.933333,95.166667,1770.566667,55667.236667,3.44,1.633333,65.333333,67.666667,286.666667,290.333333,6.633333,0.696667,1.0,1.8,0.75,Lives
2,Male,No,Yes,Yes,No,Yes,No,Yes,No,Yes,No,No,No,Yes,Yes,No,No,No,No,Yes,No,Yes,Yes,78,50.0,50.0,Ambulatory,,Mild,0.96,5.8,8.9,79.8,8.4,472.0,3.3,0.4,58.0,68.0,202.0,109.0,7.0,2.1,5.0,13.0,0.1,Lives
3,Male,Yes,Yes,No,No,No,No,Yes,No,Yes,Yes,No,No,Yes,No,No,No,No,No,No,No,Yes,Yes,77,40.0,30.0,Active,,,0.95,2440.0,13.4,97.1,9.0,279.0,3.7,0.4,16.0,64.0,94.0,174.0,8.1,1.11,2.0,15.7,0.2,Dies
4,Male,Yes,Yes,Yes,No,Yes,No,Yes,No,Yes,No,No,No,Yes,Yes,No,No,No,No,No,No,No,Yes,76,100.0,30.0,Active,,,0.94,49.0,14.3,95.1,6.4,199.0,4.1,0.7,147.0,306.0,173.0,109.0,6.9,1.8,1.0,9.0,0.15,Lives
5,Male,No,Yes,No,No,No,No,Yes,No,No,No,Yes,No,No,No,No,No,Yes,Yes,Yes,No,No,Yes,75,86.666667,33.75,Restricted,,Mild,1.58,110.0,13.4,91.5,5.4,85.0,3.4,3.5,91.0,122.0,242.0,396.0,5.6,0.9,1.0,10.0,1.4,Dies
6,Male,No,No,No,No,Yes,Yes,Yes,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,Yes,49,0.0,0.0,Active,,,1.4,138.9,10.4,102.0,3.2,42000.0,2.35,2.72,119.0,183.0,143.0,211.0,7.3,0.8,5.0,2.6,2.19,Dies
7,Male,Yes,Yes,No,No,No,No,Yes,No,Yes,Yes,No,No,No,No,No,No,No,Yes,Yes,Yes,No,Yes,61,50.0,20.0,Selfcare,,,1.46,9860.0,10.8,92.0,3.0,58.0,3.1,3.2,79.0,108.0,184.0,300.0,7.1,0.52,2.0,9.0,1.3,Dies
8,Male,Yes,Yes,No,No,No,No,Yes,No,Yes,Yes,No,No,Yes,No,No,No,Yes,Yes,Yes,No,No,Yes,50,100.0,32.0,Restricted,,Mild,3.14,8.8,11.9,107.5,4.9,70.0,1.9,3.3,26.0,59.0,115.0,63.0,6.1,0.59,1.0,6.4,1.2,Lives
9,Male,Yes,Yes,No,No,No,No,Yes,No,No,No,No,Yes,No,No,No,No,No,Yes,No,No,No,No,43,100.0,0.0,Active,,,1.12,1.8,11.8,87.8,5100.0,193000.0,4.2,0.5,71.0,45.0,256.0,303.0,7.1,0.59,1.0,9.3,0.7,Lives


# <font color = darkblue> 6. Ajuste dos outliers

### <font color = blue> 6.1.Identificação visual dos outliers

In [6]:
data = Dataset.builderData("hcc_dataset.csv", "?")
display(data.outliers('style'))

Unnamed: 0,Gender,Symptoms,Alcohol,HBsAg,HBeAg,HBcAb,HCVAb,Cirrhosis,Endemic,Smoking,Diabetes,Obesity,Hemochro,AHT,CRI,HIV,NASH,Varices,Spleno,PHT,PVT,Metastasis,Hallmark,Age,Grams_day,Packs_year,PS,Encephalopathy,Ascites,INR,AFP,Hemoglobin,MCV,Leucocytes,Platelets,Albumin,Total_Bil,ALT,AST,GGT,ALP,TP,Creatinine,Nodules,Major_Dim,Dir_Bil,Iron,Sat,Ferritin,Class
0,Male,No,Yes,No,No,No,No,Yes,No,Yes,Yes,No,Yes,No,No,No,No,Yes,No,No,No,No,Yes,67,137.0,15.0,Active,,,1.53,95.0,13.7,106.6,4.9,99.0,3.4,2.1,34.0,41.0,183.0,150.0,7.1,0.7,1.0,3.5,0.5,52.5,37.0,856.0,Lives
1,Female,No,No,No,No,No,Yes,Yes,No,No,Yes,No,No,Yes,No,No,No,Yes,No,No,No,No,Yes,62,0.0,,Active,,,,,,,,,,,,,,,,,1.0,1.8,,,,,Lives
2,Male,No,Yes,Yes,No,Yes,No,Yes,No,Yes,No,No,No,Yes,Yes,No,No,No,No,Yes,No,Yes,Yes,78,50.0,50.0,Ambulatory,,Mild,0.96,5.8,8.9,79.8,8.4,472.0,3.3,0.4,58.0,68.0,202.0,109.0,7.0,2.1,5.0,13.0,0.1,28.0,6.0,16.0,Lives
3,Male,Yes,Yes,No,No,No,No,Yes,No,Yes,Yes,No,No,Yes,No,No,No,No,No,No,No,Yes,Yes,77,40.0,30.0,Active,,,0.95,2440.0,13.4,97.1,9.0,279.0,3.7,0.4,16.0,64.0,94.0,174.0,8.1,1.11,2.0,15.7,0.2,,,,Dies
4,Male,Yes,Yes,Yes,No,Yes,No,Yes,No,Yes,No,No,No,Yes,Yes,No,No,No,No,No,No,No,Yes,76,100.0,30.0,Active,,,0.94,49.0,14.3,95.1,6.4,199.0,4.1,0.7,147.0,306.0,173.0,109.0,6.9,1.8,1.0,9.0,,59.0,15.0,22.0,Lives
5,Male,No,Yes,No,,No,No,Yes,No,,No,Yes,No,No,No,No,No,Yes,Yes,Yes,No,No,Yes,75,,,Restricted,,Mild,1.58,110.0,13.4,91.5,5.4,85.0,3.4,3.5,91.0,122.0,242.0,396.0,5.6,0.9,1.0,10.0,1.4,53.0,22.0,111.0,Dies
6,Male,No,No,No,,Yes,Yes,Yes,No,No,Yes,No,,No,No,No,No,No,No,No,No,No,Yes,49,0.0,0.0,Active,,,1.4,138.9,10.4,102.0,3.2,42000.0,2.35,2.72,119.0,183.0,143.0,211.0,7.3,0.8,5.0,2.6,2.19,171.0,126.0,1452.0,Dies
7,Male,Yes,Yes,No,,No,No,Yes,No,Yes,Yes,,No,No,No,No,No,No,Yes,Yes,Yes,No,Yes,61,,20.0,Selfcare,,,1.46,9860.0,10.8,92.0,3.0,58.0,3.1,3.2,79.0,108.0,184.0,300.0,7.1,0.52,2.0,9.0,1.3,42.0,25.0,706.0,Dies
8,Male,Yes,Yes,No,No,No,No,Yes,No,Yes,Yes,No,No,Yes,No,No,No,,Yes,Yes,No,No,Yes,50,100.0,32.0,Restricted,,Mild,3.14,8.8,11.9,107.5,4.9,70.0,1.9,3.3,26.0,59.0,115.0,63.0,6.1,0.59,1.0,6.4,1.2,85.0,73.0,982.0,Lives
9,Male,Yes,Yes,No,No,No,No,Yes,No,No,No,No,Yes,No,No,No,No,No,Yes,No,No,No,No,43,100.0,0.0,Active,,,1.12,1.8,11.8,87.8,5100.0,193000.0,4.2,0.5,71.0,45.0,256.0,303.0,7.1,0.59,1.0,9.3,0.7,,,,Lives


In [15]:
data = Dataset.builderData("hcc_dataset.csv", "?")
display(data.outliers('index'))

ValueError: could not convert string to float: 'Male'

In [74]:
data = Dataset.builderData("hcc_dataset.csv", "?")
display(data.polirTabela())

Unnamed: 0,Age,Grams_day,Packs_year,INR,AFP,Hemoglobin,MCV,Leucocytes,Platelets,Albumin,Total_Bil,ALT,AST,GGT,ALP,TP,Creatinine,Nodules,Major_Dim,Dir_Bil
0,67,137.0,15.0,1.53,95.0,13.7,106.6,4.9,99.0,3.4,2.1,34.0,41.0,183.0,150.0,7.1,0.7,1.0,3.5,0.5
1,62,0.0,0.0,1.186667,1637.933,14.6,93.466667,8.166667,94713.666667,3.733333,1.9,185.333333,267.333333,153.0,144.0,6.8,0.966667,1.0,1.8,0.966667
2,78,50.0,50.0,0.96,5.8,8.9,79.8,8.4,472.0,3.3,0.4,58.0,68.0,202.0,109.0,7.0,2.1,5.0,13.0,0.1
3,77,40.0,30.0,0.95,2440.0,13.4,97.1,9.0,279.0,3.7,0.4,16.0,64.0,94.0,174.0,8.1,1.11,2.0,15.7,0.2
4,76,100.0,30.0,0.94,49.0,14.3,95.1,6.4,199.0,4.1,0.7,147.0,306.0,173.0,109.0,6.9,1.8,1.0,9.0,0.35
5,75,30.0,29.166667,1.58,110.0,13.4,91.5,5.4,85.0,3.4,3.5,91.0,122.0,242.0,396.0,5.6,0.9,1.0,10.0,1.4
6,49,0.0,0.0,1.4,138.9,10.4,102.0,3.2,42000.0,2.35,2.72,119.0,183.0,143.0,211.0,7.3,0.8,5.0,2.6,2.19
7,61,53.333333,20.0,1.46,9860.0,10.8,92.0,3.0,58.0,3.1,3.2,79.0,108.0,184.0,300.0,7.1,0.52,2.0,9.0,1.3
8,50,100.0,32.0,3.14,8.8,11.9,107.5,4.9,70.0,1.9,3.3,26.0,59.0,115.0,63.0,6.1,0.59,1.0,6.4,1.2
9,43,100.0,0.0,1.12,1.8,11.8,87.8,5100.0,193000.0,4.2,0.5,71.0,45.0,256.0,303.0,7.1,0.59,1.0,9.3,0.7


In [13]:
data = Dataset.builderData("hcc_dataset.csv", "?")


Unnamed: 0,Age,Grams_day,Packs_year,INR,AFP,Hemoglobin,MCV,Leucocytes,Platelets,Albumin,Total_Bil,ALT,AST,GGT,ALP,TP,Creatinine,Nodules,Major_Dim,Dir_Bil,Iron,Sat,Ferritin
0,67,137.0,15.0,1.53,95.0,13.7,106.6,4.9,99.0,3.4,2.1,34.0,41.0,183.0,150.0,7.1,0.7,1.0,3.5,0.5,,,
1,62,0.0,,,,,,,,,,,,,,,,1.0,1.8,,,,
2,78,50.0,50.0,0.96,5.8,8.9,79.8,8.4,472.0,3.3,0.4,58.0,68.0,202.0,109.0,7.0,2.1,5.0,13.0,0.1,28.0,6.0,16.0
3,77,40.0,30.0,0.95,2440.0,13.4,97.1,9.0,279.0,3.7,0.4,16.0,64.0,94.0,174.0,8.1,1.11,2.0,15.7,0.2,,,
4,76,100.0,30.0,0.94,49.0,14.3,95.1,6.4,199.0,4.1,0.7,147.0,306.0,173.0,109.0,6.9,1.8,1.0,9.0,,59.0,15.0,22.0
5,75,,,1.58,110.0,13.4,91.5,5.4,85.0,3.4,3.5,91.0,122.0,242.0,396.0,5.6,0.9,1.0,10.0,1.4,53.0,22.0,111.0
6,49,0.0,0.0,1.4,138.9,10.4,102.0,3.2,42000.0,2.35,2.72,119.0,183.0,143.0,211.0,7.3,0.8,5.0,2.6,2.19,171.0,126.0,1452.0
7,61,,20.0,1.46,9860.0,10.8,92.0,3.0,58.0,3.1,3.2,79.0,108.0,184.0,300.0,7.1,0.52,2.0,9.0,1.3,42.0,25.0,706.0
8,50,100.0,32.0,3.14,8.8,11.9,107.5,4.9,70.0,1.9,3.3,26.0,59.0,115.0,63.0,6.1,0.59,1.0,6.4,1.2,85.0,73.0,982.0
9,43,100.0,0.0,1.12,1.8,11.8,87.8,5100.0,193000.0,4.2,0.5,71.0,45.0,256.0,303.0,7.1,0.59,1.0,9.3,0.7,,,


# LOGISTIC REGRESSION