<a href="https://colab.research.google.com/github/CrAvila/IA/blob/main/Taller1/IA_Taller_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Procesamiento de Datos

In [1]:
!pip install pandas
!pip install numpy
!pip install matplotlib



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import random
from itertools import combinations

In [3]:
# Load the dataset from a CSV file hosted on Google Drive (needs network access).
data = pd.read_csv('https://drive.google.com/uc?export=download&id=1NYlG6ZYmh-TdHgEuzTz1K-yfFxIoFEUC')

# Predictor columns used by the model. The order matters: get_probability()
# builds its lookup key from the variables in the order they are listed, and
# process_data() appends the 'stroke' column after them.
variables_to_use = [
    'age',
    'avg_glucose_level',
    'bmi',
    'hypertension',
    'heart_disease',
]

In [4]:
# Show the raw dataset exactly as loaded, before any categorization.
display(data)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [5]:
# Describes how each dataset column is interpreted, grouped by kind of variable.
# NOTE(review): only the "quantitative" section is read by the code visible in
# this notebook; "qualitative" and "boolean" appear to be descriptive only.
events = {
    # Numeric columns that get binned into labelled intervals. The categorizers
    # in this notebook test `lower <= value < upper`, so every range is
    # half-open and "categories"[i] is the label for "ranges"[i].
    "quantitative" : {

        "age": {
            "ranges" : [
                (0,18),
                (18,65),
                (65, float('inf'))
            ],

            "categories" : [
                'Child',
                'Adult',
                'Senior'
            ]
        },

        "avg_glucose_level": {
            "ranges" : [
                (0,80),
                (80,120),
                (120, float('inf'))
            ],

            "categories" : [
                'Low',
                'Normal',
                'High'
            ]
        },

        "bmi": {
            "ranges": [
                (0,18.5),
                (18.5, 25),
                (25, float('inf'))
            ],

            # 'Unknown' has no matching range: it is the label process_data()
            # writes for missing values via fillna('Unknown').
            "categories": [
                'Underweight',
                'Normal',
                'Overweight',
                'Unknown'
            ]
        }
    },

    # Categorical columns: the possible values are whatever is observed in `data`.
    "qualitative" :  {
        "gender" : data['gender'].unique().tolist(),
        "work_type": data['work_type'].unique().tolist(),
        "Residence_type" : data['Residence_type'].unique().tolist(),
        "smoking_status" : data['smoking_status'].unique().tolist()
    },

    # Two-valued columns encoded as 0/1.
    "boolean" : {
        "hypertension" : {
            "statuses" : [0,1]
        },

        "heart_disease" : {
            "statuses" : [0,1]
        },

        # The raw column holds "Yes"/"No"; process_data() maps it to 1/0.
        "ever_married" : {
            "statuses" : [0,1]
        } ,

        "stroke" : {
            "statuses" : [0,1]
        }
    }
}

In [6]:
# Function to process original data
def process_data(df, variables_to_use, event_definitions=None):
    """Categorize the quantitative columns of ``df`` and keep only the model columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the work is done on a copy. It must
        contain every column in ``variables_to_use`` plus ``'stroke'``.
    variables_to_use : iterable of str
        Predictor columns to keep, in the order they should appear.
    event_definitions : dict, optional
        Configuration with a ``'quantitative'`` entry mapping column name to
        ``{'ranges': [(lower, upper), ...], 'categories': [label, ...]}``.
        Defaults to the notebook-level ``events`` dictionary.

    Returns
    -------
    pandas.DataFrame
        Columns ``variables_to_use + ['stroke']`` with numeric quantitative
        values replaced by their category label, ``ever_married`` mapped from
        "No"/"Yes" to 0/1, and every missing value replaced by ``'Unknown'``.

    Raises
    ------
    KeyError
        If a requested column (or ``'stroke'``) is missing from ``df``.
    """
    if event_definitions is None:
        # Backward-compatible default: the notebook-level configuration.
        event_definitions = events

    processed_dataframe = df.copy()

    def make_categorizer(spec):
        """Return a function mapping one raw value to its category label."""
        ranges = spec["ranges"]
        categories = spec["categories"]

        def categorize(value):
            # Values that are already labels (e.g. a frame processed earlier)
            # or otherwise non-numeric pass through untouched.
            if not isinstance(value, (int, float)):
                return value
            for (lower, upper), category in zip(ranges, categories):
                if lower <= value < upper:
                    return category
            # NaN or out-of-range numbers: left missing here, labelled
            # 'Unknown' by the fillna below.
            return None

        return categorize

    # Classify the quantitative values. Columns that are configured but absent
    # from this frame are skipped instead of raising KeyError.
    for quantitative_column, spec in event_definitions["quantitative"].items():
        if quantitative_column in processed_dataframe.columns:
            processed_dataframe[quantitative_column] = (
                processed_dataframe[quantitative_column].apply(make_categorizer(spec))
            )

    replace_mapping = {
        "No" : 0,
        "Yes" : 1
    }

    if 'ever_married' in processed_dataframe.columns:
        processed_dataframe['ever_married'] = processed_dataframe['ever_married'].replace(replace_mapping)

    new_columns = list(variables_to_use)
    new_columns.append('stroke')

    # Selecting the wanted columns both drops the rest and fixes the order,
    # without consulting any global frame.
    return processed_dataframe[new_columns].fillna('Unknown')

In [7]:
# Categorize the raw data and keep only the selected predictors plus 'stroke'.
processed_data = process_data(data, variables_to_use)

display(processed_data)

Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,stroke
0,Senior,High,Overweight,0,1,1
1,Adult,High,Unknown,0,0,1
2,Senior,Normal,Overweight,0,1,1
3,Adult,High,Overweight,0,0,1
4,Senior,High,Normal,1,0,1
...,...,...,...,...,...,...
5105,Senior,Normal,Unknown,1,0,0
5106,Senior,High,Overweight,0,0,0
5107,Adult,Normal,Overweight,0,0,0
5108,Adult,High,Overweight,0,0,0


In [8]:
def split_data(df, test_percentage):
    """Split ``df`` into test and training sets, stratified on the 'stroke' column.

    Rows with ``stroke == 1`` and rows with ``stroke == 0`` are shuffled
    separately (fixed ``random_state=42``) and the first
    ``int(test_percentage * group_size)`` rows of each group go to the test set.

    Returns ``(test_set, train_set)`` -- note the order. In each returned frame
    the stroke rows come first. Every group is re-indexed from 0 before being
    concatenated, so index labels may repeat across the two groups.
    """
    def shuffle_and_cut(group):
        # Shuffle one class and cut it into (test part, train part).
        shuffled = group.sample(frac=1, random_state=42).reset_index(drop=True)
        cut = int(test_percentage * len(shuffled))
        return shuffled[:cut], shuffled[cut:]

    stroke_test, stroke_train = shuffle_and_cut(df[df['stroke'] == 1])
    healthy_test, healthy_train = shuffle_and_cut(df[df['stroke'] == 0])

    final_test_set = pd.concat([stroke_test, healthy_test])
    final_train_set = pd.concat([stroke_train, healthy_train])

    return final_test_set, final_train_set

In [9]:
# split_data returns (test, train) in that order; 0.2 is the fraction held out for testing.
test_set, train_set = split_data(processed_data, 0.2)

In [10]:
# Inspect both partitions: the test set first, then the training set.
display(test_set)
display(train_set)

Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,stroke
0,Senior,High,Overweight,0,0,1
1,Senior,Low,Overweight,1,1,1
2,Adult,High,Overweight,0,1,1
3,Adult,High,Overweight,0,0,1
4,Senior,Normal,Unknown,0,0,1
...,...,...,...,...,...,...
967,Adult,Normal,Overweight,0,0,0
968,Child,Normal,Normal,0,0,0
969,Adult,Normal,Underweight,0,0,0
970,Adult,High,Overweight,0,0,0


Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,stroke
49,Adult,Normal,Overweight,0,0,1
50,Senior,High,Overweight,1,0,1
51,Adult,Normal,Overweight,0,0,1
52,Adult,Low,Normal,0,0,1
53,Senior,Normal,Overweight,0,0,1
...,...,...,...,...,...,...
4856,Child,Low,Normal,0,0,0
4857,Adult,Normal,Overweight,1,0,0
4858,Adult,High,Overweight,0,0,0
4859,Child,Normal,Underweight,0,0,0


## Construcción del Modelo



In [11]:
from pandas.io.formats.info import DataFrameInfo
def build_tree(data):
    """Build the count tree and the conditional-probability tree of ``data``.

    The columns of ``data`` are taken in order as the levels of the tree. For
    every level ``i`` and every combination of observed values of columns
    ``0..i`` the result stores, under the key
    ``((col_0, val_0), ..., (col_i, val_i))``:

    * in the count tree, the number of rows matching the whole combination;
    * in the probability tree, that count divided by the number of rows
      matching the first ``i`` conditions, i.e. the probability of the last
      condition given the previous ones (0.0 when no row matches the prefix).

    Combinations with zero matching rows are still stored, so every
    combination of observed values can be looked up.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all categorical (or already categorized).

    Returns
    -------
    (dict, dict)
        ``(count_dict, prob_dict)``, both shaped ``{level: {conditions: value}}``.
    """
    count_dict = {}
    prob_dict = {}

    categories = list(data.columns)
    unique_values_dict = {
        column: data[column].unique().tolist() for column in categories
    }

    def calculate_probabilities(df, conditions=(), index=0):
        # `df` already satisfies `conditions`; expand the next level under it.
        if index == len(categories):  # Stop when we reach the last category
            return

        current_category = categories[index]
        # Guard against dividing by zero on branches no row reaches.
        parent_size = len(df) or 1

        for value in unique_values_dict[current_category]:
            new_conditions = conditions + ((current_category, value),)

            # Only the newest condition needs applying: the ancestors'
            # conditions were applied when `df` was produced.
            filtered_df = df[df[current_category] == value]
            count = len(filtered_df)

            prob_dict.setdefault(index, {})[new_conditions] = count / parent_size
            count_dict.setdefault(index, {})[new_conditions] = count

            calculate_probabilities(filtered_df, new_conditions, index + 1)

    calculate_probabilities(data)

    return count_dict, prob_dict

In [12]:
# Build the trees from the training set; build_tree returns (counts, probabilities) in that order.
count_tree, probability_tree = build_tree(train_set)

In [13]:
def format_condition(condition, type):
    """Render a tuple of ``(column, value)`` pairs as a short probability label.

    Each pair is abbreviated to two characters: the capitalized first character
    of the column name followed by the first character of the value, so
    ``('age', 'Adult')`` becomes ``"AA"``.

    With ``type == "conditional"`` the last pair is the event and the previous
    ones are the givens (``"BN | AA ^ AH"``); a single pair is returned bare.
    Any other ``type`` joins every pair as an intersection (``"AA ^ AH ^ BN"``).
    """
    tags = []
    for col, val in condition:
        tags.append(str(col)[0].capitalize() + str(val)[0])

    if type != "conditional":
        return " ^ ".join(tags)

    if len(tags) <= 1:
        return tags[0]

    *given, event = tags
    return f"{event} | {' ^ '.join(given)}"

def print_dict_as_tree(probs, counts, total=None):
    """Print a probability tree level by level.

    ``probs`` and ``counts`` are walked in parallel and must share structure
    and key order (as the two trees returned by ``build_tree`` do). For every
    leaf, one line is printed with the conditional probability stored in
    ``probs`` and the joint probability computed as ``count / total``.

    Parameters
    ----------
    probs : dict
        ``{level: {conditions: probability}}`` or one of its inner dicts.
    counts : dict
        Same structure, holding row counts.
    total : int, optional
        Denominator for the joint probabilities (the size of the data set the
        trees were built from). Defaults to ``len(train_set)``, the
        notebook-level training set.
    """
    if total is None:
        # Backward-compatible default: the notebook-level training set.
        total = len(train_set)

    for (key, value), (_, count_value) in zip(probs.items(), counts.items()):
        if isinstance(key, tuple):
            formatted_key = format_condition(key, "conditional")
        else:
            # Top-level keys are the integer tree levels.
            formatted_key = f"\nTree Height: {key}\n"

        if isinstance(value, dict):
            print(formatted_key)
            # Forward `total` so nested levels use the same denominator.
            print_dict_as_tree(value, count_value, total)
        else:
            p = count_value / total
            formatted_intersection = format_condition(key, "intersection")
            conditional_string = f"P({formatted_key}) = {'{0:.10f}'.format(value)}"
            intersection_string = f"P({formatted_intersection}) = {'{0:.10f}'.format(p)}"
            print(f"{conditional_string} -> {intersection_string}")

# Print every level of the tree: conditional probability -> joint probability.
print_dict_as_tree(probability_tree, count_tree)


Tree Height: 0

P(AA) = 0.6255808266 -> P(AA) = 0.6255808266
P(AS) = 0.2029836146 -> P(AS) = 0.2029836146
P(AC) = 0.1714355588 -> P(AC) = 0.1714355588

Tree Height: 1

P(AN | AA) = 0.4933541830 -> P(AA ^ AN) = 0.3086329176
P(AH | AA) = 0.1993745113 -> P(AA ^ AH) = 0.1247248716
P(AL | AA) = 0.3072713057 -> P(AA ^ AL) = 0.1922230374
P(AN | AS) = 0.4096385542 -> P(AS ^ AN) = 0.0831499144
P(AH | AS) = 0.3301204819 -> P(AS ^ AH) = 0.0670090487
P(AL | AS) = 0.2602409639 -> P(AS ^ AL) = 0.0528246515
P(AN | AC) = 0.5278174037 -> P(AC ^ AN) = 0.0904866716
P(AH | AC) = 0.1540656205 -> P(AC ^ AH) = 0.0264123258
P(AL | AC) = 0.3181169757 -> P(AC ^ AL) = 0.0545365615

Tree Height: 2

P(BO | AA ^ AN) = 0.7329635499 -> P(AA ^ AN ^ BO) = 0.2262166789
P(BN | AA ^ AN) = 0.2297939778 -> P(AA ^ AN ^ BN) = 0.0709219858
P(BU | AA ^ AN) = 0.0309033281 -> P(AA ^ AN ^ BU) = 0.0095377843
P(BU | AA ^ AN) = 0.0063391442 -> P(AA ^ AN ^ BU) = 0.0019564686
P(BO | AA ^ AH) = 0.7666666667 -> P(AA ^ AH ^ BO) = 0.09562

## Evaluación del Modelo

Ahora que ya tenemos las probabilidades de todas las ramas posibles y todas las intersecciones posibles, nos interesa concretamente el nivel de altura $5$ del árbol de probabilidades, ya que ahí se encuentra la probabilidad de tener un accidente cerebrovascular dadas ya ciertas combinaciones de variables. Ahora, hay que obtener estas probabilidades introduciendo únicamente los valores de las variables a evaluar.

Al ingresar los datos de la siguiente manera, obtenemos la probabilidad condicional correspondiente a las variables de entrada:

In [14]:
# Direct lookup of the stored probability of stroke = 1 for one combination.
try:
    print(probability_tree[5][(
    ('age', 'Child'),
    ('avg_glucose_level', 'Low'),
    ('bmi', 'Overweight'),
    ('hypertension', 0),
    ('heart_disease', 0),
    ('stroke', 1)
    )])
except KeyError:
    # The key only exists when the tree was built with exactly these variables
    # in this order; any other error should surface instead of being hidden.
    print('Other variables used.')

0.023809523809523808


Por lo que podemos construir una función que retorne directamente los resultados al ingresar las variables:

In [15]:
def get_probability(model, variables, input_data, events):
    """Return the stored probability of ``stroke == 1`` for one input record.

    Parameters
    ----------
    model : dict
        Probability tree shaped ``{level: {conditions: probability}}`` whose
        level ``len(variables)`` is keyed by
        ``((var_0, val_0), ..., ('stroke', 1))``.
    variables : sequence of str
        Variable names, in the same order used to build ``model``.
    input_data : sequence
        One value per variable. Strings are taken as already-categorized
        labels; numeric values of quantitative variables are binned using
        ``events``; everything else is used as-is.
    events : dict
        Configuration with a ``'quantitative'`` entry mapping variable name to
        ``{'ranges': [(lower, upper), ...], 'categories': [label, ...]}``.

    Returns
    -------
    float
        The probability stored in the tree for the categorized input.

    Raises
    ------
    KeyError
        If the categorized combination is not present in ``model``.
    """
    quantitative = events['quantitative']

    # Categorize the quantitative variables
    def get_from_tree(variable, value):
        if isinstance(value, str) or variable not in quantitative:
            return value
        if value is not None:
            spec = quantitative[variable]
            for (lower, upper), category in zip(spec['ranges'], spec['categories']):
                if lower <= value < upper:
                    return category
        # Missing (None/NaN) or out-of-range measurement: use the same label
        # the training data gets for missing values, instead of a raw number
        # that can never match a key of the tree.
        return 'Unknown'

    # Create the tuple of tuples for the input
    input_tuple = tuple(
        (variable, get_from_tree(variable, value))
        for variable, value in zip(variables, input_data)
    )

    # Specific for this model: the last level of the tree is the 'stroke' outcome
    key = input_tuple + (('stroke', 1),)

    # Extract probability
    return model[len(variables)][key]


In [16]:
# Same lookup as before, now starting from raw (uncategorized) input values.
try:
    print(get_probability(probability_tree, variables_to_use, [10,70,30, 0, 0], events))
except KeyError:
    # Raised when the combination is not a key of the tree, e.g. because the
    # tree was built with other variables; other errors are not hidden.
    print("Other variables used.")

0.023809523809523808


De esta manera obtenemos la probabilidad según las variables de entrada (que pueden ser cualquiera de las que estén definidas en la lista de variables a utilizar y en el mismo orden), ya que si son numéricas se categorizan y luego se extrae la probabilidad del árbol. Sin embargo, lo que nos interesa es llegar a categorizar este resultado, ya que queremos obtener una respuesta cerrada que indique si la persona con dichos datos de entrada está en riesgo o no.

In [17]:
def get_stroke_risk(model, variables, input_data, threshold=0.1):
    """Turn the model's stroke probability into a 0/1 risk flag.

    The record is flagged as at risk (1) when the probability exceeds
    ``threshold`` or when a uniform random draw in [0, 1) falls below the
    probability. The result is therefore stochastic for records whose
    probability is above 0 and at or below ``threshold``.

    Parameters
    ----------
    model, variables, input_data
        Forwarded to ``get_probability`` together with the notebook-level
        ``events`` configuration.
    threshold : float, optional
        Probability above which the record is always flagged. Defaults to 0.1.

    Returns
    -------
    int
        1 if the record is flagged as at risk, 0 otherwise.
    """
    probability = get_probability(model, variables, input_data, events)
    # One draw is always taken, even when the threshold alone decides, so the
    # random sequence consumed per call does not depend on the input.
    rgn = random()

    if rgn < probability or probability > threshold:
        return 1
    return 0

In [18]:
# Risk flag for one raw record; the result can vary between runs (random draw).
try:
    print(get_stroke_risk(probability_tree, variables_to_use, [10,70,30, 0, 0]))
except KeyError:
    # Raised when the combination is not a key of the tree, e.g. because the
    # tree was built with other variables; other errors are not hidden.
    print("Other variables used.")

0


Ahora, debemos hacer esto con todas las entradas del conjunto de pruebas y evaluar la exactitud del modelo.

In [19]:
def evaluate_model(model, variables, test_set):
    """Score the model on ``test_set`` and return three accuracies in percent.

    Every row is classified with ``get_stroke_risk`` and compared with its
    ``'stroke'`` value.

    Parameters
    ----------
    model : dict
        Probability tree passed through to ``get_stroke_risk``.
    variables : sequence of str
        Columns of ``test_set`` used as model input, in model order.
    test_set : pandas.DataFrame
        Rows to evaluate; must contain ``variables`` and ``'stroke'``.

    Returns
    -------
    (float, float, float)
        ``(general_accuracy, stroke_accuracy, no_stroke_accuracy)``: the
        percentage of correct predictions over all rows, over the rows with
        ``stroke == 1`` and over the rows with ``stroke == 0``. A group with
        no rows yields 0.0 instead of raising ZeroDivisionError.
    """
    def percentage(hits, total):
        # An empty group has no defined accuracy; report 0.0 rather than crash.
        return hits / total * 100 if total else 0.0

    correct_predictions = 0
    correct_stroke_predictions = 0
    correct_no_stroke_predictions = 0

    for _, row in test_set.iterrows():

        stroke_risk = get_stroke_risk(model, variables, [row[var] for var in variables])
        real_value = row['stroke']

        if real_value == stroke_risk:
            correct_predictions += 1

            if real_value == 1:
                correct_stroke_predictions += 1
            else:
                correct_no_stroke_predictions += 1

    total_rows = test_set.shape[0]
    stroke_rows = test_set[test_set["stroke"] == 1].shape[0]
    no_stroke_rows = test_set[test_set["stroke"] == 0].shape[0]

    general_accuracy = percentage(correct_predictions, total_rows)
    stroke_accuracy = percentage(correct_stroke_predictions, stroke_rows)
    no_stroke_accuracy = percentage(correct_no_stroke_predictions, no_stroke_rows)

    return general_accuracy, stroke_accuracy, no_stroke_accuracy

Ahora podemos comprobar la exactitud del modelo en una evaluación del conjunto de pruebas:

In [20]:
# Single evaluation pass. get_stroke_risk draws a random number per row, so the
# figures can change from one run to the next.
evaluate_model(probability_tree, variables_to_use, test_set)

(85.1126346718903, 59.183673469387756, 86.41975308641975)

Y realizar más pruebas para ver en promedio cual es la exactitud del modelo:

In [21]:
def multiple_tests(model, n, test_set, variables):
    """Run ``evaluate_model`` ``n`` times and average each accuracy.

    Returns ``(general, stroke, no_stroke)``: the arithmetic means, over the
    ``n`` runs, of the three percentages returned by ``evaluate_model``.
    ``n`` must be at least 1.
    """
    general_runs = []
    stroke_runs = []
    no_stroke_runs = []

    for _ in range(n):
        general, stroke, no_stroke = evaluate_model(model, variables, test_set)
        general_runs.append(general)
        stroke_runs.append(stroke)
        no_stroke_runs.append(no_stroke)

    def mean(values):
        # Plain average of one accuracy series.
        return sum(values) / len(values)

    return mean(general_runs), mean(stroke_runs), mean(no_stroke_runs)


# Average the three accuracies over 100 evaluations of the same test set.
multiple_tests(probability_tree, 100, test_set, variables_to_use)

(85.18021547502448, 55.20408163265312, 86.69135802469134)

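Aquí podemos observar el resultado en el formato (exactitud general, exactitud stroke, exactitud no stroke) y observamos que se tiene una buena exactitud general. Sin embargo, la exactitud sobre los casos con stroke (≈55 %) es considerablemente menor que sobre los casos sin stroke (≈87 %), por lo que a continuación buscamos la combinación de variables que maximice la exactitud sobre los casos con stroke.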

In [22]:
def build_model(variables, train_set):
    """Build a probability tree restricted to ``variables``.

    ``train_set`` is passed through ``process_data`` with ``variables`` and the
    result is handed to ``build_tree``. Only the probability tree is returned;
    the count tree is discarded.
    """
    prepared = process_data(train_set, variables)
    _, tree = build_tree(prepared)
    return tree

In [23]:
def get_optimal_variables():
    """Search every 5-variable combination and print the best one.

    Candidates are all columns of the notebook-level ``data`` except ``'id'``
    and the target ``'stroke'``. For each combination a model is built on the
    training split and scored once with ``evaluate_model`` on the test split.
    The combination with the highest stroke accuracy is printed together with
    its three accuracies. Nothing is returned.

    The score of each combination comes from a single stochastic evaluation
    (``get_stroke_risk`` uses a random draw), so the winner can vary between
    runs.
    """
    variable_to_predict = "stroke"

    candidates = [item for item in data if item != variable_to_predict and item != 'id']

    # Processing and splitting do not depend on the combination being tried
    # (the split uses a fixed random_state), so do them once instead of once
    # per combination.
    generic_data = process_data(data, candidates)
    test_set, train_set = split_data(generic_data, 0.2)

    max_combination = {
        "combo" : [],
        "accuracy" : 0,
        "stroke_accuracy" : 0,
        "no_stroke_accuracy" : 0
    }

    for comb in combinations(candidates, 5):
        model = build_model(comb, train_set)
        ga, sa, na = evaluate_model(model, comb, test_set)

        # Rank by stroke accuracy; a strict comparison keeps the first winner on ties.
        if sa > max_combination['stroke_accuracy']:
            max_combination['stroke_accuracy'] = sa
            max_combination['accuracy'] = ga
            max_combination['no_stroke_accuracy'] = na
            max_combination['combo'] = comb

    print("Max Combination:")
    for key, value in max_combination.items():
        if key == "combo":
            print(f"{key.capitalize()}: {value}")
        else:
            print(f"{key.replace('_', ' ').capitalize()}: {value:.3f}")

In [24]:
# Prints the 5-variable combination with the highest stroke accuracy.
get_optimal_variables()



Max Combination:
Combo: ('age', 'hypertension', 'heart_disease', 'ever_married', 'bmi')
Accuracy: 79.824
Stroke accuracy: 69.388
No stroke accuracy: 80.350
