<a href="https://colab.research.google.com/github/CrAvila/IA/blob/main/Taller1/IA_Taller_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [230]:
!pip install pandas
!pip install numpy
!pip install matplotlib



In [231]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [232]:
# Feature columns kept for the model; process_data appends the 'stroke' target itself.
variables_to_use = ['age', 'avg_glucose_level', 'bmi', 'hypertension', 'heart_disease']

# Raw dataset, read from a path relative to the current working directory.
# NOTE(review): presumably requires Google Drive to be mounted first — confirm.
data = pd.read_csv('drive/MyDrive/colab/healthcare-dataset-stroke-data.csv')

In [233]:
# Render the raw dataset as loaded, before any processing.
display(data)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [234]:
# Catalogue of the dataset's columns, grouped by kind of variable.
events = {
    # Numeric columns: "ranges" lists (lower, upper) bins and "categories" the
    # label of the bin at the same position (process_data tests lower <= value < upper).
    "quantitative": {
        "age": {
            "ranges": [(0, 18), (18, 65), (65, float('inf'))],
            "categories": ['Child', 'Adult', 'Senior'],
        },
        "avg_glucose_level": {
            "ranges": [(0, 80), (80, 120), (120, float('inf'))],
            "categories": ['Low', 'Normal', 'High'],
        },
        "bmi": {
            "ranges": [(0, 18.5), (18.5, 25), (25, float('inf'))],
            # 'Unknown' has no matching range; it is listed as an extra label only.
            "categories": ['Underweight', 'Normal', 'Overweight', 'Unknown'],
        },
    },

    # Categorical columns: the distinct values actually present in the raw data.
    "qualitative": {
        column: data[column].unique().tolist()
        for column in ('gender', 'work_type', 'Residence_type', 'smoking_status')
    },

    # Two-valued columns: each gets its own [0, 1] status list.
    "boolean": {
        column: {"statuses": [0, 1]}
        for column in ('hypertension', 'heart_disease', 'ever_married', 'stroke')
    },
}

In [235]:
#Function to process original data
def process_data(df, variables_to_use):
    """Discretise the quantitative columns of ``df`` and keep only the model columns.

    Every column named in ``events['quantitative']`` that is present in ``df``
    is replaced by the label of the range its value falls in. Values that match
    no range (NaN included) become missing and are finally reported as
    ``'Unknown'``. ``ever_married`` is recoded from "No"/"Yes" to 0/1 when the
    column exists.

    Args:
        df: Raw dataframe. It is copied, never modified.
        variables_to_use: Names of the feature columns to keep, in output order.

    Returns:
        A new dataframe holding ``variables_to_use`` followed by ``'stroke'``,
        with every missing value replaced by ``'Unknown'``.

    Raises:
        KeyError: If a requested column (or ``'stroke'``) is absent from ``df``.
    """
    processed_dataframe = df.copy()

    # Classify the quantitative values
    for quantitative_column, spec in events['quantitative'].items():
        if quantitative_column not in processed_dataframe.columns:
            # Nothing to discretise for this frame.
            continue

        # Bind the bin specification as defaults so the helper never depends on
        # the loop variable being looked up later.
        def categorize(value, ranges=spec["ranges"], categories=spec["categories"]):
            """Return the label of the first range containing ``value``, else None."""
            for (lower, upper), category in zip(ranges, categories):
                if lower <= value < upper:
                    return category
            return None  # NaN or out-of-range: filled with 'Unknown' below

        # Replace value into the processed dataframe
        processed_dataframe[quantitative_column] = processed_dataframe[quantitative_column].apply(categorize)

    replace_mapping = {
        "No" : 0,
        "Yes" : 1
    }

    if 'ever_married' in processed_dataframe.columns:
        processed_dataframe['ever_married'] = processed_dataframe['ever_married'].replace(replace_mapping)

    new_columns = list(variables_to_use)
    new_columns.append('stroke')

    # Select from the frame being processed (not from a notebook-level global),
    # so the function works on any dataframe passed in.
    processed_dataframe = processed_dataframe[new_columns].fillna('Unknown')

    return processed_dataframe

In [236]:
# Discretise the raw data and keep only the selected features plus the 'stroke' column.
processed_data = process_data(data, variables_to_use)

# Render the processed frame.
display(processed_data)

Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,stroke
0,Senior,High,Overweight,0,1,1
1,Adult,High,Unknown,0,0,1
2,Senior,Normal,Overweight,0,1,1
3,Adult,High,Overweight,0,0,1
4,Senior,High,Normal,1,0,1
...,...,...,...,...,...,...
5105,Senior,Normal,Unknown,1,0,0
5106,Senior,High,Overweight,0,0,0
5107,Adult,Normal,Overweight,0,0,0
5108,Adult,High,Overweight,0,0,0


In [237]:
def split_data(df, test_percentage, random_state=42):
    """Split ``df`` into test and train sets, stratified on the ``'stroke'`` column.

    Rows with ``stroke == 1`` and ``stroke == 0`` are shuffled separately and
    the same fraction of each group is assigned to the test set, so both sets
    keep the class proportions of ``df``.

    Args:
        df: Dataframe containing a ``'stroke'`` column with values 0 and 1.
        test_percentage: Fraction (between 0 and 1) of each group to put in the
            test set. The per-group count is truncated towards zero.
        random_state: Seed used to shuffle both groups; the default keeps the
            split reproducible.

    Returns:
        ``(test_set, train_set)`` — the test set comes first. In both frames the
        stroke rows precede the no-stroke rows. Each group is re-indexed from 0
        before being concatenated, so index labels may repeat across groups.

    Raises:
        ValueError: If ``test_percentage`` is not within [0, 1].
    """
    if not 0 <= test_percentage <= 1:
        raise ValueError(
            f"test_percentage must be between 0 and 1, got {test_percentage!r}"
        )

    test_parts = []
    train_parts = []

    # Stroke group first, then no-stroke group.
    for label in (1, 0):
        # Shuffle the records of this group
        group_shuffled = (
            df[df['stroke'] == label]
            .sample(frac=1, random_state=random_state)
            .reset_index(drop=True)
        )

        # Number of records of this group that go to the test set
        num_test = int(test_percentage * len(group_shuffled))

        test_parts.append(group_shuffled.iloc[:num_test])
        train_parts.append(group_shuffled.iloc[num_test:])

    # Combine the per-group pieces
    final_test_set = pd.concat(test_parts)
    final_train_set = pd.concat(train_parts)

    return final_test_set, final_train_set

In [238]:
# Hold out 20% of each class for testing; split_data returns the test set first.
test_set, train_set = split_data(processed_data, 0.2)

In [239]:
# Render the test set, then the training set.
display(test_set, train_set)

Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,stroke
0,Senior,High,Overweight,0,0,1
1,Senior,Low,Overweight,1,1,1
2,Adult,High,Overweight,0,1,1
3,Adult,High,Overweight,0,0,1
4,Senior,Normal,Unknown,0,0,1
...,...,...,...,...,...,...
967,Adult,Normal,Overweight,0,0,0
968,Child,Normal,Normal,0,0,0
969,Adult,Normal,Underweight,0,0,0
970,Adult,High,Overweight,0,0,0


Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,stroke
49,Adult,Normal,Overweight,0,0,1
50,Senior,High,Overweight,1,0,1
51,Adult,Normal,Overweight,0,0,1
52,Adult,Low,Normal,0,0,1
53,Senior,Normal,Overweight,0,0,1
...,...,...,...,...,...,...
4856,Child,Low,Normal,0,0,0
4857,Adult,Normal,Overweight,1,0,0
4858,Adult,High,Overweight,0,0,0
4859,Child,Normal,Underweight,0,0,0


In [240]:
# Map each column of the processed data to the list of distinct values it contains.
unique_values_dict = dict(
    (column, processed_data[column].unique().tolist())
    for column in processed_data.columns
)




In [241]:
def calculate_probabilities(df, values_dict, conditions=(), index=0, accumulated_prob=1.0, prob_dict=None):
    """Build a probability tree over the columns of ``values_dict``.

    The columns are visited in the key order of ``values_dict``. At level
    ``index`` every value of that column is appended to ``conditions`` and the
    fraction of the rows of ``df`` matching the extended conditions is stored.
    Because each recursive call receives the already filtered frame, the stored
    number is the relative frequency of the value *given* the preceding
    conditions, multiplied by ``accumulated_prob`` (forwarded unchanged to
    deeper levels, so it acts as a constant scale factor).

    Args:
        df: Dataframe the frequencies are measured on.
        values_dict: Mapping ``column -> list of values`` defining the tree levels.
        conditions: Tuple of ``(column, value)`` pairs already fixed by the caller.
        index: Position in ``values_dict`` of the column handled by this call.
        accumulated_prob: Factor applied to every stored frequency.
        prob_dict: Dictionary to fill in place. A fresh one is created when
            omitted, so results never leak between independent calls.

    Returns:
        ``prob_dict``, shaped as ``{level: {conditions_tuple: probability}}``.
    """
    if prob_dict is None:
        prob_dict = {}

    # Derive the level order from the argument instead of a notebook-level global.
    levels = list(values_dict)

    if index >= len(levels):  # Stop when you reach the last category
        return prob_dict

    current_category = levels[index]
    current_values = values_dict[current_category]

    for value in current_values:
        new_conditions = conditions + ((current_category, value),)

        # Apply every condition; for rows already filtered by the caller the
        # earlier conditions are no-ops.
        filtered_df = df
        for column, val in new_conditions:
            filtered_df = filtered_df[filtered_df[column] == val]

        count = len(filtered_df)

        # `or 1` avoids dividing by zero when the incoming frame is empty.
        prob = count / (len(df) or 1)
        branch_prob = accumulated_prob * prob  # Calculate the probability for this branch

        # Store the branch probability
        prob_dict.setdefault(index, {})[new_conditions] = branch_prob

        calculate_probabilities(filtered_df, values_dict, new_conditions, index + 1, accumulated_prob, prob_dict)

    return prob_dict


def print_probabilities(prob_dict):
    """Print a probability tree level by level.

    Args:
        prob_dict: Mapping ``{level: {conditions: probability}}`` where each
            ``conditions`` is an iterable of ``(column, value)`` pairs.

    Each level gets a header line followed by one line per entry, formatted as
    ``"col: val, col: val P: probability"``.
    """
    for level, level_probs in prob_dict.items():
        print(f"Level {level} Probabilities:")
        for conditions, probability in level_probs.items():
            conditions_str = ', '.join(f"{col}: {val}" for col, val in conditions)
            print(f"{conditions_str} P: {probability}")


# Build the probability tree from the training set.
# Tree levels in column order; kept as a notebook-level name for code that looks it up globally.
categories = [*unique_values_dict]
# Filled in place by calculate_probabilities.
prob_dict = dict()
calculate_probabilities(train_set, unique_values_dict, prob_dict=prob_dict)

In [242]:
def format_condition(condition, col_chars=1, val_chars=1):
    """Abbreviate a tuple of ``(column, value)`` pairs into a compact label.

    Each pair becomes the first ``col_chars`` characters of the column name
    (capitalised) followed by the first ``val_chars`` characters of the value's
    string form; pairs are joined with ``'^'``.

    With the default of one character each, distinct pairs can collapse to the
    same label (e.g. two values starting with the same letter); pass larger
    counts to disambiguate. Empty names or values contribute an empty string
    instead of raising.

    Args:
        condition: Iterable of ``(column, value)`` pairs.
        col_chars: Number of leading characters kept from each column name.
        val_chars: Number of leading characters kept from each value.

    Returns:
        The joined label, e.g. ``'AS^BO'``.
    """
    return '^'.join(
        f"{str(col)[:col_chars].capitalize()}{str(val)[:val_chars]}"
        for col, val in condition
    )

def print_dict_as_tree(dictionary):
    """Recursively print a (possibly nested) dictionary.

    Tuple keys are abbreviated with ``format_condition``; any other key is
    shown as ``"Key: <key>"``. A dictionary value prints its key as a header
    and is then printed recursively; any other value prints as ``P(<key>) = <value>``.
    """
    for key in dictionary:
        entry = dictionary[key]
        label = format_condition(key) if isinstance(key, tuple) else f"Key: {key}"

        # Leaf: print the value and move on to the next key.
        if not isinstance(entry, dict):
            print(f"P({label}) = {entry}")
            continue

        # Nested dictionary: header line, then its contents.
        print(label + ":")
        print_dict_as_tree(entry)

# Print the probability tree, one "Key: <level>" section per tree level.
print_dict_as_tree(prob_dict)

Key: 0:
P(AS) = 0.2029836145756909
P(AA) = 0.6255808266079727
P(AC) = 0.17143555881633651
Key: 1:
P(AS^AH) = 0.3301204819277108
P(AS^AN) = 0.40963855421686746
P(AS^AL) = 0.26024096385542167
P(AA^AH) = 0.19937451133698203
P(AA^AN) = 0.49335418295543393
P(AA^AL) = 0.30727130570758404
P(AC^AH) = 0.15406562054208273
P(AC^AN) = 0.5278174037089871
P(AC^AL) = 0.3181169757489301
Key: 2:
P(AS^AH^BO) = 0.8138686131386861
P(AS^AH^BU) = 0.07664233576642336
P(AS^AH^BN) = 0.10218978102189781
P(AS^AH^BU) = 0.0072992700729927005
P(AS^AN^BO) = 0.7147058823529412
P(AS^AN^BU) = 0.07058823529411765
P(AS^AN^BN) = 0.19117647058823528
P(AS^AN^BU) = 0.023529411764705882
P(AS^AL^BO) = 0.7314814814814815
P(AS^AL^BU) = 0.08333333333333333
P(AS^AL^BN) = 0.17592592592592593
P(AS^AL^BU) = 0.009259259259259259
P(AA^AH^BO) = 0.7666666666666667
P(AA^AH^BU) = 0.056862745098039215
P(AA^AH^BN) = 0.17058823529411765
P(AA^AH^BU) = 0.0058823529411764705
P(AA^AN^BO) = 0.7329635499207607
P(AA^AN^BU) = 0.030903328050713153
P(A