<a href="https://colab.research.google.com/github/CrAvila/IA/blob/main/Taller1/IA_Taller_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [119]:
!pip install pandas
!pip install numpy
!pip install matplotlib



In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [121]:
# Load the stroke dataset from CSV. The path is relative to the current working
# directory (presumably a Google Drive mount inside Colab — TODO confirm).
data = pd.read_csv('drive/MyDrive/colab/healthcare-dataset-stroke-data.csv')

# Feature columns to keep after processing. The 'stroke' target is not listed
# here because process_data appends it to the selection itself.
variables_to_use = [
    'age',
    'avg_glucose_level',
    'bmi',
    'hypertension',
    'heart_disease',
]

In [122]:
# Render the raw dataframe exactly as loaded, before any processing.
display(data)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [123]:
# Catalogue of the possible values ("events") of each dataset column, grouped by
# the kind of variable. process_data reads the "quantitative" section to turn
# numeric values into category labels.
events = {
    # Numeric columns. "ranges" holds (lower, upper) bounds and "categories" the
    # label for the range at the same position. process_data treats every range
    # as half-open: lower <= value < upper.
    "quantitative" : {

        "age": {
            "ranges" : [
                (0,18),
                (18,65),
                (65, float('inf'))
            ],

            "categories" : [
                'Child',
                'Adult',
                'Senior'
            ]
        },

        "avg_glucose_level": {
            "ranges" : [
                (0,80),
                (80,120),
                (120, float('inf'))
            ],

            "categories" : [
                'Low',
                'Normal',
                'High'
            ]
        },

        "bmi": {
            "ranges": [
                (0,18.5),
                (18.5, 25),
                (25, float('inf'))
            ],

            # 'Unknown' has no matching range: it is the label process_data
            # writes when it fills missing values.
            "categories": [
                'Underweight',
                'Normal',
                'Overweight',
                'Unknown'
            ]
        }
    },

    # Categorical columns: the distinct values observed in `data`, in order of
    # first appearance.
    "qualitative" :  {
        "gender" : data['gender'].unique().tolist(),
        "work_type": data['work_type'].unique().tolist(),
        "Residence_type" : data['Residence_type'].unique().tolist(),
        "smoking_status" : data['smoking_status'].unique().tolist()
    },

    # Binary columns, each listed with the statuses 0 and 1.
    "boolean" : {
        "hypertension" : {
            "statuses" : [0,1]
        },

        "heart_disease" : {
            "statuses" : [0,1]
        },

        # Stored as "No"/"Yes" in the raw data; process_data recodes it to 0/1.
        "ever_married" : {
            "statuses" : [0,1]
        } ,

        "stroke" : {
            "statuses" : [0,1]
        }
    }
}

In [126]:
#Function to process original data
def process_data(df, variables_to_use, event_definitions=None):
    """Discretise the quantitative columns and keep only the selected variables.

    Each quantitative column described in the event definitions (and present in
    ``df``) is replaced by the category whose half-open range
    ``lower <= value < upper`` contains the value. ``ever_married``, when
    present, is recoded from ``"No"``/``"Yes"`` to ``0``/``1``. The result keeps
    ``variables_to_use`` followed by ``stroke``; missing values and values that
    fall in no range become ``'Unknown'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is copied and never modified.
    variables_to_use : list of str
        Feature columns to keep, in the order they should appear.
    event_definitions : dict, optional
        Mapping with a ``"quantitative"`` entry of the form
        ``{column: {"ranges": [(lower, upper), ...], "categories": [...]}}``.
        Defaults to the notebook-level ``events`` dictionary.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``variables_to_use + ['stroke']``.

    Raises
    ------
    KeyError
        If a column in ``variables_to_use`` (or ``stroke``) is not in ``df``.
    """
    if event_definitions is None:
        event_definitions = events

    def make_categorizer(ranges, categories):
        # Bind the ranges/categories of one column explicitly so the returned
        # function does not depend on a loop variable that changes later.
        def categorize(value):
            for (lower, upper), category in zip(ranges, categories):
                if lower <= value < upper:
                    return category
            # NaN and out-of-range values match no range; they are filled with
            # 'Unknown' further down.
            return None

        return categorize

    processed_dataframe = df.copy()

    # Classify the quantitative values
    for column, spec in event_definitions["quantitative"].items():
        if column not in processed_dataframe.columns:
            continue
        categorize = make_categorizer(spec["ranges"], spec["categories"])
        processed_dataframe[column] = processed_dataframe[column].apply(categorize)

    if 'ever_married' in processed_dataframe.columns:
        replace_mapping = {
            "No" : 0,
            "Yes" : 1
        }
        processed_dataframe['ever_married'] = processed_dataframe['ever_married'].replace(replace_mapping)

    # Keep the requested features plus the target, without listing it twice.
    new_columns = list(variables_to_use)
    if 'stroke' not in new_columns:
        new_columns.append('stroke')

    # Selecting by label both drops the unused columns and fixes the order.
    # The columns come from `df` itself, not from the notebook-global `data`.
    processed_dataframe = processed_dataframe[new_columns].fillna('Unknown')

    return processed_dataframe

In [127]:
# Discretise the raw data, keeping only the selected variables plus 'stroke'.
processed_data = process_data(data, variables_to_use)

display(processed_data)

Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,stroke
0,Senior,High,Overweight,0,1,1
1,Adult,High,Unknown,0,0,1
2,Senior,Normal,Overweight,0,1,1
3,Adult,High,Overweight,0,0,1
4,Senior,High,Normal,1,0,1
...,...,...,...,...,...,...
5105,Senior,Normal,Unknown,1,0,0
5106,Senior,High,Overweight,0,0,0
5107,Adult,Normal,Overweight,0,0,0
5108,Adult,High,Overweight,0,0,0


In [132]:
def split_data(df, test_percentage, random_state=42):
    """Split ``df`` into test and training sets, stratified on ``stroke``.

    The rows with ``stroke == 1`` and the rows with ``stroke == 0`` are shuffled
    separately, and the first ``int(test_percentage * group_size)`` rows of each
    group go to the test set. Rows whose ``stroke`` value is neither 0 nor 1 end
    up in neither set.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``stroke`` column.
    test_percentage : float
        Fraction of each group assigned to the test set, between 0 and 1.
    random_state : int, optional
        Seed used to shuffle both groups. Defaults to 42.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(test_set, train_set)`` — note the test set comes first. In each
        frame the stroke rows precede the no-stroke rows. Each group is
        re-indexed from 0 before slicing, so index labels can repeat within a
        returned frame.

    Raises
    ------
    ValueError
        If ``test_percentage`` is not between 0 and 1.
    """
    # A negative fraction would turn into a negative slice bound and silently
    # invert the split; a fraction above 1 would silently empty the train set.
    if not 0 <= test_percentage <= 1:
        raise ValueError(
            f"test_percentage must be between 0 and 1, got {test_percentage!r}"
        )

    test_parts = []
    train_parts = []

    # Stroke group first, then no-stroke, which fixes the row order of the result.
    for stroke_value in (1, 0):
        group = df[df['stroke'] == stroke_value]
        shuffled = group.sample(frac=1, random_state=random_state).reset_index(drop=True)

        # Number of records this group contributes to the test set
        num_test = int(test_percentage * len(shuffled))

        test_parts.append(shuffled.iloc[:num_test])
        train_parts.append(shuffled.iloc[num_test:])

    final_test_set = pd.concat(test_parts)
    final_train_set = pd.concat(train_parts)

    return final_test_set, final_train_set

In [133]:
# Hold out a 0.2 fraction of each class for testing. split_data returns the
# test set first, then the training set.
test_set, train_set = split_data(processed_data, 0.2)

In [134]:
# Show both partitions: the test set first, then the training set.
display(test_set)
display(train_set)

Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,stroke
0,Senior,High,Overweight,0,0,1
1,Senior,Low,Overweight,1,1,1
2,Adult,High,Overweight,0,1,1
3,Adult,High,Overweight,0,0,1
4,Senior,Normal,Unknown,0,0,1
...,...,...,...,...,...,...
967,Adult,Normal,Overweight,0,0,0
968,Child,Normal,Normal,0,0,0
969,Adult,Normal,Underweight,0,0,0
970,Adult,High,Overweight,0,0,0


Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,stroke
49,Adult,Normal,Overweight,0,0,1
50,Senior,High,Overweight,1,0,1
51,Adult,Normal,Overweight,0,0,1
52,Adult,Low,Normal,0,0,1
53,Senior,Normal,Overweight,0,0,1
...,...,...,...,...,...,...
4856,Child,Low,Normal,0,0,0
4857,Adult,Normal,Overweight,1,0,0
4858,Adult,High,Overweight,0,0,0
4859,Child,Normal,Underweight,0,0,0


In [135]:
# TODO: `get_count` was left as a bare `def get_count` with no parameter list,
# colon or body. That is a SyntaxError and stopped this whole cell from
# running, so the stub is removed until its signature is decided.


def make_tree(df):
    """Print the name of every column (event) of ``df``, one per line.

    This is a placeholder: no tree is built yet and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are printed, in column order.
    """
    for event in df.columns.values.tolist():
        print(event)


In [136]:
# Call make_tree on the training set; as written it only prints the column names.
make_tree(train_set)

age
avg_glucose_level
bmi
hypertension
heart_disease
stroke
