In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Read the raw CSV and discard the 'id' column in a single expression,
# so re-running this cell always rebuilds `df` from the file.
df = pd.read_csv('dataset.csv').drop(columns='id')

# Summary statistics for every column (numeric and non-numeric alike)
df.describe(include='all')

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,43400,43400.0,43400.0,43400.0,43400,43400,43400,43400.0,41938.0,30108,43400.0
unique,3,,,,2,5,2,,,3,
top,Female,,,,Yes,Private,Urban,,,never smoked,
freq,25665,,,,27938,24834,21756,,,16053,
mean,,42.217894,0.093571,0.047512,,,,104.48275,28.605038,,0.018041
std,,22.519649,0.291235,0.212733,,,,43.111751,7.77002,,0.133103
min,,0.08,0.0,0.0,,,,55.0,10.1,,0.0
25%,,24.0,0.0,0.0,,,,77.54,23.2,,0.0
50%,,44.0,0.0,0.0,,,,91.58,27.7,,0.0
75%,,60.0,0.0,0.0,,,,112.07,32.9,,0.0


In [6]:
# First rows of the frame, rendered with the rich notebook display.
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" afterwards.
df.info()
# Count of missing entries per column.
print("Missing values by column:\n", df.isna().sum())
# Relative frequency of each target class.
print("Target distribuition:\n", df['stroke'].value_counts(normalize=True))

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43400 entries, 0 to 43399
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             43400 non-null  object 
 1   age                43400 non-null  float64
 2   hypertension       43400 non-null  int64  
 3   heart_disease      43400 non-null  int64  
 4   ever_married       43400 non-null  object 
 5   work_type          43400 non-null  object 
 6   Residence_type     43400 non-null  object 
 7   avg_glucose_level  43400 non-null  float64
 8   bmi                41938 non-null  float64
 9   smoking_status     30108 non-null  object 
 10  stroke             43400 non-null  int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 3.6+ MB


None

Missing values by column:
 gender                   0
age                      0
hypertension             0
heart_disease            0
ever_married             0
work_type                0
Residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
stroke                   0
dtype: int64
Target distribuition:
 stroke
0    0.981959
1    0.018041
Name: proportion, dtype: float64


In [7]:
# Separate the target column from the predictors, then hold out 20% of the
# rows as a test set. The split is stratified on the target and seeded.
y = df['stroke']
x = df.drop(columns='stroke')

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [8]:
# Column groups; each group is routed to its own preprocessing branch.

# Continuous measurements
numerical_features = [
    "age",
    "bmi",
    "avg_glucose_level",
]

# Unordered string categories
categorical_features = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
]

# String category encoded with an explicit order
ordinal_features = ["smoking_status"]

# Columns already stored as 0/1 integers
binary_features = [
    "hypertension",
    "heart_disease",
]


In [9]:
# Category order used when encoding smoking_status as an integer rank
smoking_order = ['never smoked', 'formerly smoked', 'smokes']

# Numeric branch: fill gaps with the column median, then standardize
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, one-hot encode
# into a dense array (unseen categories are ignored), then standardize the
# resulting indicator columns
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ('scaler', StandardScaler()),
])

# Ordinal branch: fill gaps with the most frequent value, then map each
# category to its position in smoking_order (unknown values become -1)
smoking_encoder = OrdinalEncoder(
    categories=[smoking_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', smoking_encoder),
])


In [10]:
# Route each feature group through its own branch; binary columns are passed
# through untouched and any column not listed here is dropped.
column_branches = [
    ('num', numeric_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
    ('ord', ordinal_pipeline, ordinal_features),
    ('bin', 'passthrough', binary_features),
]
preprocessor = ColumnTransformer(transformers=column_branches, remainder='drop')

# Show the assembled preprocessor as the cell output
preprocessor


In [11]:
# RBF-kernel SVM with balanced class weights, chained after the preprocessor.
# NOTE(review): probability=True makes every fit run an additional internal
# cross-validation to calibrate probabilities. The F1 scoring below only uses
# predict(), so the flag is kept solely in case predict_proba is needed later.
svm_model = SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42)
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', svm_model)
])

print("Initiating Cross Validation")

# The five folds are independent, so they are evaluated in parallel
# (n_jobs=-1 uses all available cores); the resulting scores are unchanged.
scores = cross_val_score(
    svm_pipeline, x_train, y_train, cv=5, scoring='f1', n_jobs=-1
)

# Per-fold F1, then mean and standard deviation across folds
print(f"Scores individuais: {scores}")
print(f"Média F1-Score: {scores.mean():.4f}")
print(f"Desvio Padrão: {scores.std():.4f}")

Initiating Cross Validation
Scores individuais: [0.09595202 0.0917999  0.09292721 0.09612555 0.10309278]
Média F1-Score: 0.0960
Desvio Padrão: 0.0039


In [12]:
# Class-weighted random forest chained after the preprocessor and scored
# with 5-fold cross-validated F1 on the training split.
random_forest_model = RandomForestClassifier(
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', random_forest_model),
])
scores = cross_val_score(
    estimator=rf_pipeline, X=x_train, y=y_train, scoring='f1', cv=5
)

# Per-fold F1 followed by the mean across folds
print(f"Scores individuais: {scores}")
print(f"Média F1-Score: {scores.mean():.4f}")

Scores individuais: [0.       0.015625 0.       0.       0.      ]
Média F1-Score: 0.0031
