In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the preprocessed dataset; the first CSV column becomes the index.
# NOTE(review): this is an absolute, user-specific Windows path, so the cell
# only runs on the author's machine.
csv_path = r"C:\Users\piotr\Downloads\preprocessed_data.csv"
df = pd.read_csv(csv_path, index_col=0)
df.tail()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,never smoked,0


In [3]:
# Number of rows in the loaded dataset.
df.shape[0]

5109

In [4]:
# Empty placeholder frame that reuses the column labels of `df`.
scoring_columns = df.columns
df_scoring = pd.DataFrame(columns=scoring_columns)
df_scoring

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke


In [5]:
import numpy as np
import pandas as pd

def add_random_rows(df: pd.DataFrame, n: int = 500, random_state=None) -> pd.DataFrame:
    """Generate ``n`` synthetic rows by sampling every column of ``df`` independently.

    Sampling rules:

    * Discrete columns (gender, hypertension, heart_disease, ever_married,
      work_type, Residence_type, smoking_status): a category is drawn with the
      relative frequency it has in ``df`` (missing values are not drawn,
      because ``value_counts`` skips them).
    * Continuous columns (age, avg_glucose_level, bmi): a value is drawn
      uniformly from the values present in the column, i.e. from the
      empirical distribution.
    * Rows with ``age > 20`` never receive the work_type ``'children'``; the
      remaining work_type frequencies are renormalised for those rows.
    * ``'stroke'`` is left as NaN.

    Each column is sampled on its own, so relationships between columns of
    ``df`` are not reproduced in the result. ``df`` itself is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Source data; must contain the ten feature columns listed above.
    n : int, default 500
        Number of rows to generate. Values <= 0 yield an empty frame that
        still carries the output columns.
    random_state : int, optional
        Seed for a private ``np.random.RandomState``. With ``None`` (default)
        the global NumPy random state is used, so a preceding
        ``np.random.seed(...)`` keeps controlling the result.

    Returns
    -------
    pd.DataFrame
        New frame with ``n`` rows, a fresh 0..n-1 index and the columns
        gender, age, hypertension, heart_disease, ever_married, work_type,
        Residence_type, avg_glucose_level, bmi, smoking_status, stroke.

    Raises
    ------
    ValueError
        If a row with ``age > 20`` is drawn but ``'children'`` is the only
        work_type in ``df``. NumPy also raises ``ValueError`` when asked to
        draw from an empty column.
    """
    output_columns = [
        'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
        'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
        'smoking_status', 'stroke',
    ]
    if n <= 0:
        return pd.DataFrame(columns=output_columns)

    # The `np.random` module exposes the global RandomState's methods, so both
    # objects offer the same `.choice` API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def draw_categories(column, size, exclude=None):
        # Draw `size` categories of `column` according to their observed frequencies.
        probs = df[column].value_counts(normalize=True)
        if exclude is not None:
            probs = probs[probs.index != exclude]
            if probs.empty:
                raise ValueError(
                    f"column {column!r} has no category other than {exclude!r}"
                )
            probs = probs / probs.sum()  # renormalise after dropping a category
        return rng.choice(probs.index.to_numpy(), size=size, p=probs.to_numpy())

    def draw_values(column, size):
        # Draw `size` values from the empirical distribution of `column`.
        return rng.choice(df[column].to_numpy(), size=size)

    sampled = {
        column: draw_categories(column, n)
        for column in ('gender', 'hypertension', 'heart_disease',
                       'ever_married', 'Residence_type', 'smoking_status')
    }
    for column in ('age', 'avg_glucose_level', 'bmi'):
        sampled[column] = draw_values(column, n)

    # work_type: start from the full distribution, then redraw the rows with
    # age > 20 from the distribution without 'children'. NaN ages compare as
    # False and therefore keep the full distribution.
    work_type = draw_categories('work_type', n).astype(object)
    is_adult = sampled['age'] > 20
    n_adults = int(np.count_nonzero(is_adult))
    if n_adults:
        work_type[is_adult] = draw_categories('work_type', n_adults, exclude='children')
    sampled['work_type'] = work_type

    sampled['stroke'] = np.full(n, np.nan)

    return pd.DataFrame(sampled, columns=output_columns)


In [6]:
# Seed NumPy's global generator so the synthetic scoring sample is the same on
# every "Restart & Run All"; without it the sample differs on each run.
np.random.seed(42)
df_scoring = add_random_rows(df, n=500)

In [7]:
# Display the generated scoring sample.
df_scoring

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,69.0,0,0,No,Govt_job,Rural,87.69,32.300000,never smoked,
1,Male,18.0,0,0,No,Private,Urban,240.71,31.000000,never smoked,
2,Male,45.0,0,0,Yes,Private,Urban,60.70,31.200000,formerly smoked,
3,Female,52.0,0,0,Yes,Private,Rural,89.30,27.200000,never smoked,
4,Male,24.0,0,0,No,Self-employed,Urban,98.54,26.900000,formerly smoked,
...,...,...,...,...,...,...,...,...,...,...,...
495,Female,55.0,0,0,No,Self-employed,Urban,60.05,21.000000,never smoked,
496,Female,63.0,0,0,Yes,Govt_job,Urban,115.13,28.893237,never smoked,
497,Male,54.0,0,0,No,Self-employed,Urban,60.22,34.700000,never smoked,
498,Male,41.0,0,0,Yes,Private,Rural,74.51,22.200000,never smoked,


In [9]:

def predict_stroke(sample):
    """Label one record as "Stroke" or "No Stroke" using hard-coded decision-tree rules.

    Splits whose two leaves carry the same label are folded into a single
    return, so only the conditions that actually change the outcome remain.

    Parameters
    ----------
    sample : mapping or pd.Series
        One record, indexable by column name. Depending on the path taken it
        reads 'bmi', 'work_type', 'hypertension', 'ever_married', 'age' and
        'smoking_status'. Intended for row-wise use, e.g.
        ``frame.apply(predict_stroke, axis=1)``.

    Returns
    -------
    str
        "Stroke" or "No Stroke".
    """
    if sample['bmi'] <= 22.301:
        if sample['work_type'] != 'Self-employed':
            # Low BMI, not self-employed: hypertension alone decides.
            return "No Stroke" if sample['hypertension'] == 0 else "Stroke"
        # Low BMI, self-employed: only married-status other than 'No' combined
        # with age above the threshold is labelled as stroke.
        if sample['ever_married'] != 'No' and sample['age'] > 78.019:
            return "Stroke"
        return "No Stroke"

    # Higher BMI (a NaN bmi also lands here, since the comparison above is False).
    if sample['smoking_status'] == 'smokes':
        # Every leaf under this branch is "No Stroke".
        return "No Stroke"
    return "No Stroke" if sample['age'] <= 55.009 else "Stroke"

In [11]:
# Run the rule-based classifier on every row and store the labels in 'stroke'.
predicted_labels = df_scoring.apply(predict_stroke, axis=1)
df_scoring['stroke'] = predicted_labels
df_scoring

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,69.0,0,0,No,Govt_job,Rural,87.69,32.300000,never smoked,Stroke
1,Male,18.0,0,0,No,Private,Urban,240.71,31.000000,never smoked,No Stroke
2,Male,45.0,0,0,Yes,Private,Urban,60.70,31.200000,formerly smoked,No Stroke
3,Female,52.0,0,0,Yes,Private,Rural,89.30,27.200000,never smoked,No Stroke
4,Male,24.0,0,0,No,Self-employed,Urban,98.54,26.900000,formerly smoked,No Stroke
...,...,...,...,...,...,...,...,...,...,...,...
495,Female,55.0,0,0,No,Self-employed,Urban,60.05,21.000000,never smoked,No Stroke
496,Female,63.0,0,0,Yes,Govt_job,Urban,115.13,28.893237,never smoked,Stroke
497,Male,54.0,0,0,No,Self-employed,Urban,60.22,34.700000,never smoked,No Stroke
498,Male,41.0,0,0,Yes,Private,Rural,74.51,22.200000,never smoked,No Stroke


In [12]:
# Count how many generated rows received each predicted label.
label_counts = df_scoring['stroke'].value_counts()
print(label_counts)

stroke
No Stroke    386
Stroke       114
Name: count, dtype: int64
