# Reading dataset

The original dataset is available on Kaggle: <a href="https://www.kaggle.com/fedesoriano/stroke-prediction-dataset">download it here</a>.

The dataset was modified so that the two classes of the `stroke` output are balanced, and the result was saved as `stroke-balanced.csv`.

In [1]:
import pandas as pd

# Load the class-balanced stroke dataset and preview its first rows.
DATASET_PATH = 'stroke-balanced.csv'
data = pd.read_csv(DATASET_PATH)

data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,44912,Male,12.0,0,0,No,children,Urban,67.06,16.1,Unknown,0
1,66972,Female,52.0,0,0,Yes,Govt_job,Urban,80.88,23.8,smokes,0
2,1451,Female,17.0,0,0,No,Private,Urban,78.46,23.5,Unknown,0
3,49797,Female,28.0,0,0,No,Private,Rural,75.53,34.9,never smoked,0
4,70241,Female,22.0,0,0,No,Private,Urban,66.29,20.5,smokes,0


In [2]:
# Demonstrates a common mistake: `Series.mean()` has already collapsed the
# column to a single float, so there is nothing left to call `groupby` on.
# The error is caught and printed (same message as the uncaught exception) so
# that Restart & Run All does not stop here; the working form is
# `data.groupby('gender')['age'].mean()`.
try:
    data['age'].mean().groupby('gender')
except AttributeError as err:
    print(f'AttributeError: {err}')

AttributeError: 'float' object has no attribute 'groupby'

In [3]:
# Demonstrates a second mistake: `mean()` has no `groupby` keyword; grouping
# must happen before aggregating. The error is caught and printed so the
# notebook keeps running from top to bottom.
try:
    data['age'].mean(groupby='gender')
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: mean() got an unexpected keyword argument 'groupby'

In [4]:
# Correct order: group the rows by gender first, then average `age` per group.
age_by_gender = data.groupby('gender')['age']
age_by_gender.mean()

gender
Female    55.343214
Male      55.284495
Name: age, dtype: float64

In [5]:
# Per-gender mean of every numeric column.
# The first parameter of `GroupBy.mean` is `numeric_only`, so the previous
# positional 'age' only acted as a truthy flag and never selected the `age`
# column. Spell the flag out so the call says what it actually does.
data.groupby('gender').mean(numeric_only=True)

Unnamed: 0_level_0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Female,38529.825,55.343214,0.185714,0.078571,110.717732,29.408171,0.4875
Male,36977.896789,55.284495,0.178899,0.18578,125.529518,29.790609,0.516055


# Handling missing values

In [2]:
# Flag every row that contains at least one missing value.
mask = data.isna().any(axis='columns')
# Count the flagged rows (True counts as 1).
num_of_rows_with_nan = mask.sum()
# Report the share of rows affected by missing data.
missing_ratio = num_of_rows_with_nan / len(data)
print('the ratio of rows with missing data:', missing_ratio)

the ratio of rows with missing data: 0.08835341365461848


**Note:** The previous output must be:
```
0.08835341365461848
```

In [3]:
# Keep only the rows that were not flagged as having missing data.
data_clean = data.loc[~mask]

# Input and output

In [4]:
# Model inputs: every column of `data_clean` except `id` and `stroke`.
data_input = data_clean.drop(['id', 'stroke'], axis='columns')
# Model output: the `stroke` column.
data_output = data_clean.loc[:, 'stroke']

# Handling categorical data
## Numeric encoding

In [5]:
# Show the dtype of every input column; `object` columns are the categorical
# features that still need to be encoded as numbers.
data_input.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
dtype: object

In [7]:
# List the distinct values of each categorical (object-typed) feature.
categorical_features = (
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
)
for feature in categorical_features:
    print(feature, data_input[feature].unique())

gender ['Male' 'Female']
ever_married ['No' 'Yes']
work_type ['children' 'Govt_job' 'Private' 'Self-employed' 'Never_worked']
Residence_type ['Urban' 'Rural']
smoking_status ['Unknown' 'smokes' 'never smoked' 'formerly smoked']


In [8]:
# Numeric (0/1) encoding of the three two-valued categorical features.
binary_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'ever_married': {'No': 0, 'Yes': 1},
    'Residence_type': {'Urban': 0, 'Rural': 1},
}

# `DataFrame.replace` only yields integer columns through a silent
# object -> int downcast that pandas 2.2 deprecates; without it the columns stay
# `object` and would be one-hot encoded later. Map each column explicitly
# instead: `map` turns any unlisted category into NaN and `astype('int64')`
# then raises, so an unexpected value fails loudly rather than slipping through.
data_input_encoded_1 = data_input.assign(**{
    column: data_input[column].map(codes).astype('int64')
    for column, codes in binary_codes.items()
})

In [9]:
# Preview the first rows to confirm the three binary features are now 0/1.
data_input_encoded_1.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,0,12.0,0,0,0,children,0,67.06,16.1,Unknown
1,1,52.0,0,0,1,Govt_job,0,80.88,23.8,smokes
2,1,17.0,0,0,0,Private,0,78.46,23.5,Unknown
3,1,28.0,0,0,0,Private,1,75.53,34.9,never smoked
4,1,22.0,0,0,0,Private,0,66.29,20.5,smokes


**Note:** `data_input_encoded_1` must have numeric values for `gender`, `ever_married`, and `Residence_type`

## One-hot encoding

In [10]:
# One-hot encode the remaining multi-valued categorical features.
# * `columns=` names the features explicitly instead of relying on
#   "whatever is still object-typed".
# * `dtype=int` keeps the indicator columns numeric 0/1; newer pandas releases
#   default to boolean True/False indicators.
data_input_encoded_2 = pd.get_dummies(
    data_input_encoded_1,
    columns=['work_type', 'smoking_status'],
    dtype=int,
)

In [12]:
# Preview the first rows of the fully encoded feature table.
data_input_encoded_2.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,12.0,0,0,0,0,67.06,16.1,0,0,0,0,1,1,0,0,0
1,1,52.0,0,0,1,0,80.88,23.8,1,0,0,0,0,0,0,0,1
2,1,17.0,0,0,0,0,78.46,23.5,0,0,1,0,0,1,0,0,0
3,1,28.0,0,0,0,1,75.53,34.9,0,0,1,0,0,0,0,1,0
4,1,22.0,0,0,0,0,66.29,20.5,0,0,1,0,0,0,0,0,1


**Note:** `data_input_encoded_2` must have numeric values for **all features**.

# Split into (train - validation - test)

In [13]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of the samples as the test set.
X, X_test, y, y_test = train_test_split(
    data_input_encoded_2,
    data_output,
    test_size=0.20,
    random_state=0,
)

# Stage 2: take 25% of the remaining samples (20% of the total) for validation;
# the rest is the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [14]:
# Print the feature/label shapes of each split (train, validation, test),
# with a dashed line between consecutive splits.
split_pairs = ((X_train, y_train), (X_val, y_val), (X_test, y_test))
for position, (features, labels) in enumerate(split_pairs):
    if position > 0:
        print('---------------------')
    print(features.shape)
    print(labels.shape)

(544, 17)
(544,)
---------------------
(182, 17)
(182,)
---------------------
(182, 17)
(182,)


**Note:** The previous output must match the following:
```
(544, 17)
(544,)
---------------------
(182, 17)
(182,)
---------------------
(182, 17)
(182,)
```

# Feature scaling (Normalization)

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler for the input features.
scaler = MinMaxScaler()

# Fit on the training split only, so the validation and test splits do not
# influence the learned feature ranges.
scaler.fit(X_train)

# Apply the same fitted scaler to all three splits.
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Logistic Regression
## Training and Validation

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [17]:
# Fit a logistic-regression classifier (default settings) on the scaled
# training data.
logistic = LogisticRegression()
logistic.fit(X_train_scaled, y_train)

# Predict on the training split and on the validation split.
y_pred_train = logistic.predict(X_train_scaled)
y_pred_val = logistic.predict(X_val_scaled)

# First line: training accuracy. Second line: validation accuracy.
print(
    accuracy_score(y_train, y_pred_train),
    accuracy_score(y_val, y_pred_val),
    sep='\n',
)

0.7849264705882353
0.7912087912087912


## Testing

In [18]:
# Final evaluation: accuracy of the fitted model on the held-out test split.
y_pred_test = logistic.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(test_accuracy)

0.7692307692307693
