# Exercise 1 B

## Import Necessary Libraries

In [760]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
print("Setup Complete")

Setup Complete


In [761]:
# Path of the file to read
stroke_filepath = "../datasets/healthcare-dataset-stroke-data.csv"

# Read the file into a variable stroke_data
stroke_data = pd.read_csv(stroke_filepath, index_col="id")

# Set seed for reproducibility
np.random.seed(0)

## Handling Missing Values

In [762]:
# Check columns with missing values
stroke_data.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [763]:
# Replace "Unknown" smoking_status with NaN value
stroke_data_smoking_nan = stroke_data.replace(["Unknown"], np.nan)

In [764]:
# Check columns with missing values
stroke_data_smoking_nan.isnull().sum()

gender                  0
age                     0
hypertension            0
heart_disease           0
ever_married            0
work_type               0
Residence_type          0
avg_glucose_level       0
bmi                   201
smoking_status       1544
stroke                  0
dtype: int64

### Drop Columns With Missing Values

In [765]:
# Get names of columns with missing values
cols_with_missing = [col for col in stroke_data_smoking_nan.columns if stroke_data_smoking_nan[col].isnull().any()]

# Drop columns in data
dropped_columns_data = stroke_data_smoking_nan.drop(cols_with_missing, axis=1)

In [766]:
# Check columns with missing values
dropped_columns_data.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
stroke               0
dtype: int64

In [767]:
# Label encoder for data
def label_encoder(y, X):
    # Divide data into training and validation subsets
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=0)

    # Get list of categorical variables
    s = (X_train.dtypes == 'object')
    object_cols = list(s[s].index)


    # Apply label encoder to each column with categorical data
    label_encoder = LabelEncoder()
    for col in object_cols:
        X_train[col] = label_encoder.fit_transform(X_train[col])
        X_valid[col] = label_encoder.transform(X_valid[col])
        
    return X_train, X_valid, y_train, y_valid

In [768]:
# Separate target from predictors
y = dropped_columns_data["stroke"]
X = dropped_columns_data.drop(["stroke"], axis=1)

X_train, X_valid, y_train, y_valid = label_encoder(y, X)

# Create a Classifier
clf = RandomForestClassifier(random_state=1)

# Train the model
clf.fit(X_train, y_train)
stroke_preds = clf.predict(X_valid)

print(classification_report(y_valid, stroke_preds))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = label_encoder.fit_transform(X_train[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid[col] = label_encoder.transform(X_valid[col])


              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1213
           1       0.30      0.05      0.08        65

    accuracy                           0.95      1278
   macro avg       0.63      0.52      0.53      1278
weighted avg       0.92      0.95      0.93      1278



### Fill Missing Values With Column Mean

In [769]:
fill_mean_data = stroke_data.fillna(stroke_data.mean())

In [770]:
fill_mean_data

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,formerly smoked,1
51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
18234,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,never smoked,0
44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.000000,never smoked,0
19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,never smoked,0
37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,formerly smoked,0


In [771]:
# Separate target from predictors
y = fill_mean_data["stroke"]
X = fill_mean_data.drop(["stroke", "smoking_status"], axis=1)

X_train, X_valid, y_train, y_valid = label_encoder(y, X)

# Create a Classifier
clf = RandomForestClassifier(random_state=1)

# Train the model
clf.fit(X_train, y_train)
stroke_preds = clf.predict(X_valid)

print(classification_report(y_valid, stroke_preds))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = label_encoder.fit_transform(X_train[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid[col] = label_encoder.transform(X_valid[col])


              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1213
           1       1.00      0.02      0.03        65

    accuracy                           0.95      1278
   macro avg       0.97      0.51      0.50      1278
weighted avg       0.95      0.95      0.93      1278



### Fill Missing Values With Linear Regression

In [772]:
# Find the correlation between variables
df = stroke_data
corr = df.corr()
corr

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
age,1.0,0.276398,0.263796,0.238171,0.333398,0.245257
hypertension,0.276398,1.0,0.108306,0.174474,0.167811,0.127904
heart_disease,0.263796,0.108306,1.0,0.161857,0.041357,0.134914
avg_glucose_level,0.238171,0.174474,0.161857,1.0,0.175502,0.131945
bmi,0.333398,0.167811,0.041357,0.175502,1.0,0.042374
stroke,0.245257,0.127904,0.134914,0.131945,0.042374,1.0


In [773]:
# Delete rows with missing values in bmi and age columns
df_bmi_age = df.dropna(axis=0, subset=['bmi', 'age'])
# Keep only bmi and age columns
df_bmi_age = df_bmi_age.loc[:, ['bmi', 'age']]

# Find the NaN entries in bmi column
missing_bmi = df['bmi'].isnull()
# Keep only the NaN entries in the bmi column
age_miss_bmi = pd.DataFrame(df['age'][missing_bmi])

In [774]:
# Separate target from predictors
X = df_bmi_age[["age"]]
y = df_bmi_age["bmi"]

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Create a Linear Regression Model
lrm = LinearRegression()

# Train the model
lrm.fit(X_train, y_train)
bmi_preds = lrm.predict(age_miss_bmi)

# Replace all missing values with the predicted ones
nans = df[pd.isnull(df['bmi'])]
for i, ni in enumerate(nans.index[:len(bmi_preds)]):
    df['bmi'].loc[ni] = bmi_preds[i]


lin_reg_data = df
lrm.score(X,y)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


0.11113639937547126

In [775]:
# Separate target from predictors
y = lin_reg_data["stroke"]
X = lin_reg_data.drop(["stroke", "smoking_status"], axis=1)

X_train, X_valid, y_train, y_valid = label_encoder(y, X)

# Create a Classifier
clf = RandomForestClassifier(random_state=1)

# Train the model
clf.fit(X_train, y_train)
stroke_preds = clf.predict(X_valid)

print(classification_report(y_valid, stroke_preds, zero_division=1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = label_encoder.fit_transform(X_train[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid[col] = label_encoder.transform(X_valid[col])


              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1213
           1       1.00      0.02      0.03        65

    accuracy                           0.95      1278
   macro avg       0.97      0.51      0.50      1278
weighted avg       0.95      0.95      0.93      1278



In [776]:
df2 = stroke_data['bmi'].interpolate()
df3 = stroke_data
df3['bmi'] = df2

# Separate target from predictors
y = df3["stroke"]
X = df3.drop(["stroke", "smoking_status"], axis=1)

X_train, X_valid, y_train, y_valid = label_encoder(y, X)

# Create a Classifier
clf = RandomForestClassifier(random_state=1)

# Train the model
clf.fit(X_train, y_train)
stroke_preds = clf.predict(X_valid)

print(classification_report(y_valid, stroke_preds))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = label_encoder.fit_transform(X_train[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid[col] = label_encoder.transform(X_valid[col])


              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1213
           1       1.00      0.02      0.03        65

    accuracy                           0.95      1278
   macro avg       0.97      0.51      0.50      1278
weighted avg       0.95      0.95      0.93      1278



### Fill Missing Values With k-Nearest Neighbors

In [777]:
stroke_data_smoking_nan

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
