In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df_train = pd.read_csv('/content/testing_data.csv')
df_test = pd.read_csv('/content/training_data.csv')

# Approach/Strategy

I am going to begin with the categorical variables and run a decision tree on them to see what groups to bin people into. With those bins, I will then run a linear regression with the numerical variables.

In [4]:
y_train = df_train['stroke']
X_train = df_train.drop('stroke',axis=1)
y_test = df_test['stroke']
X_test = df_test.drop('stroke',axis=1)

X_train['bmi'] = X_train['bmi'].fillna(X_train['bmi'].mean())
X_test['bmi'] = X_test['bmi'].fillna(X_test['bmi'].mean())

In [5]:
#Get the categorical variables
X_train_categ = X_train.loc[:,['ever_married','gender','Residence_type','smoking_status','work_type'] ]
X_test_categ = X_test.loc[:,['ever_married','gender','Residence_type','smoking_status','work_type'] ]

In [7]:
#One-hot encoding for categorical variables
from sklearn.preprocessing import LabelBinarizer
one_hot_encoder = LabelBinarizer()

In [26]:
Married_train = pd.get_dummies(X_train_categ['ever_married'], dtype='int')
MarriedDF_train = Married_train.rename(columns={'No': 'ever_married_Yes', 'Yes': 'ever_married_No'})
MarriedDF_train.head()

Unnamed: 0,ever_married_Yes,ever_married_No
0,0,1
1,1,0
2,0,1
3,0,1
4,0,1


In [25]:
Married_test = pd.get_dummies(X_test_categ['ever_married'], dtype='int')
MarriedDF_test = Married_test.rename(columns={'No': 'ever_married_Yes', 'Yes': 'ever_married_No'})
MarriedDF_test.head()

Unnamed: 0,ever_married_Yes,ever_married_No
0,0,1
1,0,1
2,1,0
3,0,1
4,1,0


In [27]:
GenderDF_train = pd.get_dummies(X_train_categ['gender'], dtype='int')

In [28]:
GenderDF_test = pd.get_dummies(X_test_categ['gender'], dtype='int')

In [29]:
ResidenceDF_train = pd.get_dummies(X_train_categ['Residence_type'], dtype='int')
ResidenceDF_train.head()

Unnamed: 0,Rural,Urban
0,0,1
1,0,1
2,1,0
3,0,1
4,0,1


In [30]:
ResidenceDF_test = pd.get_dummies(X_test_categ['Residence_type'], dtype='int')
ResidenceDF_test.head()

Unnamed: 0,Rural,Urban
0,0,1
1,1,0
2,1,0
3,0,1
4,0,1


In [31]:
Smoking_train = pd.get_dummies(X_train_categ['smoking_status'],dtype='int')
oh = one_hot_encoder.fit_transform(X_train_categ['smoking_status']) # One-hot encode the 'bird' variable
names = one_hot_encoder.classes_ # Get names of the classes
SmokingDF_train = pd.DataFrame( data = oh, columns = names) # Create a new df for the encoded variables
SmokingDF_train.drop(columns='Unknown', inplace=True)
SmokingDF_train.head()

Unnamed: 0,formerly smoked,never smoked,smokes
0,0,1,0
1,0,0,0
2,1,0,0
3,0,0,0
4,0,1,0


In [32]:
Smoking_test = pd.get_dummies(X_test_categ['smoking_status'],dtype='int')
oh = one_hot_encoder.fit_transform(X_test_categ['smoking_status']) # One-hot encode the 'bird' variable
names = one_hot_encoder.classes_ # Get names of the classes
SmokingDF_test = pd.DataFrame( data = oh, columns = names) # Create a new df for the encoded variables
SmokingDF_test.drop(columns='Unknown', inplace=True)
SmokingDF_test.head()

Unnamed: 0,formerly smoked,never smoked,smokes
0,1,0,0
1,0,1,0
2,0,0,0
3,0,1,0
4,0,0,0


In [33]:
Work_train = pd.get_dummies(X_train_categ['work_type'],dtype='int')
oh = one_hot_encoder.fit_transform(X_train_categ['work_type']) # One-hot encode the 'bird' variable
names = one_hot_encoder.classes_ # Get names of the classes
WorkDF_train = pd.DataFrame( data = oh, columns = names) # Create a new df for the encoded variables
WorkDF_train.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children
0,0,0,1,0,0
1,0,0,1,0,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,0,1,0


In [34]:
Work_test = pd.get_dummies(X_test_categ['work_type'],dtype='int')
oh = one_hot_encoder.fit_transform(X_test_categ['work_type']) # One-hot encode the 'bird' variable
names = one_hot_encoder.classes_ # Get names of the classes
WorkDF_test = pd.DataFrame( data = oh, columns = names) # Create a new df for the encoded variables
WorkDF_test.head()

Unnamed: 0,Govt_job,Never_worked,Private,Self-employed,children
0,1,0,0,0,0
1,1,0,0,0,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,1,0,0


In [35]:
X_train_categ_oh = pd.concat([MarriedDF_train,GenderDF_train,ResidenceDF_train, SmokingDF_train, WorkDF_train],axis=1)
X_train_categ_oh.head()

Unnamed: 0,ever_married_Yes,ever_married_No,Female,Male,Rural,Urban,formerly smoked,never smoked,smokes,Govt_job,Never_worked,Private,Self-employed,children
0,0,1,0,1,0,1,0,1,0,0,0,1,0,0
1,1,0,1,0,0,1,0,0,0,0,0,1,0,0
2,0,1,1,0,1,0,1,0,0,0,0,1,0,0
3,0,1,1,0,0,1,0,0,0,0,0,1,0,0
4,0,1,1,0,0,1,0,1,0,0,0,0,1,0


In [36]:
X_test_categ_oh = pd.concat([MarriedDF_test,GenderDF_test,ResidenceDF_test, SmokingDF_test, WorkDF_test],axis=1)
X_test_categ_oh.head()

Unnamed: 0,ever_married_Yes,ever_married_No,Female,Male,Other,Rural,Urban,formerly smoked,never smoked,smokes,Govt_job,Never_worked,Private,Self-employed,children
0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,0
1,0,1,1,0,0,1,0,0,1,0,1,0,0,0,0
2,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0
3,0,1,1,0,0,0,1,0,1,0,0,0,0,1,0
4,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0
