# Machine Learning Algorithm Implementation
Yale Machine Learning CPSC 581

In [3]:
#General Libraries
import numpy as np
import pandas as pd 

# Statistic & Machine Learning Libraries
# import statsmodels.api as sm

# from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve
# import statsmodels.formula.api as smf
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# from sklearn.naive_bayes import GaussianNB
# from sklearn import tree
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from warnings import filterwarnings
filterwarnings('ignore')

In [5]:
dataset = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [6]:
df = dataset.copy()

df = df.dropna() # Missing observation is removed.
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [7]:
y = df["stroke"]
X = df.drop(["stroke"], axis = 1)

Preprocess the data. Using the same form as what Burak Kahveci did.
Quote: https://www.kaggle.com/burakkahveci/stroke-risk-prediction-with-machine-learning/notebook#My-Social-Media-Accounts

In [8]:
X['smoking_status'] = X['smoking_status'].replace({'formerly smoked' or 'smokes':'smoked','never smoked' or 'Unknown':'non_smoking'})
#4 Status downgraded to Status 2.
X['smoking_status'] = [1 if i.strip() == 'smoked' else 0 for i in X.smoking_status]
X['gender'] = [1 if i.strip() == 'Male' else 0 for i in X.gender]
X['ever_married'] = [1 if i.strip() == 'Yes' else 0 for i in X.ever_married]
X['Residence_type'] = [1 if i.strip() == 'Urban' else 0 for i in X.Residence_type]

X = X.drop(["work_type"], axis=1)
X = X.drop(["id"], axis=1)

X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status
0,1,67.0,0,1,1,1,228.69,36.6,1
2,1,80.0,0,1,1,0,105.92,32.5,0
3,0,49.0,0,0,1,1,171.23,34.4,0
4,0,79.0,1,0,1,0,174.12,24.0,0
5,1,81.0,0,0,1,1,186.21,29.0,1


## Implementation of Logistic Regression with l2 regulization

I read the implementation of logistic regression in this article
https://www.geeksforgeeks.org/implementation-of-logistic-regression-from-scratch-using-python/
But I choose to add a column of 1s and add a dimention to weight so that I can use the form "w . x" instead of "w . x + b"
Also, I add a regulization each step when update the weight w

In [85]:
class LogitRegression() :
    def __init__( self, learning_rate, iterations, reg = 1) :        
        self.learning_rate = learning_rate        
        self.iterations = iterations
        self.reg = reg
          
    # Function for model training    
    def fit( self, X, Y ) :        
        # m is num_of_training_examples, n is num_of_features        
        self.m, self.n = X.shape        
        # weight initialization        
        self.W = np.zeros(self.n+1)        
#         self.b = 0        
        self.X = X.copy()
        self.X['C'] = 1
        self.Y = Y
#         print(self.X)
        # gradient descent learning
                  
        for i in range( self.iterations ) :            
            self.update_weights()            
        return self
      
    # Helper function to update weights in gradient descent
      
    def update_weights( self ) :

#         A = 1 / ( 1 + np.exp( - ( self.X.dot( self.W ) + self.b ) ) )

        A = 1 / ( 1 + np.exp( -self.X.dot( self.W ) ) )
        # calculate gradients        
        tmp = ( A - self.Y.T )

        tmp = np.reshape(tmp, self.m)


        dW = np.dot( self.X.T, tmp ) / self.m + self.reg / self.m
        db = np.sum( tmp ) / self.m 
          
        # update weights
        self.W = self.W - self.learning_rate * dW 
        
#         self.b = self.b - self.learning_rate * db
        
        return self
      
    # Hypothetical function  h( x ) 
      
    def predict( self, X ) :    
#         Z = 1 / ( 1 + np.exp( - ( X.dot( self.W ) + self.b ) ) )     
        X = X.copy()
        
        X["C"] = 1
        Z = 1 / ( 1 + np.exp( - X.dot( self.W )  ) )        

        Y = np.where( Z > 0.5, 1, 0 )       
        return Y

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [82]:
model = LogitRegression( learning_rate = 0.001, iterations = 100 )
model.fit( X_train, y_train )
accuracy_score(y_test,model.predict(X_test))

0.9460285132382892

In [16]:
loj = LogisticRegression(solver = "liblinear")
loj_model = loj.fit(X_train,y_train)
loj_model

testscore_lr =accuracy_score(y_test, loj_model.predict(X_test))
accuracy_score(y_test, loj_model.predict(X_test)) 

0.9460285132382892

K-nearest neighbors

In [95]:
knn = KNeighborsClassifier()
knn_model = knn.fit(X_train, y_train)

accuracy_score(y_test, knn_model.predict(X_test))

0.9389002036659878

Support Vector Machine

In [73]:
svm_model = SVC(kernel = "linear").fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

accuracy_score(y_test, y_pred) 

0.9460285132382892

Gradient Boosting Machines

In [72]:
gbm_model = GradientBoostingClassifier().fit(X_train, y_train)

y_pred = gbm_model.predict(X_test)
testscore_gbm=accuracy_score(y_test, y_pred)
accuracy_score(y_test, y_pred)

0.9439918533604889

## Conclusion on Original Dataset:
On the original dataset, all approaches have a impressive performance.
It looks so succefully but I find the reason behind this false prosperity.
It was because that the original dataset itself is a biased dataset.
To allievate the bias of the dataset, I use Synthetic Minority Oversampling Technique (SMOTE).

Train on Over-Sampling training set (X_resample, y_resample)

In [4]:
from imblearn.over_sampling import SMOTE                                  

In [10]:
smote = SMOTE()
X_resample, y_resample = smote.fit_resample(X_train, y_train.ravel())

In [100]:
model = LogitRegression( learning_rate = 0.001, iterations = 20000 )

model.fit(X_resample, y_resample)
accuracy_score(y_test, model.predict(X_test))

0.8859470468431772

In [96]:
knn = KNeighborsClassifier()
knn_model = knn.fit(X_resample, y_resample)

accuracy_score(y_test, knn_model.predict(X_test))

0.8004073319755601

In [98]:
gbm_model = GradientBoostingClassifier().fit(X_resample, y_resample)

accuracy_score(y_test,gbm_model.predict(X_test))

0.8065173116089613

In [103]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(n_jobs=1).fit(X_resample, y_resample)

accuracy_score(y_test,xgb_model.predict(X_test))



0.9144602851323829

## Good test result on logistic regression
How does your result compare to state-of-the-art results for that problem or dataset?
The state-of-the-art is Extreme Gradient Boosting, which gained huge popularity during lots of data science competition. XGBoost scored 91.4% accuracy, slightly better than my approach. I am trying to implement XGBoost from sratch but I have not completed it in this project because of limitation of time.

I tried many algorithms including Naive Bayes, Random Forest, SVM (Implemented in PS2) and selected three of them according to their performance
I compared my result with approaches including K-Neighbors and gradient boost. My approach using logistic regression still did a better job than them, 88.6% vs around 80%. Nevertheless, it is possible that KNN and Gradient boost will get a better performance if they are finely tuned.

The formulation of boost algorithm is different. Boost algorithm including gradient boost is a method of converting a set of weak learners into a strong learner. The performance would be great if the family of weak learners have a very small even no correlation between them. 